author    Ingo Molnar <mingo@elte.hu>	2010-08-31 03:45:21 -0400
committer Ingo Molnar <mingo@elte.hu>	2010-08-31 03:45:46 -0400
commit    daab7fc734a53fdeaf844b7c03053118ad1769da (patch)
tree      575deb3cdcc6dda562acaed6f7c29bc81ae01cf2 /kernel
parent    774ea0bcb27f57b6fd521b3b6c43237782fed4b9 (diff)
parent    2bfc96a127bc1cc94d26bfaa40159966064f9c8c (diff)

Merge commit 'v2.6.36-rc3' into x86/memblock

Conflicts:
	arch/x86/kernel/trampoline.c
	mm/memblock.c

Merge reason: Resolve the conflicts, update to latest upstream.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 9
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/async.c | 141
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit.h | 26
-rw-r--r--  kernel/audit_tree.c | 237
-rw-r--r--  kernel/audit_watch.c | 274
-rw-r--r--  kernel/auditfilter.c | 39
-rw-r--r--  kernel/auditsc.c | 19
-rw-r--r--  kernel/cgroup.c | 40
-rw-r--r--  kernel/compat.c | 17
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 23
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/debug/debug_core.c | 6
-rw-r--r--  kernel/debug/gdbstub.c | 191
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 138
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 9
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/exec_domain.c | 22
-rw-r--r--  kernel/exit.c | 12
-rw-r--r--  kernel/fork.c | 20
-rw-r--r--  kernel/hrtimer.c | 17
-rw-r--r--  kernel/hw_breakpoint.c | 90
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/kexec.c | 8
-rw-r--r--  kernel/kfifo.c | 753
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kthread.c | 164
-rw-r--r--  kernel/lockdep.c | 2
-rw-r--r--  kernel/module.c | 1088
-rw-r--r--  kernel/padata.c | 755
-rw-r--r--  kernel/panic.c | 60
-rw-r--r--  kernel/params.c | 233
-rw-r--r--  kernel/perf_event.c | 460
-rw-r--r--  kernel/pid.c | 56
-rw-r--r--  kernel/pm_qos_params.c | 225
-rw-r--r--  kernel/posix-cpu-timers.c | 44
-rw-r--r--  kernel/posix-timers.c | 11
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 27
-rw-r--r--  kernel/power/main.c | 55
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 21
-rw-r--r--  kernel/power/snapshot.c | 3
-rw-r--r--  kernel/power/suspend.c | 13
-rw-r--r--  kernel/power/swap.c | 12
-rw-r--r--  kernel/printk.c | 43
-rw-r--r--  kernel/ptrace.c | 12
-rw-r--r--  kernel/range.c | 4
-rw-r--r--  kernel/rcupdate.c | 160
-rw-r--r--  kernel/rcutiny.c | 2
-rw-r--r--  kernel/rcutorture.c | 3
-rw-r--r--  kernel/rcutree.c | 2
-rw-r--r--  kernel/sched.c | 407
-rw-r--r--  kernel/sched_clock.c | 95
-rw-r--r--  kernel/sched_cpupri.c | 8
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 534
-rw-r--r--  kernel/sched_rt.c | 3
-rw-r--r--  kernel/sched_stats.h | 27
-rw-r--r--  kernel/signal.c | 9
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 1068
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/softlockup.c | 293
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 202
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 76
-rw-r--r--  kernel/time.c | 16
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/clocksource.c | 33
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 15
-rw-r--r--  kernel/time/timekeeping.c | 93
-rw-r--r--  kernel/timer.c | 54
-rw-r--r--  kernel/trace/Kconfig | 83
-rw-r--r--  kernel/trace/Makefile | 7
-rw-r--r--  kernel/trace/blktrace.c | 88
-rw-r--r--  kernel/trace/ftrace.c | 5
-rw-r--r--  kernel/trace/kmemtrace.c | 529
-rw-r--r--  kernel/trace/ring_buffer.c | 43
-rw-r--r--  kernel/trace/trace.c | 189
-rw-r--r--  kernel/trace/trace.h | 107
-rw-r--r--  kernel/trace/trace_boot.c | 185
-rw-r--r--  kernel/trace/trace_clock.c | 7
-rw-r--r--  kernel/trace/trace_entries.h | 94
-rw-r--r--  kernel/trace/trace_event_perf.c | 27
-rw-r--r--  kernel/trace/trace_events.c | 440
-rw-r--r--  kernel/trace/trace_events_filter.c | 27
-rw-r--r--  kernel/trace/trace_export.c | 8
-rw-r--r--  kernel/trace/trace_functions.c | 6
-rw-r--r--  kernel/trace/trace_functions_graph.c | 13
-rw-r--r--  kernel/trace/trace_irqsoff.c | 3
-rw-r--r--  kernel/trace/trace_kdb.c | 136
-rw-r--r--  kernel/trace/trace_kprobe.c | 383
-rw-r--r--  kernel/trace/trace_ksym.c | 508
-rw-r--r--  kernel/trace/trace_output.c | 69
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 7
-rw-r--r--  kernel/trace/trace_selftest.c | 87
-rw-r--r--  kernel/trace/trace_stack.c | 8
-rw-r--r--  kernel/trace/trace_syscalls.c | 7
-rw-r--r--  kernel/trace/trace_sysprof.c | 329
-rw-r--r--  kernel/user_namespace.c | 44
-rw-r--r--  kernel/watchdog.c | 570
-rw-r--r--  kernel/workqueue.c | 3182
-rw-r--r--  kernel/workqueue_sched.h | 9
109 files changed, 8887 insertions(+), 7136 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 80e61c3e44ae..4c19ebe9053c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,14 +69,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -98,8 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
-obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 	spin_unlock(&acct_lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_path.dentry, &sbuf))
+	if (vfs_statfs(&file->f_path, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
 */
 
 #include <linux/async.h>
-#include <linux/bug.h>
 #include <linux/module.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 #include <asm/atomic.h>
 
 static async_cookie_t next_cookie = 1;
 
-#define MAX_THREADS	256
 #define MAX_WORK	32768
 
 static LIST_HEAD(async_pending);
 static LIST_HEAD(async_running);
 static DEFINE_SPINLOCK(async_lock);
 
-static int async_enabled = 0;
-
 struct async_entry {
 	struct list_head list;
-	async_cookie_t cookie;
-	async_func_ptr *func;
-	void *data;
-	struct list_head *running;
+	struct work_struct work;
+	async_cookie_t cookie;
+	async_func_ptr *func;
+	void *data;
+	struct list_head *running;
 };
 
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
-static DECLARE_WAIT_QUEUE_HEAD(async_new);
 
 static atomic_t entry_count;
-static atomic_t thread_count;
 
 extern int initcall_debug;
 
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
 	spin_unlock_irqrestore(&async_lock, flags);
 	return ret;
 }
+
 /*
  * pick the first pending entry and run it
  */
-static void run_one_entry(void)
+static void async_run_entry_fn(struct work_struct *work)
 {
+	struct async_entry *entry =
+		container_of(work, struct async_entry, work);
 	unsigned long flags;
-	struct async_entry *entry;
 	ktime_t calltime, delta, rettime;
 
-	/* 1) pick one task from the pending queue */
-
+	/* 1) move self to the running queue */
 	spin_lock_irqsave(&async_lock, flags);
-	if (list_empty(&async_pending))
-		goto out;
-	entry = list_first_entry(&async_pending, struct async_entry, list);
-
-	/* 2) move it to the running queue */
 	list_move_tail(&entry->list, entry->running);
 	spin_unlock_irqrestore(&async_lock, flags);
 
-	/* 3) run it (and print duration)*/
+	/* 2) run (and print duration) */
 	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
 			entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
 		(long long)ktime_to_ns(delta) >> 10);
 	}
 
-	/* 4) remove it from the running queue */
+	/* 3) remove self from the running queue */
 	spin_lock_irqsave(&async_lock, flags);
 	list_del(&entry->list);
 
-	/* 5) free the entry */
+	/* 4) free the entry */
 	kfree(entry);
 	atomic_dec(&entry_count);
 
 	spin_unlock_irqrestore(&async_lock, flags);
 
-	/* 6) wake up any waiters. */
+	/* 5) wake up any waiters */
 	wake_up(&async_done);
-	return;
-
-out:
-	spin_unlock_irqrestore(&async_lock, flags);
 }
 
-
 static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
 {
 	struct async_entry *entry;
 	unsigned long flags;
 	async_cookie_t newcookie;
-
 
 	/* allow irq-off callers */
 	entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	 * If we're out of memory or if there's too much work
 	 * pending already, we execute synchronously.
 	 */
-	if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
+	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
 		kfree(entry);
 		spin_lock_irqsave(&async_lock, flags);
 		newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 		ptr(data, newcookie);
 		return newcookie;
 	}
+	INIT_WORK(&entry->work, async_run_entry_fn);
 	entry->func = ptr;
 	entry->data = data;
 	entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	list_add_tail(&entry->list, &async_pending);
 	atomic_inc(&entry_count);
 	spin_unlock_irqrestore(&async_lock, flags);
-	wake_up(&async_new);
+
+	/* schedule for execution */
+	queue_work(system_unbound_wq, &entry->work);
+
 	return newcookie;
 }
 
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
 	async_synchronize_cookie_domain(cookie, &async_running);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
-
-
-static int async_thread(void *unused)
-{
-	DECLARE_WAITQUEUE(wq, current);
-	add_wait_queue(&async_new, &wq);
-
-	while (!kthread_should_stop()) {
-		int ret = HZ;
-		set_current_state(TASK_INTERRUPTIBLE);
-		/*
-		 * check the list head without lock.. false positives
-		 * are dealt with inside run_one_entry() while holding
-		 * the lock.
-		 */
-		rmb();
-		if (!list_empty(&async_pending))
-			run_one_entry();
-		else
-			ret = schedule_timeout(HZ);
-
-		if (ret == 0) {
-			/*
-			 * we timed out, this means we as thread are redundant.
-			 * we sign off and die, but we to avoid any races there
-			 * is a last-straw check to see if work snuck in.
-			 */
-			atomic_dec(&thread_count);
-			wmb(); /* manager must see our departure first */
-			if (list_empty(&async_pending))
-				break;
-			/*
-			 * woops work came in between us timing out and us
-			 * signing off; we need to stay alive and keep working.
-			 */
-			atomic_inc(&thread_count);
-		}
-	}
-	remove_wait_queue(&async_new, &wq);
-
-	return 0;
-}
-
-static int async_manager_thread(void *unused)
-{
-	DECLARE_WAITQUEUE(wq, current);
-	add_wait_queue(&async_new, &wq);
-
-	while (!kthread_should_stop()) {
-		int tc, ec;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		tc = atomic_read(&thread_count);
-		rmb();
-		ec = atomic_read(&entry_count);
-
-		while (tc < ec && tc < MAX_THREADS) {
-			if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
-					       tc))) {
-				msleep(100);
-				continue;
-			}
-			atomic_inc(&thread_count);
-			tc++;
-		}
-
-		schedule();
-	}
-	remove_wait_queue(&async_new, &wq);
-
-	return 0;
-}
-
-static int __init async_init(void)
-{
-	async_enabled =
-		!IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
-
-	WARN_ON(!async_enabled);
-	return 0;
-}
-
-core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
 #include <net/netlink.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
-#include <linux/inotify.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
 
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 		audit_hold_skb(skb);
 	} else
 		/* drop the extra reference if sent ok */
-		kfree_skb(skb);
+		consume_skb(skb);
 }
 
 static int kauditd_thread(void *dummy)
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
 extern void audit_free_rule_rcu(struct rcu_head *);
 extern struct list_head audit_filter_list[];
 
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+
 /* audit watch functions */
-extern unsigned long audit_watch_inode(struct audit_watch *watch);
-extern dev_t audit_watch_dev(struct audit_watch *watch);
+#ifdef CONFIG_AUDIT_WATCH
 extern void audit_put_watch(struct audit_watch *watch);
 extern void audit_get_watch(struct audit_watch *watch);
 extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
-extern int audit_add_watch(struct audit_krule *krule);
-extern void audit_remove_watch(struct audit_watch *watch);
-extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
-extern void audit_inotify_unregister(struct list_head *in_list);
+extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
+extern void audit_remove_watch_rule(struct audit_krule *krule);
 extern char *audit_watch_path(struct audit_watch *watch);
-extern struct list_head *audit_watch_rules(struct audit_watch *watch);
-
-extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
-					   struct audit_watch *watch);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+#else
+#define audit_put_watch(w) {}
+#define audit_get_watch(w) {}
+#define audit_to_watch(k, p, l, o) (-EINVAL)
+#define audit_add_watch(k, l) (-EINVAL)
+#define audit_remove_watch_rule(k) BUG()
+#define audit_watch_path(w) ""
+#define audit_watch_compare(w, i, d) 0
+
+#endif /* CONFIG_AUDIT_WATCH */
 
 #ifdef CONFIG_AUDIT_TREE
 extern struct audit_chunk *audit_tree_lookup(const struct inode *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
1#include "audit.h" 1#include "audit.h"
2#include <linux/inotify.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
22 22
23struct audit_chunk { 23struct audit_chunk {
24 struct list_head hash; 24 struct list_head hash;
25 struct inotify_watch watch; 25 struct fsnotify_mark mark;
26 struct list_head trees; /* with root here */ 26 struct list_head trees; /* with root here */
27 int dead; 27 int dead;
28 int count; 28 int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
59 * tree is refcounted; one reference for "some rules on rules_list refer to 59 * tree is refcounted; one reference for "some rules on rules_list refer to
60 * it", one for each chunk with pointer to it. 60 * it", one for each chunk with pointer to it.
61 * 61 *
62 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount 62 * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
63 * of watch contributes 1 to .refs). 63 * of watch contributes 1 to .refs).
64 * 64 *
65 * node.index allows to get from node.list to containing chunk. 65 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
68 * that makes a difference. Some. 68 * that makes a difference. Some.
69 */ 69 */
70 70
71static struct inotify_handle *rtree_ih; 71static struct fsnotify_group *audit_tree_group;
72 72
73static struct audit_tree *alloc_tree(const char *s) 73static struct audit_tree *alloc_tree(const char *s)
74{ 74{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
111 return tree->pathname; 111 return tree->pathname;
112} 112}
113 113
114static struct audit_chunk *alloc_chunk(int count)
115{
116 struct audit_chunk *chunk;
117 size_t size;
118 int i;
119
120 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
121 chunk = kzalloc(size, GFP_KERNEL);
122 if (!chunk)
123 return NULL;
124
125 INIT_LIST_HEAD(&chunk->hash);
126 INIT_LIST_HEAD(&chunk->trees);
127 chunk->count = count;
128 atomic_long_set(&chunk->refs, 1);
129 for (i = 0; i < count; i++) {
130 INIT_LIST_HEAD(&chunk->owners[i].list);
131 chunk->owners[i].index = i;
132 }
133 inotify_init_watch(&chunk->watch);
134 return chunk;
135}
136
137static void free_chunk(struct audit_chunk *chunk) 114static void free_chunk(struct audit_chunk *chunk)
138{ 115{
139 int i; 116 int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
157 audit_put_chunk(chunk); 134 audit_put_chunk(chunk);
158} 135}
159 136
137static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
138{
139 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
140 call_rcu(&chunk->head, __put_chunk);
141}
142
143static struct audit_chunk *alloc_chunk(int count)
144{
145 struct audit_chunk *chunk;
146 size_t size;
147 int i;
148
149 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
150 chunk = kzalloc(size, GFP_KERNEL);
151 if (!chunk)
152 return NULL;
153
154 INIT_LIST_HEAD(&chunk->hash);
155 INIT_LIST_HEAD(&chunk->trees);
156 chunk->count = count;
157 atomic_long_set(&chunk->refs, 1);
158 for (i = 0; i < count; i++) {
159 INIT_LIST_HEAD(&chunk->owners[i].list);
160 chunk->owners[i].index = i;
161 }
162 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
163 return chunk;
164}
165
160enum {HASH_SIZE = 128}; 166enum {HASH_SIZE = 128};
161static struct list_head chunk_hash_heads[HASH_SIZE]; 167static struct list_head chunk_hash_heads[HASH_SIZE];
162static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); 168static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
167 return chunk_hash_heads + n % HASH_SIZE; 173 return chunk_hash_heads + n % HASH_SIZE;
168} 174}
169 175
170/* hash_lock is held by caller */ 176/* hash_lock & entry->lock is held by caller */
171static void insert_hash(struct audit_chunk *chunk) 177static void insert_hash(struct audit_chunk *chunk)
172{ 178{
173 struct list_head *list = chunk_hash(chunk->watch.inode); 179 struct fsnotify_mark *entry = &chunk->mark;
180 struct list_head *list;
181
182 if (!entry->i.inode)
183 return;
184 list = chunk_hash(entry->i.inode);
174 list_add_rcu(&chunk->hash, list); 185 list_add_rcu(&chunk->hash, list);
175} 186}
176 187
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
181 struct audit_chunk *p; 192 struct audit_chunk *p;
182 193
183 list_for_each_entry_rcu(p, list, hash) { 194 list_for_each_entry_rcu(p, list, hash) {
184 if (p->watch.inode == inode) { 195 /* mark.inode may have gone NULL, but who cares? */
196 if (p->mark.i.inode == inode) {
185 atomic_long_inc(&p->refs); 197 atomic_long_inc(&p->refs);
186 return p; 198 return p;
187 } 199 }
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
210static void untag_chunk(struct node *p) 222static void untag_chunk(struct node *p)
211{ 223{
212 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark;
213 struct audit_chunk *new; 226 struct audit_chunk *new;
214 struct audit_tree *owner; 227 struct audit_tree *owner;
215 int size = chunk->count - 1; 228 int size = chunk->count - 1;
216 int i, j; 229 int i, j;
217 230
218 if (!pin_inotify_watch(&chunk->watch)) { 231 fsnotify_get_mark(entry);
219 /*
220 * Filesystem is shutting down; all watches are getting
221 * evicted, just take it off the node list for this
222 * tree and let the eviction logics take care of the
223 * rest.
224 */
225 owner = p->owner;
226 if (owner->root == chunk) {
227 list_del_init(&owner->same_root);
228 owner->root = NULL;
229 }
230 list_del_init(&p->list);
231 p->owner = NULL;
232 put_tree(owner);
233 return;
234 }
235 232
236 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
237 234
238 /* 235 spin_lock(&entry->lock);
239 * pin_inotify_watch() succeeded, so the watch won't go away 236 if (chunk->dead || !entry->i.inode) {
240 * from under us. 237 spin_unlock(&entry->lock);
241 */
242 mutex_lock(&chunk->watch.inode->inotify_mutex);
243 if (chunk->dead) {
244 mutex_unlock(&chunk->watch.inode->inotify_mutex);
245 goto out; 238 goto out;
246 } 239 }
247 240
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
256 list_del_init(&p->list); 249 list_del_init(&p->list);
257 list_del_rcu(&chunk->hash); 250 list_del_rcu(&chunk->hash);
258 spin_unlock(&hash_lock); 251 spin_unlock(&hash_lock);
259 inotify_evict_watch(&chunk->watch); 252 spin_unlock(&entry->lock);
260 mutex_unlock(&chunk->watch.inode->inotify_mutex); 253 fsnotify_destroy_mark(entry);
261 put_inotify_watch(&chunk->watch); 254 fsnotify_put_mark(entry);
262 goto out; 255 goto out;
263 } 256 }
264 257
265 new = alloc_chunk(size); 258 new = alloc_chunk(size);
266 if (!new) 259 if (!new)
267 goto Fallback; 260 goto Fallback;
268 if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { 261 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
269 free_chunk(new); 263 free_chunk(new);
270 goto Fallback; 264 goto Fallback;
271 } 265 }
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
298 list_for_each_entry(owner, &new->trees, same_root) 292 list_for_each_entry(owner, &new->trees, same_root)
299 owner->root = new; 293 owner->root = new;
300 spin_unlock(&hash_lock); 294 spin_unlock(&hash_lock);
301 inotify_evict_watch(&chunk->watch); 295 spin_unlock(&entry->lock);
302 mutex_unlock(&chunk->watch.inode->inotify_mutex); 296 fsnotify_destroy_mark(entry);
303 put_inotify_watch(&chunk->watch); 297 fsnotify_put_mark(entry);
304 goto out; 298 goto out;
305 299
306Fallback: 300Fallback:
@@ -314,31 +308,33 @@ Fallback:
314 p->owner = NULL; 308 p->owner = NULL;
315 put_tree(owner); 309 put_tree(owner);
316 spin_unlock(&hash_lock); 310 spin_unlock(&hash_lock);
317 mutex_unlock(&chunk->watch.inode->inotify_mutex); 311 spin_unlock(&entry->lock);
318out: 312out:
319 unpin_inotify_watch(&chunk->watch); 313 fsnotify_put_mark(entry);
320 spin_lock(&hash_lock); 314 spin_lock(&hash_lock);
321} 315}
322 316
323static int create_chunk(struct inode *inode, struct audit_tree *tree) 317static int create_chunk(struct inode *inode, struct audit_tree *tree)
324{ 318{
319 struct fsnotify_mark *entry;
325 struct audit_chunk *chunk = alloc_chunk(1); 320 struct audit_chunk *chunk = alloc_chunk(1);
326 if (!chunk) 321 if (!chunk)
327 return -ENOMEM; 322 return -ENOMEM;
328 323
329 if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { 324 entry = &chunk->mark;
325 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
330 free_chunk(chunk); 326 free_chunk(chunk);
331 return -ENOSPC; 327 return -ENOSPC;
332 } 328 }
333 329
334 mutex_lock(&inode->inotify_mutex); 330 spin_lock(&entry->lock);
335 spin_lock(&hash_lock); 331 spin_lock(&hash_lock);
336 if (tree->goner) { 332 if (tree->goner) {
337 spin_unlock(&hash_lock); 333 spin_unlock(&hash_lock);
338 chunk->dead = 1; 334 chunk->dead = 1;
339 inotify_evict_watch(&chunk->watch); 335 spin_unlock(&entry->lock);
340 mutex_unlock(&inode->inotify_mutex); 336 fsnotify_destroy_mark(entry);
341 put_inotify_watch(&chunk->watch); 337 fsnotify_put_mark(entry);
342 return 0; 338 return 0;
343 } 339 }
344 chunk->owners[0].index = (1U << 31); 340 chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
351 } 347 }
352 insert_hash(chunk); 348 insert_hash(chunk);
353 spin_unlock(&hash_lock); 349 spin_unlock(&hash_lock);
354 mutex_unlock(&inode->inotify_mutex); 350 spin_unlock(&entry->lock);
355 return 0; 351 return 0;
356} 352}
357 353
358/* the first tagged inode becomes root of tree */ 354/* the first tagged inode becomes root of tree */
359static int tag_chunk(struct inode *inode, struct audit_tree *tree) 355static int tag_chunk(struct inode *inode, struct audit_tree *tree)
360{ 356{
361 struct inotify_watch *watch; 357 struct fsnotify_mark *old_entry, *chunk_entry;
362 struct audit_tree *owner; 358 struct audit_tree *owner;
363 struct audit_chunk *chunk, *old; 359 struct audit_chunk *chunk, *old;
364 struct node *p; 360 struct node *p;
365 int n; 361 int n;
366 362
367 if (inotify_find_watch(rtree_ih, inode, &watch) < 0) 363 old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
364 if (!old_entry)
368 return create_chunk(inode, tree); 365 return create_chunk(inode, tree);
369 366
370 old = container_of(watch, struct audit_chunk, watch); 367 old = container_of(old_entry, struct audit_chunk, mark);
371 368
372 /* are we already there? */ 369 /* are we already there? */
373 spin_lock(&hash_lock); 370 spin_lock(&hash_lock);
374 for (n = 0; n < old->count; n++) { 371 for (n = 0; n < old->count; n++) {
375 if (old->owners[n].owner == tree) { 372 if (old->owners[n].owner == tree) {
376 spin_unlock(&hash_lock); 373 spin_unlock(&hash_lock);
377 put_inotify_watch(&old->watch); 374 fsnotify_put_mark(old_entry);
378 return 0; 375 return 0;
379 } 376 }
380 } 377 }
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
382 379
383 chunk = alloc_chunk(old->count + 1); 380 chunk = alloc_chunk(old->count + 1);
384 if (!chunk) { 381 if (!chunk) {
385 put_inotify_watch(&old->watch); 382 fsnotify_put_mark(old_entry);
386 return -ENOMEM; 383 return -ENOMEM;
387 } 384 }
388 385
389 mutex_lock(&inode->inotify_mutex); 386 chunk_entry = &chunk->mark;
390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 387
391 mutex_unlock(&inode->inotify_mutex); 388 spin_lock(&old_entry->lock);
392 put_inotify_watch(&old->watch); 389 if (!old_entry->i.inode) {
390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry);
393 free_chunk(chunk); 393 free_chunk(chunk);
394 return -ENOENT;
395 }
396
397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock);
400 free_chunk(chunk);
401 fsnotify_put_mark(old_entry);
394 return -ENOSPC; 402 return -ENOSPC;
395 } 403 }
404
405 /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
406 spin_lock(&chunk_entry->lock);
396 spin_lock(&hash_lock); 407 spin_lock(&hash_lock);
408
409 /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
397 if (tree->goner) { 410 if (tree->goner) {
398 spin_unlock(&hash_lock); 411 spin_unlock(&hash_lock);
399 chunk->dead = 1; 412 chunk->dead = 1;
400 inotify_evict_watch(&chunk->watch); 413 spin_unlock(&chunk_entry->lock);
401 mutex_unlock(&inode->inotify_mutex); 414 spin_unlock(&old_entry->lock);
402 put_inotify_watch(&old->watch); 415
403 put_inotify_watch(&chunk->watch); 416 fsnotify_destroy_mark(chunk_entry);
417
418 fsnotify_put_mark(chunk_entry);
419 fsnotify_put_mark(old_entry);
404 return 0; 420 return 0;
405 } 421 }
406 list_replace_init(&old->trees, &chunk->trees); 422 list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
426 list_add(&tree->same_root, &chunk->trees); 442 list_add(&tree->same_root, &chunk->trees);
427 } 443 }
428 spin_unlock(&hash_lock); 444 spin_unlock(&hash_lock);
429 inotify_evict_watch(&old->watch); 445 spin_unlock(&chunk_entry->lock);
430 mutex_unlock(&inode->inotify_mutex); 446 spin_unlock(&old_entry->lock);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 447 fsnotify_destroy_mark(old_entry);
432 put_inotify_watch(&old->watch); /* and kill it */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 fsnotify_put_mark(old_entry); /* and kill it */
433 return 0; 450 return 0;
434} 451}
435 452
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
584 601
585 spin_lock(&hash_lock); 602 spin_lock(&hash_lock);
586 list_for_each_entry(node, &tree->chunks, list) { 603 list_for_each_entry(node, &tree->chunks, list) {
587 struct inode *inode = find_chunk(node)->watch.inode; 604 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dieing else where... */
606 struct inode *inode = chunk->mark.i.inode;
588 node->index |= 1U<<31; 607 node->index |= 1U<<31;
589 if (iterate_mounts(compare_root, inode, root_mnt)) 608 if (iterate_mounts(compare_root, inode, root_mnt))
590 node->index &= ~(1U<<31); 609 node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
846 * Here comes the stuff asynchronous to auditctl operations 865 * Here comes the stuff asynchronous to auditctl operations
847 */ 866 */
848 867
849/* inode->inotify_mutex is locked */
850static void evict_chunk(struct audit_chunk *chunk) 868static void evict_chunk(struct audit_chunk *chunk)
851{ 869{
852 struct audit_tree *owner; 870 struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
885 mutex_unlock(&audit_filter_mutex); 903 mutex_unlock(&audit_filter_mutex);
886} 904}
887 905
888static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, 906static int audit_tree_handle_event(struct fsnotify_group *group,
889 u32 cookie, const char *dname, struct inode *inode) 907 struct fsnotify_mark *inode_mark,
908 struct fsnotify_mark *vfsmonut_mark,
909 struct fsnotify_event *event)
910{
911 BUG();
912 return -EOPNOTSUPP;
913}
914
915static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
890{ 916{
891 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 917 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
892 918
893 if (mask & IN_IGNORED) { 919 evict_chunk(chunk);
894 evict_chunk(chunk); 920 fsnotify_put_mark(entry);
895 put_inotify_watch(watch);
896 }
897} 921}
898 922
899static void destroy_watch(struct inotify_watch *watch) 923static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
924 struct fsnotify_mark *inode_mark,
925 struct fsnotify_mark *vfsmount_mark,
926 __u32 mask, void *data, int data_type)
900{ 927{
901 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 928 return false;
902 call_rcu(&chunk->head, __put_chunk);
903} 929}
904 930
905static const struct inotify_operations rtree_inotify_ops = { 931static const struct fsnotify_ops audit_tree_ops = {
906 .handle_event = handle_event, 932 .handle_event = audit_tree_handle_event,
907 .destroy_watch = destroy_watch, 933 .should_send_event = audit_tree_send_event,
934 .free_group_priv = NULL,
935 .free_event_priv = NULL,
936 .freeing_mark = audit_tree_freeing_mark,
908}; 937};
909 938
910static int __init audit_tree_init(void) 939static int __init audit_tree_init(void)
911{ 940{
912 int i; 941 int i;
913 942
914 rtree_ih = inotify_init(&rtree_inotify_ops); 943 audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
915 if (IS_ERR(rtree_ih)) 944 if (IS_ERR(audit_tree_group))
916 audit_panic("cannot initialize inotify handle for rectree watches"); 945 audit_panic("cannot initialize fsnotify group for rectree watches");
917 946
918 for (i = 0; i < HASH_SIZE; i++) 947 for (i = 0; i < HASH_SIZE; i++)
919 INIT_LIST_HEAD(&chunk_hash_heads[i]); 948 INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fsnotify_backend.h>
27#include <linux/namei.h> 28#include <linux/namei.h>
28#include <linux/netlink.h> 29#include <linux/netlink.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31#include <linux/inotify.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include "audit.h" 33#include "audit.h"
34 34
35/* 35/*
36 * Reference counting: 36 * Reference counting:
37 * 37 *
38 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED 38 * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
39 * event. Each audit_watch holds a reference to its associated parent. 39 * event. Each audit_watch holds a reference to its associated parent.
40 * 40 *
41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to 41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
51 unsigned long ino; /* associated inode number */ 51 unsigned long ino; /* associated inode number */
52 struct audit_parent *parent; /* associated parent */ 52 struct audit_parent *parent; /* associated parent */
53 struct list_head wlist; /* entry in parent->watches list */ 53 struct list_head wlist; /* entry in parent->watches list */
54 struct list_head rules; /* associated rules */ 54 struct list_head rules; /* anchor for krule->rlist */
55}; 55};
56 56
57struct audit_parent { 57struct audit_parent {
58 struct list_head ilist; /* entry in inotify registration list */ 58 struct list_head watches; /* anchor for audit_watch->wlist */
59 struct list_head watches; /* associated watches */ 59 struct fsnotify_mark mark; /* fsnotify mark on the inode */
60 struct inotify_watch wdata; /* inotify watch data */
61 unsigned flags; /* status flags */
62}; 60};
63 61
64/* Inotify handle. */ 62/* fsnotify handle. */
65struct inotify_handle *audit_ih; 63struct fsnotify_group *audit_watch_group;
66 64
67/* 65/* fsnotify events we care about. */
68 * audit_parent status flags: 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
69 * 67 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
70 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
71 * a filesystem event to ensure we're adding audit watches to a valid parent.
72 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
73 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
74 * we can receive while holding nameidata.
75 */
76#define AUDIT_PARENT_INVALID 0x001
77 68
78/* Inotify events we care about. */ 69static void audit_free_parent(struct audit_parent *parent)
79#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 70{
71 WARN_ON(!list_empty(&parent->watches));
72 kfree(parent);
73}
80 74
81static void audit_free_parent(struct inotify_watch *i_watch) 75static void audit_watch_free_mark(struct fsnotify_mark *entry)
82{ 76{
83 struct audit_parent *parent; 77 struct audit_parent *parent;
84 78
85 parent = container_of(i_watch, struct audit_parent, wdata); 79 parent = container_of(entry, struct audit_parent, mark);
86 WARN_ON(!list_empty(&parent->watches)); 80 audit_free_parent(parent);
87 kfree(parent); 81}
82
83static void audit_get_parent(struct audit_parent *parent)
84{
85 if (likely(parent))
86 fsnotify_get_mark(&parent->mark);
87}
88
89static void audit_put_parent(struct audit_parent *parent)
90{
91 if (likely(parent))
92 fsnotify_put_mark(&parent->mark);
93}
94
95/*
96 * Find and return the audit_parent on the given inode. If found a reference
97 * is taken on this parent.
98 */
99static inline struct audit_parent *audit_find_parent(struct inode *inode)
100{
101 struct audit_parent *parent = NULL;
102 struct fsnotify_mark *entry;
103
104 entry = fsnotify_find_inode_mark(audit_watch_group, inode);
105 if (entry)
106 parent = container_of(entry, struct audit_parent, mark);
107
108 return parent;
88} 109}
89 110
90void audit_get_watch(struct audit_watch *watch) 111void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
105void audit_remove_watch(struct audit_watch *watch) 126void audit_remove_watch(struct audit_watch *watch)
106{ 127{
107 list_del(&watch->wlist); 128 list_del(&watch->wlist);
108 put_inotify_watch(&watch->parent->wdata); 129 audit_put_parent(watch->parent);
109 watch->parent = NULL; 130 watch->parent = NULL;
110 audit_put_watch(watch); /* match initial get */ 131 audit_put_watch(watch); /* match initial get */
111} 132}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
115 return watch->path; 136 return watch->path;
116} 137}
117 138
118struct list_head *audit_watch_rules(struct audit_watch *watch) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
119{
120 return &watch->rules;
121}
122
123unsigned long audit_watch_inode(struct audit_watch *watch)
124{ 140{
125 return watch->ino; 141 return (watch->ino != (unsigned long)-1) &&
126} 142 (watch->ino == ino) &&
127 143 (watch->dev == dev);
128dev_t audit_watch_dev(struct audit_watch *watch)
129{
130 return watch->dev;
131} 144}
132 145
133/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
134static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct nameidata *ndp)
135{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode;
136 struct audit_parent *parent; 150 struct audit_parent *parent;
137 s32 wd; 151 int ret;
138 152
139 parent = kzalloc(sizeof(*parent), GFP_KERNEL); 153 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
140 if (unlikely(!parent)) 154 if (unlikely(!parent))
141 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
142 156
143 INIT_LIST_HEAD(&parent->watches); 157 INIT_LIST_HEAD(&parent->watches);
144 parent->flags = 0; 158
145 159 fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
146 inotify_init_watch(&parent->wdata); 160 parent->mark.mask = AUDIT_FS_WATCH;
147 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ 161 ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
148 get_inotify_watch(&parent->wdata); 162 if (ret < 0) {
149 wd = inotify_add_watch(audit_ih, &parent->wdata, 163 audit_free_parent(parent);
150 ndp->path.dentry->d_inode, AUDIT_IN_WATCH); 164 return ERR_PTR(ret);
151 if (wd < 0) {
152 audit_free_parent(&parent->wdata);
153 return ERR_PTR(wd);
154 } 165 }
155 166
156 return parent; 167 return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
179{ 190{
180 struct audit_watch *watch; 191 struct audit_watch *watch;
181 192
182 if (!audit_ih) 193 if (!audit_watch_group)
183 return -EOPNOTSUPP; 194 return -EOPNOTSUPP;
184 195
185 if (path[0] != '/' || path[len-1] == '/' || 196 if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
217 228
218 new->dev = old->dev; 229 new->dev = old->dev;
219 new->ino = old->ino; 230 new->ino = old->ino;
220 get_inotify_watch(&old->parent->wdata); 231 audit_get_parent(old->parent);
221 new->parent = old->parent; 232 new->parent = old->parent;
222 233
223out: 234out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
251 struct audit_entry *oentry, *nentry; 262 struct audit_entry *oentry, *nentry;
252 263
253 mutex_lock(&audit_filter_mutex); 264 mutex_lock(&audit_filter_mutex);
265 /* Run all of the watches on this parent looking for the one that
266 * matches the given dname */
254 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
255 if (audit_compare_dname_path(dname, owatch->path, NULL)) 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
256 continue; 269 continue;
257 270
258 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
259 * filtering now, so we don't omit records. */ 272 * filtering now, so we don't omit records. */
260 if (invalidating && current->audit_context) 273 if (invalidating && !audit_dummy_context())
261 audit_filter_inodes(current, current->audit_context); 274 audit_filter_inodes(current, current->audit_context);
262 275
276 /* updating ino will likely change which audit_hash_list we
277 * are on so we need a new watch for the new list */
263 nwatch = audit_dupe_watch(owatch); 278 nwatch = audit_dupe_watch(owatch);
264 if (IS_ERR(nwatch)) { 279 if (IS_ERR(nwatch)) {
265 mutex_unlock(&audit_filter_mutex); 280 mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
275 list_del(&oentry->rule.rlist); 290 list_del(&oentry->rule.rlist);
276 list_del_rcu(&oentry->list); 291 list_del_rcu(&oentry->list);
277 292
278 nentry = audit_dupe_rule(&oentry->rule, nwatch); 293 nentry = audit_dupe_rule(&oentry->rule);
279 if (IS_ERR(nentry)) { 294 if (IS_ERR(nentry)) {
280 list_del(&oentry->rule.list); 295 list_del(&oentry->rule.list);
281 audit_panic("error updating watch, removing"); 296 audit_panic("error updating watch, removing");
282 } else { 297 } else {
283 int h = audit_hash_ino((u32)ino); 298 int h = audit_hash_ino((u32)ino);
299
300 /*
301 * nentry->rule.watch == oentry->rule.watch so
302 * we must drop that reference and set it to our
303 * new watch.
304 */
305 audit_put_watch(nentry->rule.watch);
306 audit_get_watch(nwatch);
307 nentry->rule.watch = nwatch;
284 list_add(&nentry->rule.rlist, &nwatch->rules); 308 list_add(&nentry->rule.rlist, &nwatch->rules);
285 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 309 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
286 list_replace(&oentry->rule.list, 310 list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
312 struct audit_entry *e; 336 struct audit_entry *e;
313 337
314 mutex_lock(&audit_filter_mutex); 338 mutex_lock(&audit_filter_mutex);
315 parent->flags |= AUDIT_PARENT_INVALID;
316 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 339 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
317 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 340 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
318 e = container_of(r, struct audit_entry, rule); 341 e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
325 audit_remove_watch(w); 348 audit_remove_watch(w);
326 } 349 }
327 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
328}
329
330/* Unregister inotify watches for parents on in_list.
331 * Generates an IN_IGNORED event. */
332void audit_inotify_unregister(struct list_head *in_list)
333{
334 struct audit_parent *p, *n;
335 351
336 list_for_each_entry_safe(p, n, in_list, ilist) { 352 fsnotify_destroy_mark(&parent->mark);
337 list_del(&p->ilist);
338 inotify_rm_watch(audit_ih, &p->wdata);
339 /* the unpin matching the pin in audit_do_del_rule() */
340 unpin_inotify_watch(&p->wdata);
341 }
342} 353}
343 354
344/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
389 } 400 }
390} 401}
391 402
392/* Associate the given rule with an existing parent inotify_watch. 403/* Associate the given rule with an existing parent.
393 * Caller must hold audit_filter_mutex. */ 404 * Caller must hold audit_filter_mutex. */
394static void audit_add_to_parent(struct audit_krule *krule, 405static void audit_add_to_parent(struct audit_krule *krule,
395 struct audit_parent *parent) 406 struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
397 struct audit_watch *w, *watch = krule->watch; 408 struct audit_watch *w, *watch = krule->watch;
398 int watch_found = 0; 409 int watch_found = 0;
399 410
411 BUG_ON(!mutex_is_locked(&audit_filter_mutex));
412
400 list_for_each_entry(w, &parent->watches, wlist) { 413 list_for_each_entry(w, &parent->watches, wlist) {
401 if (strcmp(watch->path, w->path)) 414 if (strcmp(watch->path, w->path))
402 continue; 415 continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
413 } 426 }
414 427
415 if (!watch_found) { 428 if (!watch_found) {
416 get_inotify_watch(&parent->wdata); 429 audit_get_parent(parent);
417 watch->parent = parent; 430 watch->parent = parent;
418 431
419 list_add(&watch->wlist, &parent->watches); 432 list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
423 436
424/* Find a matching watch entry, or add this one. 437/* Find a matching watch entry, or add this one.
425 * Caller must hold audit_filter_mutex. */ 438 * Caller must hold audit_filter_mutex. */
426int audit_add_watch(struct audit_krule *krule) 439int audit_add_watch(struct audit_krule *krule, struct list_head **list)
427{ 440{
428 struct audit_watch *watch = krule->watch; 441 struct audit_watch *watch = krule->watch;
429 struct inotify_watch *i_watch;
430 struct audit_parent *parent; 442 struct audit_parent *parent;
431 struct nameidata *ndp = NULL, *ndw = NULL; 443 struct nameidata *ndp = NULL, *ndw = NULL;
432 int ret = 0; 444 int h, ret = 0;
433 445
434 mutex_unlock(&audit_filter_mutex); 446 mutex_unlock(&audit_filter_mutex);
435 447
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
441 goto error; 453 goto error;
442 } 454 }
443 455
456 mutex_lock(&audit_filter_mutex);
457
444 /* update watch filter fields */ 458 /* update watch filter fields */
445 if (ndw) { 459 if (ndw) {
446 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; 460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
447 watch->ino = ndw->path.dentry->d_inode->i_ino; 461 watch->ino = ndw->path.dentry->d_inode->i_ino;
448 } 462 }
449 463
450 /* The audit_filter_mutex must not be held during inotify calls because 464 /* either find an old parent or attach a new one */
451 * we hold it during inotify event callback processing. If an existing 465 parent = audit_find_parent(ndp->path.dentry->d_inode);
452 * inotify watch is found, inotify_find_watch() grabs a reference before 466 if (!parent) {
453 * returning.
454 */
455 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
456 &i_watch) < 0) {
457 parent = audit_init_parent(ndp); 467 parent = audit_init_parent(ndp);
458 if (IS_ERR(parent)) { 468 if (IS_ERR(parent)) {
459 /* caller expects mutex locked */
460 mutex_lock(&audit_filter_mutex);
461 ret = PTR_ERR(parent); 469 ret = PTR_ERR(parent);
462 goto error; 470 goto error;
463 } 471 }
464 } else 472 }
465 parent = container_of(i_watch, struct audit_parent, wdata);
466
467 mutex_lock(&audit_filter_mutex);
468 473
469 /* parent was moved before we took audit_filter_mutex */ 474 audit_add_to_parent(krule, parent);
470 if (parent->flags & AUDIT_PARENT_INVALID)
471 ret = -ENOENT;
472 else
473 audit_add_to_parent(krule, parent);
474 475
475 /* match get in audit_init_parent or inotify_find_watch */ 476 /* match get in audit_find_parent or audit_init_parent */
476 put_inotify_watch(&parent->wdata); 477 audit_put_parent(parent);
477 478
479 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h];
478error: 481error:
479 audit_put_nd(ndp, ndw); /* NULL args OK */ 482 audit_put_nd(ndp, ndw); /* NULL args OK */
480 return ret; 483 return ret;
481 484
482} 485}
483 486
484void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) 487void audit_remove_watch_rule(struct audit_krule *krule)
485{ 488{
486 struct audit_watch *watch = krule->watch; 489 struct audit_watch *watch = krule->watch;
487 struct audit_parent *parent = watch->parent; 490 struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
492 audit_remove_watch(watch); 495 audit_remove_watch(watch);
493 496
 		if (list_empty(&parent->watches)) {
-			/* Put parent on the inotify un-registration
-			 * list.  Grab a reference before releasing
-			 * audit_filter_mutex, to be released in
-			 * audit_inotify_unregister().
-			 * If filesystem is going away, just leave
-			 * the sucker alone, eviction will take
-			 * care of it. */
-			if (pin_inotify_watch(&parent->wdata))
-				list_add(&parent->ilist, list);
+			audit_get_parent(parent);
+			fsnotify_destroy_mark(&parent->mark);
+			audit_put_parent(parent);
 		}
 	}
 }
 
-/* Update watch data in audit rules based on inotify events. */
-static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
-			 u32 cookie, const char *dname, struct inode *inode)
+static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
+					  struct fsnotify_mark *inode_mark,
+					  struct fsnotify_mark *vfsmount_mark,
+					  __u32 mask, void *data, int data_type)
+{
+	return true;
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+				    struct fsnotify_mark *inode_mark,
+				    struct fsnotify_mark *vfsmount_mark,
+				    struct fsnotify_event *event)
 {
+	struct inode *inode;
+	__u32 mask = event->mask;
+	const char *dname = event->file_name;
 	struct audit_parent *parent;
 
-	parent = container_of(i_watch, struct audit_parent, wdata);
+	parent = container_of(inode_mark, struct audit_parent, mark);
 
-	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
-		audit_update_watch(parent, dname, inode->i_sb->s_dev,
-				   inode->i_ino, 0);
-	else if (mask & (IN_DELETE|IN_MOVED_FROM))
+	BUG_ON(group != audit_watch_group);
+
+	switch (event->data_type) {
+	case (FSNOTIFY_EVENT_PATH):
+		inode = event->path.dentry->d_inode;
+		break;
+	case (FSNOTIFY_EVENT_INODE):
+		inode = event->inode;
+		break;
+	default:
+		BUG();
+		inode = NULL;
+		break;
+	};
+
+	if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+		audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+	else if (mask & (FS_DELETE|FS_MOVED_FROM))
 		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
-	/* inotify automatically removes the watch and sends IN_IGNORED */
-	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
-		audit_remove_parent_watches(parent);
-	/* inotify does not remove the watch, so remove it manually */
-	else if(mask & IN_MOVE_SELF) {
+	else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
 		audit_remove_parent_watches(parent);
-		inotify_remove_watch_locked(audit_ih, i_watch);
-	} else if (mask & IN_IGNORED)
-		put_inotify_watch(i_watch);
+
+	return 0;
 }
 
-static const struct inotify_operations audit_inotify_ops = {
-	.handle_event	= audit_handle_ievent,
-	.destroy_watch	= audit_free_parent,
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+	.should_send_event =	audit_watch_should_send_event,
+	.handle_event =		audit_watch_handle_event,
+	.free_group_priv =	NULL,
+	.freeing_mark =		NULL,
+	.free_event_priv =	NULL,
 };
 
 static int __init audit_watch_init(void)
 {
-	audit_ih = inotify_init(&audit_inotify_ops);
-	if (IS_ERR(audit_ih))
-		audit_panic("cannot initialize inotify handle");
+	audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+	if (IS_ERR(audit_watch_group)) {
+		audit_watch_group = NULL;
+		audit_panic("cannot create audit fsnotify group");
+	}
 	return 0;
 }
-subsys_initcall(audit_watch_init);
+device_initcall(audit_watch_init);
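
The audit_watch.c hunk above swaps the inotify handle and inotify_operations for an fsnotify group and fsnotify_ops. As a hedged sketch of the same registration pattern, using only the signatures visible in the diff (the example_* names and the initcall wrapping are hypothetical):

	#include <linux/fsnotify_backend.h>
	#include <linux/init.h>
	#include <linux/err.h>

	static struct fsnotify_group *example_group;

	static bool example_should_send_event(struct fsnotify_group *group, struct inode *inode,
					      struct fsnotify_mark *inode_mark,
					      struct fsnotify_mark *vfsmount_mark,
					      __u32 mask, void *data, int data_type)
	{
		return true;	/* ask for every event on marked objects */
	}

	static int example_handle_event(struct fsnotify_group *group,
					struct fsnotify_mark *inode_mark,
					struct fsnotify_mark *vfsmount_mark,
					struct fsnotify_event *event)
	{
		/* react to event->mask / event->file_name here */
		return 0;
	}

	static const struct fsnotify_ops example_fsnotify_ops = {
		.should_send_event = example_should_send_event,
		.handle_event	   = example_handle_event,
	};

	static int __init example_init(void)
	{
		example_group = fsnotify_alloc_group(&example_fsnotify_ops);
		return IS_ERR(example_group) ? PTR_ERR(example_group) : 0;
	}
	device_initcall(example_init);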
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
71{ 71{
72 int i; 72 int i;
73 struct audit_krule *erule = &e->rule; 73 struct audit_krule *erule = &e->rule;
74
74 /* some rules don't have associated watches */ 75 /* some rules don't have associated watches */
75 if (erule->watch) 76 if (erule->watch)
76 audit_put_watch(erule->watch); 77 audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
746 * rule with the new rule in the filterlist, then free the old rule. 747 * rule with the new rule in the filterlist, then free the old rule.
747 * The rlist element is undefined; list manipulations are handled apart from 748 * The rlist element is undefined; list manipulations are handled apart from
748 * the initial copy. */ 749 * the initial copy. */
749struct audit_entry *audit_dupe_rule(struct audit_krule *old, 750struct audit_entry *audit_dupe_rule(struct audit_krule *old)
750 struct audit_watch *watch)
751{ 751{
752 u32 fcount = old->field_count; 752 u32 fcount = old->field_count;
753 struct audit_entry *entry; 753 struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
769 new->prio = old->prio; 769 new->prio = old->prio;
770 new->buflen = old->buflen; 770 new->buflen = old->buflen;
771 new->inode_f = old->inode_f; 771 new->inode_f = old->inode_f;
772 new->watch = NULL;
773 new->field_count = old->field_count; 772 new->field_count = old->field_count;
773
774 /* 774 /*
775 * note that we are OK with not refcounting here; audit_match_tree() 775 * note that we are OK with not refcounting here; audit_match_tree()
776 * never dereferences tree and we can't get false positives there 776 * never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
811 } 811 }
812 } 812 }
813 813
814 if (watch) { 814 if (old->watch) {
815 audit_get_watch(watch); 815 audit_get_watch(old->watch);
816 new->watch = watch; 816 new->watch = old->watch;
817 } 817 }
818 818
819 return entry; 819 return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
866 struct audit_watch *watch = entry->rule.watch; 866 struct audit_watch *watch = entry->rule.watch;
867 struct audit_tree *tree = entry->rule.tree; 867 struct audit_tree *tree = entry->rule.tree;
868 struct list_head *list; 868 struct list_head *list;
869 int h, err; 869 int err;
870#ifdef CONFIG_AUDITSYSCALL 870#ifdef CONFIG_AUDITSYSCALL
871 int dont_count = 0; 871 int dont_count = 0;
872 872
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
889 889
890 if (watch) { 890 if (watch) {
891 /* audit_filter_mutex is dropped and re-taken during this call */ 891 /* audit_filter_mutex is dropped and re-taken during this call */
892 err = audit_add_watch(&entry->rule); 892 err = audit_add_watch(&entry->rule, &list);
893 if (err) { 893 if (err) {
894 mutex_unlock(&audit_filter_mutex); 894 mutex_unlock(&audit_filter_mutex);
895 goto error; 895 goto error;
896 } 896 }
897 /* entry->rule.watch may have changed during audit_add_watch() */
898 watch = entry->rule.watch;
899 h = audit_hash_ino((u32)audit_watch_inode(watch));
900 list = &audit_inode_hash[h];
901 } 897 }
902 if (tree) { 898 if (tree) {
903 err = audit_add_tree_rule(&entry->rule); 899 err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
949 struct audit_watch *watch = entry->rule.watch; 945 struct audit_watch *watch = entry->rule.watch;
950 struct audit_tree *tree = entry->rule.tree; 946 struct audit_tree *tree = entry->rule.tree;
951 struct list_head *list; 947 struct list_head *list;
952 LIST_HEAD(inotify_list);
953 int ret = 0; 948 int ret = 0;
954#ifdef CONFIG_AUDITSYSCALL 949#ifdef CONFIG_AUDITSYSCALL
955 int dont_count = 0; 950 int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
969 } 964 }
970 965
971 if (e->rule.watch) 966 if (e->rule.watch)
972 audit_remove_watch_rule(&e->rule, &inotify_list); 967 audit_remove_watch_rule(&e->rule);
973 968
974 if (e->rule.tree) 969 if (e->rule.tree)
975 audit_remove_tree_rule(&e->rule); 970 audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
987#endif 982#endif
988 mutex_unlock(&audit_filter_mutex); 983 mutex_unlock(&audit_filter_mutex);
989 984
990 if (!list_empty(&inotify_list))
991 audit_inotify_unregister(&inotify_list);
992
993out: 985out:
994 if (watch) 986 if (watch)
995 audit_put_watch(watch); /* match initial get */ 987 audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
1323{ 1315{
1324 struct audit_entry *entry = container_of(r, struct audit_entry, rule); 1316 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1325 struct audit_entry *nentry; 1317 struct audit_entry *nentry;
1326 struct audit_watch *watch;
1327 struct audit_tree *tree;
1328 int err = 0; 1318 int err = 0;
1329 1319
1330 if (!security_audit_rule_known(r)) 1320 if (!security_audit_rule_known(r))
1331 return 0; 1321 return 0;
1332 1322
1333 watch = r->watch; 1323 nentry = audit_dupe_rule(r);
1334 tree = r->tree;
1335 nentry = audit_dupe_rule(r, watch);
1336 if (IS_ERR(nentry)) { 1324 if (IS_ERR(nentry)) {
1337 /* save the first error encountered for the 1325 /* save the first error encountered for the
1338 * return value */ 1326 * return value */
1339 err = PTR_ERR(nentry); 1327 err = PTR_ERR(nentry);
1340 audit_panic("error updating LSM filters"); 1328 audit_panic("error updating LSM filters");
1341 if (watch) 1329 if (r->watch)
1342 list_del(&r->rlist); 1330 list_del(&r->rlist);
1343 list_del_rcu(&entry->list); 1331 list_del_rcu(&entry->list);
1344 list_del(&r->list); 1332 list_del(&r->list);
1345 } else { 1333 } else {
1346 if (watch) { 1334 if (r->watch || r->tree)
1347 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1348 list_del(&r->rlist);
1349 } else if (tree)
1350 list_replace_init(&r->rlist, &nentry->rule.rlist); 1335 list_replace_init(&r->rlist, &nentry->rule.rlist);
1351 list_replace_rcu(&entry->list, &nentry->list); 1336 list_replace_rcu(&entry->list, &nentry->list);
1352 list_replace(&r->list, &nentry->rule.list); 1337 list_replace(&r->list, &nentry->rule.list);
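
With audit_dupe_rule() taking just the old rule and update_lsm_rule() treating watch and tree rules alike, the swap of the duplicated entry hinges on list_replace_init(). A small hedged illustration of that helper's semantics (the demo_rule structure is made up; list_replace_init() itself is the standard <linux/list.h> primitive):

	#include <linux/list.h>

	struct demo_rule {
		struct list_head rlist;
	};

	static void demo_swap(struct demo_rule *old_rule, struct demo_rule *new_rule)
	{
		/*
		 * new_rule takes over old_rule's position in whatever list
		 * old_rule->rlist was linked into; old_rule->rlist comes back
		 * as an empty, reinitialized head.  This is what lets
		 * update_lsm_rule() move a rule's rlist membership to the
		 * freshly duplicated entry in one step.
		 */
		list_replace_init(&old_rule->rlist, &new_rule->rlist);
	}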
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..1b31c130d034 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
65#include <linux/binfmts.h> 65#include <linux/binfmts.h>
66#include <linux/highmem.h> 66#include <linux/highmem.h>
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/inotify.h>
69#include <linux/capability.h> 68#include <linux/capability.h>
70#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
71 70
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
549 } 548 }
550 break; 549 break;
551 case AUDIT_WATCH: 550 case AUDIT_WATCH:
552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) 551 if (name)
553 result = (name->dev == audit_watch_dev(rule->watch) && 552 result = audit_watch_compare(rule->watch, name->ino, name->dev);
554 name->ino == audit_watch_inode(rule->watch));
555 break; 553 break;
556 case AUDIT_DIR: 554 case AUDIT_DIR:
557 if (ctx) 555 if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
1726 struct audit_tree_refs *p; 1724 struct audit_tree_refs *p;
1727 struct audit_chunk *chunk; 1725 struct audit_chunk *chunk;
1728 int count; 1726 int count;
1729 if (likely(list_empty(&inode->inotify_watches))) 1727 if (likely(hlist_empty(&inode->i_fsnotify_marks)))
1730 return; 1728 return;
1731 context = current->audit_context; 1729 context = current->audit_context;
1732 p = context->trees; 1730 p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
1769 seq = read_seqbegin(&rename_lock); 1767 seq = read_seqbegin(&rename_lock);
1770 for(;;) { 1768 for(;;) {
1771 struct inode *inode = d->d_inode; 1769 struct inode *inode = d->d_inode;
1772 if (inode && unlikely(!list_empty(&inode->inotify_watches))) { 1770 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
1773 struct audit_chunk *chunk; 1771 struct audit_chunk *chunk;
1774 chunk = audit_tree_lookup(inode); 1772 chunk = audit_tree_lookup(inode);
1775 if (chunk) { 1773 if (chunk) {
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name)
1837 context->names[context->name_count].ino = (unsigned long)-1; 1835 context->names[context->name_count].ino = (unsigned long)-1;
1838 context->names[context->name_count].osid = 0; 1836 context->names[context->name_count].osid = 0;
1839 ++context->name_count; 1837 ++context->name_count;
1840 if (!context->pwd.dentry) { 1838 if (!context->pwd.dentry)
1841 read_lock(&current->fs->lock); 1839 get_fs_pwd(current->fs, &context->pwd);
1842 context->pwd = current->fs->pwd;
1843 path_get(&current->fs->pwd);
1844 read_unlock(&current->fs->lock);
1845 }
1846
1847} 1840}
1848 1841
1849/* audit_putname - intercept a putname request 1842/* audit_putname - intercept a putname request
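
__audit_getname() now grabs the working directory through get_fs_pwd() instead of open-coding the lock/copy/path_get sequence shown in the removed lines. Roughly, and assuming the fs->lock spinlock conversion visible later in this merge, the helper behaves like the approximation below (the real one lives in include/linux/fs_struct.h):

	static inline void example_get_fs_pwd(struct fs_struct *fs, struct path *pwd)
	{
		spin_lock(&fs->lock);
		*pwd = fs->pwd;	/* copy the path ... */
		path_get(pwd);	/* ... and take a reference while still locked */
		spin_unlock(&fs->lock);
	}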
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3ac6f5b0a64b..192f88c5b0f9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1102,7 +1102,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1102 if (opts->release_agent) 1102 if (opts->release_agent)
1103 return -EINVAL; 1103 return -EINVAL;
1104 opts->release_agent = 1104 opts->release_agent =
1105 kstrndup(token + 14, PATH_MAX, GFP_KERNEL); 1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1106 if (!opts->release_agent)
1107 return -ENOMEM; 1107 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
@@ -1123,7 +1123,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123 if (opts->name) 1123 if (opts->name)
1124 return -EINVAL; 1124 return -EINVAL;
1125 opts->name = kstrndup(name, 1125 opts->name = kstrndup(name,
1126 MAX_CGROUP_ROOT_NAMELEN, 1126 MAX_CGROUP_ROOT_NAMELEN - 1,
1127 GFP_KERNEL); 1127 GFP_KERNEL);
1128 if (!opts->name) 1128 if (!opts->name)
1129 return -ENOMEM; 1129 return -ENOMEM;
@@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = {
1623 .kill_sb = cgroup_kill_sb, 1623 .kill_sb = cgroup_kill_sb,
1624}; 1624};
1625 1625
1626static struct kobject *cgroup_kobj;
1627
1626static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1628static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1627{ 1629{
1628 return dentry->d_fsdata; 1630 return dentry->d_fsdata;
@@ -1788,6 +1790,29 @@ out:
1788 return retval; 1790 return retval;
1789} 1791}
1790 1792
1793/**
1794 * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
1795 * @tsk: the task to be attached
1796 */
1797int cgroup_attach_task_current_cg(struct task_struct *tsk)
1798{
1799 struct cgroupfs_root *root;
1800 struct cgroup *cur_cg;
1801 int retval = 0;
1802
1803 cgroup_lock();
1804 for_each_active_root(root) {
1805 cur_cg = task_cgroup_from_root(current, root);
1806 retval = cgroup_attach_task(cur_cg, tsk);
1807 if (retval)
1808 break;
1809 }
1810 cgroup_unlock();
1811
1812 return retval;
1813}
1814EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
1815
1791/* 1816/*
1792 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1817 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1793 * held. May take task_lock of task 1818 * held. May take task_lock of task
@@ -3871,9 +3896,18 @@ int __init cgroup_init(void)
3871 hhead = css_set_hash(init_css_set.subsys); 3896 hhead = css_set_hash(init_css_set.subsys);
3872 hlist_add_head(&init_css_set.hlist, hhead); 3897 hlist_add_head(&init_css_set.hlist, hhead);
3873 BUG_ON(!init_root_id(&rootnode)); 3898 BUG_ON(!init_root_id(&rootnode));
3899
3900 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
3901 if (!cgroup_kobj) {
3902 err = -ENOMEM;
3903 goto out;
3904 }
3905
3874 err = register_filesystem(&cgroup_fs_type); 3906 err = register_filesystem(&cgroup_fs_type);
3875 if (err < 0) 3907 if (err < 0) {
3908 kobject_put(cgroup_kobj);
3876 goto out; 3909 goto out;
3910 }
3877 3911
3878 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 3912 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
3879 3913
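
cgroup_attach_task_current_cg() is added and exported so that code which spawns helper threads can charge them to the caller's cgroups in every active hierarchy. A hedged usage sketch, with error handling trimmed (the worker-spawning wrapper is hypothetical; only the attach call comes from the diff):

	#include <linux/cgroup.h>
	#include <linux/kthread.h>
	#include <linux/err.h>

	static int example_spawn_worker(int (*fn)(void *), void *data)
	{
		struct task_struct *tsk;
		int ret;

		tsk = kthread_create(fn, data, "example-worker");
		if (IS_ERR(tsk))
			return PTR_ERR(tsk);

		/* Move the worker into the same cgroups as the current task. */
		ret = cgroup_attach_task_current_cg(tsk);

		wake_up_process(tsk);
		return ret;
	}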
diff --git a/kernel/compat.c b/kernel/compat.c
index 5adab05a3172..e167efce8423 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 279 struct compat_rlimit __user *rlim)
280{ 280{
281 struct rlimit r; 281 struct rlimit r;
282 int ret;
283 mm_segment_t old_fs = get_fs ();
284
285 if (resource >= RLIM_NLIMITS)
286 return -EINVAL;
287 282
288 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || 283 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
289 __get_user(r.rlim_cur, &rlim->rlim_cur) || 284 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
294 r.rlim_cur = RLIM_INFINITY; 289 r.rlim_cur = RLIM_INFINITY;
295 if (r.rlim_max == COMPAT_RLIM_INFINITY) 290 if (r.rlim_max == COMPAT_RLIM_INFINITY)
296 r.rlim_max = RLIM_INFINITY; 291 r.rlim_max = RLIM_INFINITY;
297 set_fs(KERNEL_DS); 292 return do_prlimit(current, resource, &r, NULL);
298 ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
299 set_fs(old_fs);
300 return ret;
301} 293}
302 294
303#ifdef COMPAT_RLIM_OLD_INFINITY 295#ifdef COMPAT_RLIM_OLD_INFINITY
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
329 321
330#endif 322#endif
331 323
332asmlinkage long compat_sys_getrlimit (unsigned int resource, 324asmlinkage long compat_sys_getrlimit(unsigned int resource,
333 struct compat_rlimit __user *rlim) 325 struct compat_rlimit __user *rlim)
334{ 326{
335 struct rlimit r; 327 struct rlimit r;
336 int ret; 328 int ret;
337 mm_segment_t old_fs = get_fs();
338 329
339 set_fs(KERNEL_DS); 330 ret = do_prlimit(current, resource, NULL, &r);
340 ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
341 set_fs(old_fs);
342 if (!ret) { 331 if (!ret) {
343 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 332 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
344 r.rlim_cur = COMPAT_RLIM_INFINITY; 333 r.rlim_cur = COMPAT_RLIM_INFINITY;
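
Both compat paths now hand kernel-space struct rlimit pointers straight to do_prlimit() instead of faking user pointers with set_fs(KERNEL_DS) around sys_{set,get}rlimit(). The same helper can read and write in one call; a hedged in-kernel sketch (RLIMIT_NOFILE chosen arbitrarily):

	#include <linux/resource.h>
	#include <linux/sched.h>

	static int example_bump_nofile(unsigned long cur, unsigned long max)
	{
		struct rlimit new_rlim = { .rlim_cur = cur, .rlim_max = max };
		struct rlimit old_rlim;

		/* Either the new or the old pointer may also be NULL. */
		return do_prlimit(current, RLIMIT_NOFILE, &new_rlim, &old_rlim);
	}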
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
235 return -EINVAL; 235 return -EINVAL;
236 236
237 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
238 set_cpu_active(cpu, false);
239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
240 if (err) { 239 if (err) {
241 set_cpu_active(cpu, true);
242
243 nr_calls--; 240 nr_calls--;
244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 241 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
245 printk("%s: attempt to take down CPU %u failed\n", 242 printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 246
250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 247 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
251 if (err) { 248 if (err) {
252 set_cpu_active(cpu, true);
253 /* CPU didn't die: tell everyone. Can't complain. */ 249 /* CPU didn't die: tell everyone. Can't complain. */
254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 250 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
255 251
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 goto out_notify; 317 goto out_notify;
322 BUG_ON(!cpu_online(cpu)); 318 BUG_ON(!cpu_online(cpu));
323 319
324 set_cpu_active(cpu, true);
325
326 /* Now call notifier in preparation. */ 320 /* Now call notifier in preparation. */
327 cpu_notify(CPU_ONLINE | mod, hcpu); 321 cpu_notify(CPU_ONLINE | mod, hcpu);
328 322
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
105 /* for custom sched domain */ 105 /* for custom sched domain */
106 int relax_domain_level; 106 int relax_domain_level;
107 107
108 /* used for walking a cpuset heirarchy */ 108 /* used for walking a cpuset hierarchy */
109 struct list_head stack_list; 109 struct list_head stack_list;
110}; 110};
111 111
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2113 * but making no active use of cpusets. 2113 * but making no active use of cpusets.
2114 * 2114 *
2115 * This routine ensures that top_cpuset.cpus_allowed tracks 2115 * This routine ensures that top_cpuset.cpus_allowed tracks
2116 * cpu_online_map on each CPU hotplug (cpuhp) event. 2116 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2117 * 2117 *
2118 * Called within get_online_cpus(). Needs to call cgroup_lock() 2118 * Called within get_online_cpus(). Needs to call cgroup_lock()
2119 * before calling generate_sched_domains(). 2119 * before calling generate_sched_domains().
2120 */ 2120 */
2121static int cpuset_track_online_cpus(struct notifier_block *unused_nb, 2121void cpuset_update_active_cpus(void)
2122 unsigned long phase, void *unused_cpu)
2123{ 2122{
2124 struct sched_domain_attr *attr; 2123 struct sched_domain_attr *attr;
2125 cpumask_var_t *doms; 2124 cpumask_var_t *doms;
2126 int ndoms; 2125 int ndoms;
2127 2126
2128 switch (phase) {
2129 case CPU_ONLINE:
2130 case CPU_ONLINE_FROZEN:
2131 case CPU_DOWN_PREPARE:
2132 case CPU_DOWN_PREPARE_FROZEN:
2133 case CPU_DOWN_FAILED:
2134 case CPU_DOWN_FAILED_FROZEN:
2135 break;
2136
2137 default:
2138 return NOTIFY_DONE;
2139 }
2140
2141 cgroup_lock(); 2127 cgroup_lock();
2142 mutex_lock(&callback_mutex); 2128 mutex_lock(&callback_mutex);
2143 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2129 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2148 2134
2149 /* Have scheduler rebuild the domains */ 2135 /* Have scheduler rebuild the domains */
2150 partition_sched_domains(ndoms, doms, attr); 2136 partition_sched_domains(ndoms, doms, attr);
2151
2152 return NOTIFY_OK;
2153} 2137}
2154 2138
2155#ifdef CONFIG_MEMORY_HOTPLUG 2139#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
2203 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2187 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2204 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2188 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2205 2189
2206 hotcpu_notifier(cpuset_track_online_cpus, 0);
2207 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2190 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2208 2191
2209 cpuset_wq = create_singlethread_workqueue("cpuset"); 2192 cpuset_wq = create_singlethread_workqueue("cpuset");
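
cpuset no longer registers its own hotcpu notifier; cpuset_update_active_cpus() is instead meant to be invoked from the scheduler's hotplug path once cpu_active_mask has settled. That caller is not part of this diff, so the sketch below is only an approximation of what such a hook looks like (names hypothetical; the real hook sits in kernel/sched.c):

	#include <linux/cpu.h>
	#include <linux/cpuset.h>
	#include <linux/notifier.h>

	static int example_cpuset_cpu_active(struct notifier_block *nfb,
					     unsigned long action, void *hcpu)
	{
		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_ONLINE:
		case CPU_DOWN_FAILED:
			cpuset_update_active_cpus();
			return NOTIFY_OK;
		default:
			return NOTIFY_DONE;
		}
	}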
diff --git a/kernel/cred.c b/kernel/cred.c
index 60bc8b1e32e6..9a3e22641fe7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -22,10 +22,6 @@
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else 24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \ 25#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 26 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif 27#endif
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 8bc5eeffec8a..de407c78178d 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -605,6 +605,8 @@ cpu_master_loop:
605 if (dbg_kdb_mode) { 605 if (dbg_kdb_mode) {
606 kgdb_connected = 1; 606 kgdb_connected = 1;
607 error = kdb_stub(ks); 607 error = kdb_stub(ks);
608 if (error == -1)
609 continue;
608 kgdb_connected = 0; 610 kgdb_connected = 0;
609 } else { 611 } else {
610 error = gdb_serial_stub(ks); 612 error = gdb_serial_stub(ks);
@@ -739,7 +741,7 @@ static struct console kgdbcons = {
739}; 741};
740 742
741#ifdef CONFIG_MAGIC_SYSRQ 743#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty) 744static void sysrq_handle_dbg(int key)
743{ 745{
744 if (!dbg_io_ops) { 746 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 747 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
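
sysrq handlers drop their tty argument in this cycle, which is why sysrq_handle_dbg() now takes only the key. Under the new signature a handler and its key op look roughly like this (the example op is hypothetical; the field names are the usual sysrq_key_op ones):

	#include <linux/sysrq.h>
	#include <linux/kernel.h>

	static void example_sysrq_handler(int key)
	{
		printk(KERN_INFO "example sysrq: triggered by '%c'\n", key);
	}

	static struct sysrq_key_op example_sysrq_op = {
		.handler	= example_sysrq_handler,
		.help_msg	= "example(y)",
		.action_msg	= "Example sysrq action",
	};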
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e8fd6868682d..481a7bd2dfe7 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES +
52 * GDB remote protocol parser: 52 * GDB remote protocol parser:
53 */ 53 */
54 54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB 55#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void) 56static int gdbstub_read_wait(void)
68{ 57{
@@ -123,8 +112,8 @@ static void get_packet(char *buffer)
123 buffer[count] = 0; 112 buffer[count] = 0;
124 113
125 if (ch == '#') { 114 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4; 115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait()); 116 xmitcsum += hex_to_bin(gdbstub_read_wait());
128 117
129 if (checksum != xmitcsum) 118 if (checksum != xmitcsum)
130 /* failed checksum */ 119 /* failed checksum */
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
  * buf. Return a pointer to the last char put in buf (null). May
  * return an error.
  */
-int kgdb_mem2hex(char *mem, char *buf, int count)
+char *kgdb_mem2hex(char *mem, char *buf, int count)
 {
 	char *tmp;
 	int err;
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
 	tmp = buf + count;
 
 	err = probe_kernel_read(tmp, mem, count);
-	if (!err) {
-		while (count > 0) {
-			buf = pack_hex_byte(buf, *tmp);
-			tmp++;
-			count--;
-		}
-
-		*buf = 0;
-	}
+	if (err)
+		return NULL;
+	while (count > 0) {
+		buf = pack_hex_byte(buf, *tmp);
+		tmp++;
+		count--;
+	}
+	*buf = 0;
 
-	return err;
+	return buf;
 }
263 251
264/* 252/*
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
280 tmp_hex = tmp_raw - 1; 268 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) { 269 while (tmp_hex >= buf) {
282 tmp_raw--; 270 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--); 271 *tmp_raw = hex_to_bin(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4; 272 *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
285 } 273 }
286 274
287 return probe_kernel_write(mem, tmp_raw, count); 275 return probe_kernel_write(mem, tmp_raw, count);
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
304 (*ptr)++; 292 (*ptr)++;
305 } 293 }
306 while (**ptr) { 294 while (**ptr) {
307 hex_val = hex(**ptr); 295 hex_val = hex_to_bin(**ptr);
308 if (hex_val < 0) 296 if (hex_val < 0)
309 break; 297 break;
310 298
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
339 return probe_kernel_write(mem, c, size); 327 return probe_kernel_write(mem, c, size);
340} 328}
341 329
330#if DBG_MAX_REG_NUM > 0
331void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
332{
333 int i;
334 int idx = 0;
335 char *ptr = (char *)gdb_regs;
336
337 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
338 dbg_get_reg(i, ptr + idx, regs);
339 idx += dbg_reg_def[i].size;
340 }
341}
342
343void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
344{
345 int i;
346 int idx = 0;
347 char *ptr = (char *)gdb_regs;
348
349 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
350 dbg_set_reg(i, ptr + idx, regs);
351 idx += dbg_reg_def[i].size;
352 }
353}
354#endif /* DBG_MAX_REG_NUM > 0 */
355
342/* Write memory due to an 'M' or 'X' packet. */ 356/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary) 357static int write_mem_msg(int binary)
344{ 358{
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error)
378 * remapped to negative TIDs. 392 * remapped to negative TIDs.
379 */ 393 */
380 394
-#define BUF_THREAD_ID_SIZE	16
+#define BUF_THREAD_ID_SIZE	8
 
 static char *pack_threadid(char *pkt, unsigned char *id)
 {
-	char *limit;
+	unsigned char *limit;
+	int lzero = 1;
+
+	limit = id + (BUF_THREAD_ID_SIZE / 2);
+	while (id < limit) {
+		if (!lzero || *id != 0) {
+			pkt = pack_hex_byte(pkt, *id);
+			lzero = 0;
+		}
+		id++;
+	}
 
-	limit = pkt + BUF_THREAD_ID_SIZE;
-	while (pkt < limit)
-		pkt = pack_hex_byte(pkt, *id++);
+	if (lzero)
+		pkt = pack_hex_byte(pkt, 0);
 
 	return pkt;
 }
 
 static void int_to_threadref(unsigned char *id, int value)
 {
-	unsigned char *scan;
-	int i = 4;
-
-	scan = (unsigned char *)id;
-	while (i--)
-		*scan++ = 0;
-	put_unaligned_be32(value, scan);
+	put_unaligned_be32(value, id);
 }
404 421
405static struct task_struct *getthread(struct pt_regs *regs, int tid) 422static struct task_struct *getthread(struct pt_regs *regs, int tid)
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 480 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464} 481}
465 482
466/* Handle the 'g' get registers request */ 483static void gdb_get_regs_helper(struct kgdb_state *ks)
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{ 484{
469 struct task_struct *thread; 485 struct task_struct *thread;
470 void *local_debuggerinfo; 486 void *local_debuggerinfo;
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
505 */ 521 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread); 522 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 } 523 }
524}
525
526/* Handle the 'g' get registers request */
527static void gdb_cmd_getregs(struct kgdb_state *ks)
528{
529 gdb_get_regs_helper(ks);
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); 530 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509} 531}
510 532
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
527 char *ptr = &remcom_in_buffer[1]; 549 char *ptr = &remcom_in_buffer[1];
528 unsigned long length; 550 unsigned long length;
529 unsigned long addr; 551 unsigned long addr;
530 int err; 552 char *err;
531 553
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && 554 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) { 555 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); 556 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err) 557 if (!err)
536 error_packet(remcom_out_buffer, err); 558 error_packet(remcom_out_buffer, -EINVAL);
537 } else { 559 } else {
538 error_packet(remcom_out_buffer, -EINVAL); 560 error_packet(remcom_out_buffer, -EINVAL);
539 } 561 }
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
550 strcpy(remcom_out_buffer, "OK"); 572 strcpy(remcom_out_buffer, "OK");
551} 573}
552 574
575#if DBG_MAX_REG_NUM > 0
576static char *gdb_hex_reg_helper(int regnum, char *out)
577{
578 int i;
579 int offset = 0;
580
581 for (i = 0; i < regnum; i++)
582 offset += dbg_reg_def[i].size;
583 return kgdb_mem2hex((char *)gdb_regs + offset, out,
584 dbg_reg_def[i].size);
585}
586
587/* Handle the 'p' individual register get */
588static void gdb_cmd_reg_get(struct kgdb_state *ks)
589{
590 unsigned long regnum;
591 char *ptr = &remcom_in_buffer[1];
592
593 kgdb_hex2long(&ptr, &regnum);
594 if (regnum >= DBG_MAX_REG_NUM) {
595 error_packet(remcom_out_buffer, -EINVAL);
596 return;
597 }
598 gdb_get_regs_helper(ks);
599 gdb_hex_reg_helper(regnum, remcom_out_buffer);
600}
601
602/* Handle the 'P' individual register set */
603static void gdb_cmd_reg_set(struct kgdb_state *ks)
604{
605 unsigned long regnum;
606 char *ptr = &remcom_in_buffer[1];
607 int i = 0;
608
609 kgdb_hex2long(&ptr, &regnum);
610 if (*ptr++ != '=' ||
611 !(!kgdb_usethread || kgdb_usethread == current) ||
612 !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
613 error_packet(remcom_out_buffer, -EINVAL);
614 return;
615 }
616 memset(gdb_regs, 0, sizeof(gdb_regs));
617 while (i < sizeof(gdb_regs) * 2)
618 if (hex_to_bin(ptr[i]) >= 0)
619 i++;
620 else
621 break;
622 i = i / 2;
623 kgdb_hex2mem(ptr, (char *)gdb_regs, i);
624 dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
625 strcpy(remcom_out_buffer, "OK");
626}
627#endif /* DBG_MAX_REG_NUM > 0 */
628
553/* Handle the 'X' memory binary write bytes */ 629/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks) 630static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{ 631{
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
612{ 688{
613 struct task_struct *g; 689 struct task_struct *g;
614 struct task_struct *p; 690 struct task_struct *p;
615 unsigned char thref[8]; 691 unsigned char thref[BUF_THREAD_ID_SIZE];
616 char *ptr; 692 char *ptr;
617 int i; 693 int i;
618 int cpu; 694 int cpu;
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
632 for_each_online_cpu(cpu) { 708 for_each_online_cpu(cpu) {
633 ks->thr_query = 0; 709 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2); 710 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref); 711 ptr = pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ','; 712 *(ptr++) = ',';
638 i++; 713 i++;
639 } 714 }
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
642 do_each_thread(g, p) { 717 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) { 718 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid); 719 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref); 720 ptr = pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ','; 721 *(ptr++) = ',';
648 ks->thr_query++; 722 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) 723 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
858 int error = 0; 932 int error = 0;
859 int tmp; 933 int tmp;
860 934
861 /* Clear the out buffer. */ 935 /* Initialize comm buffer and globals. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); 936 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
937 kgdb_usethread = kgdb_info[ks->cpu].task;
938 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
939 ks->pass_exception = 0;
863 940
864 if (kgdb_connected) { 941 if (kgdb_connected) {
865 unsigned char thref[8]; 942 unsigned char thref[BUF_THREAD_ID_SIZE];
866 char *ptr; 943 char *ptr;
867 944
868 /* Reply to host that an exception has occurred */ 945 /* Reply to host that an exception has occurred */
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
876 put_packet(remcom_out_buffer); 953 put_packet(remcom_out_buffer);
877 } 954 }
878 955
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) { 956 while (1) {
884 error = 0; 957 error = 0;
885 958
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 977 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks); 978 gdb_cmd_memwrite(ks);
906 break; 979 break;
980#if DBG_MAX_REG_NUM > 0
981 case 'p': /* pXX Return gdb register XX (in hex) */
982 gdb_cmd_reg_get(ks);
983 break;
984 case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
985 gdb_cmd_reg_set(ks);
986 break;
987#endif /* DBG_MAX_REG_NUM > 0 */
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 988 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks); 989 gdb_cmd_binwrite(ks);
909 break; 990 break;
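
The new 'p'/'P' packet handlers and the generic pt_regs_to_gdb_regs()/gdb_regs_to_pt_regs() loops are all driven by the per-arch dbg_reg_def[] table (only built when DBG_MAX_REG_NUM > 0). Each entry names one register and records its size and pt_regs offset; a hypothetical fragment modeled on an x86-64 style table:

	#include <linux/kgdb.h>
	#include <linux/ptrace.h>
	#include <linux/stddef.h>

	struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
		{ "ax", 8, offsetof(struct pt_regs, ax) },
		{ "bx", 8, offsetof(struct pt_regs, bx) },
		{ "cx", 8, offsetof(struct pt_regs, cx) },
		/* ... one entry per register exported to gdb ... */
	};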
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..caf057a3de0e 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
312 312
313 if (endp == arg) { 313 if (endp == arg) {
314 /* 314 /*
315 * Try base 16, for us folks too lazy to type the 315 * Also try base 16, for us folks too lazy to type the
316 * leading 0x... 316 * leading 0x...
317 */ 317 */
318 val = simple_strtoul(arg, &endp, 16); 318 val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
325 return 0; 325 return 0;
326} 326}
327 327
328int kdbgetu64arg(const char *arg, u64 *value)
329{
330 char *endp;
331 u64 val;
332
333 val = simple_strtoull(arg, &endp, 0);
334
335 if (endp == arg) {
336
337 val = simple_strtoull(arg, &endp, 16);
338 if (endp == arg)
339 return KDB_BADINT;
340 }
341
342 *value = val;
343
344 return 0;
345}
346
328/* 347/*
329 * kdb_set - This function implements the 'set' command. Alter an 348 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one. 349 * existing environment variable or create a new one.
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
1770 */ 1789 */
1771static int kdb_rd(int argc, const char **argv) 1790static int kdb_rd(int argc, const char **argv)
1772{ 1791{
1773 int diag = kdb_check_regs(); 1792 int len = kdb_check_regs();
1774 if (diag) 1793#if DBG_MAX_REG_NUM > 0
1775 return diag; 1794 int i;
1795 char *rname;
1796 int rsize;
1797 u64 reg64;
1798 u32 reg32;
1799 u16 reg16;
1800 u8 reg8;
1801
1802 if (len)
1803 return len;
1804
1805 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1806 rsize = dbg_reg_def[i].size * 2;
1807 if (rsize > 16)
1808 rsize = 2;
1809 if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
1810 len = 0;
1811 kdb_printf("\n");
1812 }
1813 if (len)
1814 len += kdb_printf(" ");
1815 switch(dbg_reg_def[i].size * 8) {
1816 case 8:
1817 rname = dbg_get_reg(i, &reg8, kdb_current_regs);
1818 if (!rname)
1819 break;
1820 len += kdb_printf("%s: %02x", rname, reg8);
1821 break;
1822 case 16:
1823 rname = dbg_get_reg(i, &reg16, kdb_current_regs);
1824 if (!rname)
1825 break;
1826 len += kdb_printf("%s: %04x", rname, reg16);
1827 break;
1828 case 32:
1829 rname = dbg_get_reg(i, &reg32, kdb_current_regs);
1830 if (!rname)
1831 break;
1832 len += kdb_printf("%s: %08x", rname, reg32);
1833 break;
1834 case 64:
1835 rname = dbg_get_reg(i, &reg64, kdb_current_regs);
1836 if (!rname)
1837 break;
1838 len += kdb_printf("%s: %016llx", rname, reg64);
1839 break;
1840 default:
1841 len += kdb_printf("%s: ??", dbg_reg_def[i].name);
1842 }
1843 }
1844 kdb_printf("\n");
1845#else
1846 if (len)
1847 return len;
1776 1848
1777 kdb_dumpregs(kdb_current_regs); 1849 kdb_dumpregs(kdb_current_regs);
1850#endif
1778 return 0; 1851 return 0;
1779} 1852}
1780 1853
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
1782 * kdb_rm - This function implements the 'rm' (register modify) command. 1855 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents 1856 * rm register-name new-contents
1784 * Remarks: 1857 * Remarks:
1785 * Currently doesn't allow modification of control or 1858 * Allows register modification with the same restrictions as gdb
1786 * debug registers.
1787 */ 1859 */
1788static int kdb_rm(int argc, const char **argv) 1860static int kdb_rm(int argc, const char **argv)
1789{ 1861{
1862#if DBG_MAX_REG_NUM > 0
1790 int diag; 1863 int diag;
1791 int ind = 0; 1864 const char *rname;
1792 unsigned long contents; 1865 int i;
1866 u64 reg64;
1867 u32 reg32;
1868 u16 reg16;
1869 u8 reg8;
1793 1870
1794 if (argc != 2) 1871 if (argc != 2)
1795 return KDB_ARGCOUNT; 1872 return KDB_ARGCOUNT;
1796 /* 1873 /*
1797 * Allow presence or absence of leading '%' symbol. 1874 * Allow presence or absence of leading '%' symbol.
1798 */ 1875 */
1799 if (argv[1][0] == '%') 1876 rname = argv[1];
1800 ind = 1; 1877 if (*rname == '%')
1878 rname++;
1801 1879
1802 diag = kdbgetularg(argv[2], &contents); 1880 diag = kdbgetu64arg(argv[2], &reg64);
1803 if (diag) 1881 if (diag)
1804 return diag; 1882 return diag;
1805 1883
1806 diag = kdb_check_regs(); 1884 diag = kdb_check_regs();
1807 if (diag) 1885 if (diag)
1808 return diag; 1886 return diag;
1887
1888 diag = KDB_BADREG;
1889 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1890 if (strcmp(rname, dbg_reg_def[i].name) == 0) {
1891 diag = 0;
1892 break;
1893 }
1894 }
1895 if (!diag) {
1896 switch(dbg_reg_def[i].size * 8) {
1897 case 8:
1898 reg8 = reg64;
1899 dbg_set_reg(i, &reg8, kdb_current_regs);
1900 break;
1901 case 16:
1902 reg16 = reg64;
1903 dbg_set_reg(i, &reg16, kdb_current_regs);
1904 break;
1905 case 32:
1906 reg32 = reg64;
1907 dbg_set_reg(i, &reg32, kdb_current_regs);
1908 break;
1909 case 64:
1910 dbg_set_reg(i, &reg64, kdb_current_regs);
1911 break;
1912 }
1913 }
1914 return diag;
1915#else
1809 kdb_printf("ERROR: Register set currently not implemented\n"); 1916 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0; 1917 return 0;
1918#endif
1811} 1919}
1812 1920
1813#if defined(CONFIG_MAGIC_SYSRQ) 1921#if defined(CONFIG_MAGIC_SYSRQ)
@@ -1821,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv)
1821 if (argc != 1) 1929 if (argc != 1)
1822 return KDB_ARGCOUNT; 1930 return KDB_ARGCOUNT;
1823 kdb_trap_printk++; 1931 kdb_trap_printk++;
1824 __handle_sysrq(*argv[1], NULL, 0); 1932 __handle_sysrq(*argv[1], false);
1825 kdb_trap_printk--; 1933 kdb_trap_printk--;
1826 1934
1827 return 0; 1935 return 0;
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val)
2440 */ 2548 */
2441static int kdb_summary(int argc, const char **argv) 2549static int kdb_summary(int argc, const char **argv)
2442{ 2550{
2551 struct timespec now;
2443 struct kdb_tm tm; 2552 struct kdb_tm tm;
2444 struct sysinfo val; 2553 struct sysinfo val;
2445 2554
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv)
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname); 2563 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION)); 2564 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456 2565
2457 kdb_gmtime(&xtime, &tm); 2566 now = __current_kernel_time();
2567 kdb_gmtime(&now, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " 2568 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n", 2569 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, 2570 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
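
kdb_main.c also gains kdbgetu64arg(), a 64-bit sibling of kdbgetularg() with the same implicit base-16 fallback; kdb_rm() uses it so register values are not truncated on 32-bit kernels. A hedged sketch of a kdb command handler using it, written as if it sat next to the other commands in kernel/debug/kdb/ (the command itself is hypothetical):

	static int kdb_example_cmd(int argc, const char **argv)
	{
		u64 val;
		int diag;

		if (argc != 1)
			return KDB_ARGCOUNT;

		diag = kdbgetu64arg(argv[1], &val);
		if (diag)
			return diag;

		kdb_printf("value: %016llx\n", val);
		return 0;
	}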
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..be775f7e81e0 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 144extern int kdb_putword(unsigned long, unsigned long, size_t);
145 145
146extern int kdbgetularg(const char *, unsigned long *); 146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *); 147extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **); 149 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *); 150extern int kdbgetsymval(const char *, kdb_symtab_t *);
@@ -257,7 +255,14 @@ extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val); 255extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void); 257extern void kdb_meminfo_proc_show(void);
258#ifdef CONFIG_KALLSYMS
260extern const char *kdb_walk_kallsyms(loff_t *pos); 259extern const char *kdb_walk_kallsyms(loff_t *pos);
260#else /* ! CONFIG_KALLSYMS */
261static inline const char *kdb_walk_kallsyms(loff_t *pos)
262{
263 return NULL;
264}
265#endif /* ! CONFIG_KALLSYMS */
261extern char *kdb_getstr(char *, size_t, char *); 266extern char *kdb_getstr(char *, size_t, char *);
262 267
263/* Defines for kdb_symbol_print */ 268/* Defines for kdb_symbol_print */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 45344d5c53dd..6b2485dcb050 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) 82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{ 83{
84 int ret = 0; 84 int ret = 0;
85 unsigned long symbolsize; 85 unsigned long symbolsize = 0;
86 unsigned long offset; 86 unsigned long offset = 0;
87#define knt1_size 128 /* must be >= kallsyms table size */ 87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL; 88 char *knt1 = NULL;
89 89
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index dd62f8e714ca..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -134,23 +134,14 @@ unregister:
134 return 0; 134 return 0;
135} 135}
136 136
-int
-__set_personality(unsigned int personality)
+int __set_personality(unsigned int personality)
 {
-	struct exec_domain *ep, *oep;
-
-	ep = lookup_exec_domain(personality);
-	if (ep == current_thread_info()->exec_domain) {
-		current->personality = personality;
-		module_put(ep->module);
-		return 0;
-	}
+	struct exec_domain *oep = current_thread_info()->exec_domain;
 
+	current_thread_info()->exec_domain = lookup_exec_domain(personality);
 	current->personality = personality;
-	oep = current_thread_info()->exec_domain;
-	current_thread_info()->exec_domain = ep;
-
 	module_put(oep->module);
+
 	return 0;
 }
156 147
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 183{
193 unsigned int old = current->personality; 184 unsigned int old = current->personality;
194 185
195 if (personality != 0xffffffff) { 186 if (personality != 0xffffffff)
196 set_personality(personality); 187 set_personality(personality);
197 if (current->personality != personality)
198 return -EINVAL;
199 }
200 188
201 return old; 189 return old;
202} 190}
diff --git a/kernel/exit.c b/kernel/exit.c
index ceffc67b564a..03120229db28 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father)
771 struct task_struct *p, *n, *reaper; 771 struct task_struct *p, *n, *reaper;
772 LIST_HEAD(dead_children); 772 LIST_HEAD(dead_children);
773 773
774 exit_ptrace(father);
775
776 write_lock_irq(&tasklist_lock); 774 write_lock_irq(&tasklist_lock);
775 /*
776 * Note that exit_ptrace() and find_new_reaper() might
777 * drop tasklist_lock and reacquire it.
778 */
779 exit_ptrace(father);
777 reaper = find_new_reaper(father); 780 reaper = find_new_reaper(father);
778 781
779 list_for_each_entry_safe(p, n, &father->children, sibling) { 782 list_for_each_entry_safe(p, n, &father->children, sibling) {
@@ -1383,8 +1386,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1383 if (!unlikely(wo->wo_flags & WNOWAIT)) 1386 if (!unlikely(wo->wo_flags & WNOWAIT))
1384 *p_code = 0; 1387 *p_code = 0;
1385 1388
1386 /* don't need the RCU readlock here as we're holding a spinlock */ 1389 uid = task_uid(p);
1387 uid = __task_cred(p)->uid;
1388unlock_sig: 1390unlock_sig:
1389 spin_unlock_irq(&p->sighand->siglock); 1391 spin_unlock_irq(&p->sighand->siglock);
1390 if (!exit_code) 1392 if (!exit_code)
@@ -1457,7 +1459,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1457 } 1459 }
1458 if (!unlikely(wo->wo_flags & WNOWAIT)) 1460 if (!unlikely(wo->wo_flags & WNOWAIT))
1459 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1461 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1460 uid = __task_cred(p)->uid; 1462 uid = task_uid(p);
1461 spin_unlock_irq(&p->sighand->siglock); 1463 spin_unlock_irq(&p->sighand->siglock);
1462 1464
1463 pid = task_pid_vnr(p); 1465 pid = task_pid_vnr(p);
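
Both wait_task_stopped() and wait_task_continued() now fetch the uid through task_uid() rather than dereferencing __task_cred() directly under the siglock comment. task_uid() takes its own RCU read lock, roughly equivalent to this approximation of the task_cred_xxx() helper in include/linux/cred.h:

	#include <linux/cred.h>
	#include <linux/sched.h>

	static inline uid_t example_task_uid(struct task_struct *p)
	{
		uid_t uid;

		rcu_read_lock();
		uid = __task_cred(p)->uid;
		rcu_read_unlock();

		return uid;
	}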
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..b7e9d60a675d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -300,7 +300,7 @@ out:
300#ifdef CONFIG_MMU 300#ifdef CONFIG_MMU
301static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 301static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
302{ 302{
303 struct vm_area_struct *mpnt, *tmp, **pprev; 303 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
304 struct rb_node **rb_link, *rb_parent; 304 struct rb_node **rb_link, *rb_parent;
305 int retval; 305 int retval;
306 unsigned long charge; 306 unsigned long charge;
@@ -328,6 +328,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (retval) 328 if (retval)
329 goto out; 329 goto out;
330 330
331 prev = NULL;
331 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 332 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
332 struct file *file; 333 struct file *file;
333 334
@@ -359,7 +360,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
359 goto fail_nomem_anon_vma_fork; 360 goto fail_nomem_anon_vma_fork;
360 tmp->vm_flags &= ~VM_LOCKED; 361 tmp->vm_flags &= ~VM_LOCKED;
361 tmp->vm_mm = mm; 362 tmp->vm_mm = mm;
362 tmp->vm_next = NULL; 363 tmp->vm_next = tmp->vm_prev = NULL;
363 file = tmp->vm_file; 364 file = tmp->vm_file;
364 if (file) { 365 if (file) {
365 struct inode *inode = file->f_path.dentry->d_inode; 366 struct inode *inode = file->f_path.dentry->d_inode;
@@ -392,6 +393,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
392 */ 393 */
393 *pprev = tmp; 394 *pprev = tmp;
394 pprev = &tmp->vm_next; 395 pprev = &tmp->vm_next;
396 tmp->vm_prev = prev;
397 prev = tmp;
395 398
396 __vma_link_rb(mm, tmp, rb_link, rb_parent); 399 __vma_link_rb(mm, tmp, rb_link, rb_parent);
397 rb_link = &tmp->vm_rb.rb_right; 400 rb_link = &tmp->vm_rb.rb_right;
@@ -752,13 +755,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
752 struct fs_struct *fs = current->fs; 755 struct fs_struct *fs = current->fs;
753 if (clone_flags & CLONE_FS) { 756 if (clone_flags & CLONE_FS) {
754 /* tsk->fs is already what we want */ 757 /* tsk->fs is already what we want */
755 write_lock(&fs->lock); 758 spin_lock(&fs->lock);
756 if (fs->in_exec) { 759 if (fs->in_exec) {
757 write_unlock(&fs->lock); 760 spin_unlock(&fs->lock);
758 return -EAGAIN; 761 return -EAGAIN;
759 } 762 }
760 fs->users++; 763 fs->users++;
761 write_unlock(&fs->lock); 764 spin_unlock(&fs->lock);
762 return 0; 765 return 0;
763 } 766 }
764 tsk->fs = copy_fs_struct(fs); 767 tsk->fs = copy_fs_struct(fs);
@@ -899,6 +902,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
899 tty_audit_fork(sig); 902 tty_audit_fork(sig);
900 903
901 sig->oom_adj = current->signal->oom_adj; 904 sig->oom_adj = current->signal->oom_adj;
905 sig->oom_score_adj = current->signal->oom_score_adj;
902 906
903 return 0; 907 return 0;
904} 908}
@@ -907,7 +911,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
907{ 911{
908 unsigned long new_flags = p->flags; 912 unsigned long new_flags = p->flags;
909 913
910 new_flags &= ~PF_SUPERPRIV; 914 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
911 new_flags |= PF_FORKNOEXEC; 915 new_flags |= PF_FORKNOEXEC;
912 new_flags |= PF_STARTING; 916 new_flags |= PF_STARTING;
913 p->flags = new_flags; 917 p->flags = new_flags;
@@ -1675,13 +1679,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1675 1679
1676 if (new_fs) { 1680 if (new_fs) {
1677 fs = current->fs; 1681 fs = current->fs;
1678 write_lock(&fs->lock); 1682 spin_lock(&fs->lock);
1679 current->fs = new_fs; 1683 current->fs = new_fs;
1680 if (--fs->users) 1684 if (--fs->users)
1681 new_fs = NULL; 1685 new_fs = NULL;
1682 else 1686 else
1683 new_fs = fs; 1687 new_fs = fs;
1684 write_unlock(&fs->lock); 1688 spin_unlock(&fs->lock);
1685 } 1689 }
1686 1690
1687 if (new_mm) { 1691 if (new_mm) {
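
dup_mmap() now also fills in the new vm_prev pointer, so the child's VMA list is doubly linked just like the parent's. Illustrative only, a debugging-style walk that the back-link makes possible:

	#include <linux/mm_types.h>
	#include <linux/kernel.h>

	static void example_dump_vmas_backwards(struct mm_struct *mm)
	{
		struct vm_area_struct *vma = mm->mmap;

		/* find the last VMA ... */
		while (vma && vma->vm_next)
			vma = vma->vm_next;

		/* ... then walk back to the first using vm_prev */
		for (; vma; vma = vma->vm_prev)
			printk(KERN_DEBUG "vma %08lx-%08lx\n", vma->vm_start, vma->vm_end);
	}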
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..ce669174f355 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
96 xtim = timespec_to_ktime(xts); 96 xtim = timespec_to_ktime(xts);
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
144static int hrtimer_get_target(int this_cpu, int pinned) 144static int hrtimer_get_target(int this_cpu, int pinned)
145{ 145{
146#ifdef CONFIG_NO_HZ 146#ifdef CONFIG_NO_HZ
147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { 147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
148 int preferred_cpu = get_nohz_load_balancer(); 148 return get_nohz_timer_target();
149
150 if (preferred_cpu >= 0)
151 return preferred_cpu;
152 }
153#endif 149#endif
154 return this_cpu; 150 return this_cpu;
155} 151}
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
612static void retrigger_next_event(void *arg) 608static void retrigger_next_event(void *arg)
613{ 609{
614 struct hrtimer_cpu_base *base; 610 struct hrtimer_cpu_base *base;
615 struct timespec realtime_offset; 611 struct timespec realtime_offset, wtm;
616 unsigned long seq; 612 unsigned long seq;
617 613
618 if (!hrtimer_hres_active()) 614 if (!hrtimer_hres_active())
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg)
620 616
621 do { 617 do {
622 seq = read_seqbegin(&xtime_lock); 618 seq = read_seqbegin(&xtime_lock);
623 set_normalized_timespec(&realtime_offset, 619 wtm = __get_wall_to_monotonic();
624 -wall_to_monotonic.tv_sec,
625 -wall_to_monotonic.tv_nsec);
626 } while (read_seqretry(&xtime_lock, seq)); 620 } while (read_seqretry(&xtime_lock, seq));
621 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
627 622
628 base = &__get_cpu_var(hrtimer_bases); 623 base = &__get_cpu_var(hrtimer_bases);
629 624
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..d71a987fd2bf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/list.h>
44#include <linux/cpu.h> 45#include <linux/cpu.h>
45#include <linux/smp.h> 46#include <linux/smp.h>
46 47
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62 63
63static int nr_slots[TYPE_MAX]; 64static int nr_slots[TYPE_MAX];
64 65
66/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head);
68
65static int constraints_initialized; 69static int constraints_initialized;
66 70
67/* Gather the number of total pinned and un-pinned bp in a cpuset */ 71/* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
103 return 0; 107 return 0;
104} 108}
105 109
-static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
+/*
+ * Count the number of breakpoints of the same type and same task.
+ * The given event must be not on the list.
+ */
+static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
 {
-	struct perf_event_context *ctx = tsk->perf_event_ctxp;
-	struct list_head *list;
-	struct perf_event *bp;
-	unsigned long flags;
+	struct perf_event_context *ctx = bp->ctx;
+	struct perf_event *iter;
 	int count = 0;
 
-	if (WARN_ONCE(!ctx, "No perf context for this task"))
-		return 0;
-
-	list = &ctx->event_list;
-
-	raw_spin_lock_irqsave(&ctx->lock, flags);
-
-	/*
-	 * The current breakpoint counter is not included in the list
-	 * at the open() callback time
-	 */
-	list_for_each_entry(bp, list, event_entry) {
-		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
-			if (find_slot_idx(bp) == type)
-				count += hw_breakpoint_weight(bp);
+	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
+		if (iter->ctx == ctx && find_slot_idx(iter) == type)
+			count += hw_breakpoint_weight(iter);
 	}
 
-	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-
 	return count;
 }
135 127
@@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
149 if (!tsk) 141 if (!tsk)
150 slots->pinned += max_task_bp_pinned(cpu, type); 142 slots->pinned += max_task_bp_pinned(cpu, type);
151 else 143 else
152 slots->pinned += task_bp_pinned(tsk, type); 144 slots->pinned += task_bp_pinned(bp, type);
153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
154 146
155 return; 147 return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
162 if (!tsk) 154 if (!tsk)
163 nr += max_task_bp_pinned(cpu, type); 155 nr += max_task_bp_pinned(cpu, type);
164 else 156 else
165 nr += task_bp_pinned(tsk, type); 157 nr += task_bp_pinned(bp, type);
166 158
167 if (nr > slots->pinned) 159 if (nr > slots->pinned)
168 slots->pinned = nr; 160 slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
188/* 180/*
189 * Add a pinned breakpoint for the given task in our constraint table 181 * Add a pinned breakpoint for the given task in our constraint table
190 */ 182 */
191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, 183static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
192 enum bp_type_idx type, int weight) 184 enum bp_type_idx type, int weight)
193{ 185{
194 unsigned int *tsk_pinned; 186 unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
196 int old_idx = 0; 188 int old_idx = 0;
197 int idx = 0; 189 int idx = 0;
198 190
199 old_count = task_bp_pinned(tsk, type); 191 old_count = task_bp_pinned(bp, type);
200 old_idx = old_count - 1; 192 old_idx = old_count - 1;
201 idx = old_idx + weight; 193 idx = old_idx + weight;
202 194
195 /* tsk_pinned[n] is the number of tasks having n breakpoints */
203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); 196 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
204 if (enable) { 197 if (enable) {
205 tsk_pinned[idx]++; 198 tsk_pinned[idx]++;
@@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
222 int cpu = bp->cpu; 215 int cpu = bp->cpu;
223 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->ctx->task;
224 217
218 /* Pinned counter cpu profiling */
219 if (!tsk) {
220
221 if (enable)
222 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
223 else
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
225 return;
226 }
227
225 /* Pinned counter task profiling */ 228 /* Pinned counter task profiling */
226 if (tsk) {
227 if (cpu >= 0) {
228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
229 return;
230 }
231 229
230 if (!enable)
231 list_del(&bp->hw.bp_list);
232
233 if (cpu >= 0) {
234 toggle_bp_task_slot(bp, cpu, enable, type, weight);
235 } else {
232 for_each_online_cpu(cpu) 236 for_each_online_cpu(cpu)
233 toggle_bp_task_slot(tsk, cpu, enable, type, weight); 237 toggle_bp_task_slot(bp, cpu, enable, type, weight);
234 return;
235 } 238 }
236 239
237 /* Pinned counter cpu profiling */
238 if (enable) 240 if (enable)
239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 241 list_add_tail(&bp->hw.bp_list, &bp_task_head);
240 else 242}
241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; 243
244/*
245 * Function to perform processor-specific cleanup during unregistration
246 */
247__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
248{
249 /*
250 * A weak stub function here for those archs that don't define
251 * it inside arch/.../kernel/hw_breakpoint.c
252 */
242} 253}
243 254
244/* 255/*
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
301 weight = hw_breakpoint_weight(bp); 312 weight = hw_breakpoint_weight(bp);
302 313
303 fetch_bp_busy_slots(&slots, bp, type); 314 fetch_bp_busy_slots(&slots, bp, type);
315 /*
316 * Simulate the addition of this breakpoint to the constraints
317 * and see the result.
318 */
304 fetch_this_slot(&slots, weight); 319 fetch_this_slot(&slots, weight);
305 320
306 /* Flexible counters need to keep at least one slot */ 321 /* Flexible counters need to keep at least one slot */
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
339{ 354{
340 mutex_lock(&nr_bp_mutex); 355 mutex_lock(&nr_bp_mutex);
341 356
357 arch_unregister_hw_breakpoint(bp);
342 __release_bp_slot(bp); 358 __release_bp_slot(bp);
343 359
344 mutex_unlock(&nr_bp_mutex); 360 mutex_unlock(&nr_bp_mutex);
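
The hw_breakpoint changes above replace the walk of the per-task perf event list with a global bp_task_head list keyed by the breakpoint's perf_event_context. Below is a hedged, illustrative sketch of that bookkeeping pattern; the struct and helper names are invented for the example, and only the list.h calls mirror the patch:

#include <linux/list.h>

struct bp_like {
        void *ctx;                      /* owning context, the grouping key */
        int weight;                     /* slots consumed by this entry */
        struct list_head node;          /* linkage on the global list */
};

static LIST_HEAD(registry);

/* count registered entries sharing @ctx, as task_bp_pinned() now does */
static int count_for_ctx(void *ctx)
{
        struct bp_like *iter;
        int count = 0;

        list_for_each_entry(iter, &registry, node) {
                if (iter->ctx == ctx)
                        count += iter->weight;
        }
        return count;
}

static void register_entry(struct bp_like *bp)
{
        list_add_tail(&bp->node, &registry);    /* added after accounting, as above */
}

static void unregister_entry(struct bp_like *bp)
{
        list_del(&bp->node);                    /* removed before accounting, as above */
}
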
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..c3003e9d91a3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
217{ 217{
218 if (suspend) { 218 if (suspend) {
219 if (!desc->action || (desc->action->flags & IRQF_TIMER)) 219 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
220 return; 220 return;
221 desc->status |= IRQ_SUSPENDED; 221 desc->status |= IRQ_SUSPENDED;
222 } 222 }
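
The one-line change above makes the suspend path key off IRQF_NO_SUSPEND instead of IRQF_TIMER. A hedged example of a driver requesting an interrupt that must stay armed across suspend; the IRQ number, handler and cookie are placeholders, not taken from the patch:

#include <linux/interrupt.h>

static irqreturn_t wake_src_handler(int irq, void *dev_id)
{
        /* a real driver would acknowledge its device here */
        return IRQ_HANDLED;
}

static int wake_src_setup(unsigned int irq, void *cookie)
{
        /* __disable_irq() now leaves IRQF_NO_SUSPEND lines enabled on suspend */
        return request_irq(irq, wake_src_handler, IRQF_NO_SUSPEND,
                           "wake-src", cookie);
}
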
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 131b1703936f..c0613f7d6730 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
151 image->nr_segments = nr_segments; 151 image->nr_segments = nr_segments;
152 segment_bytes = nr_segments * sizeof(*segments); 152 segment_bytes = nr_segments * sizeof(*segments);
153 result = copy_from_user(image->segment, segments, segment_bytes); 153 result = copy_from_user(image->segment, segments, segment_bytes);
154 if (result) 154 if (result) {
155 result = -EFAULT;
155 goto out; 156 goto out;
157 }
156 158
157 /* 159 /*
158 * Verify we have good destination addresses. The caller is 160 * Verify we have good destination addresses. The caller is
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image,
827 result = copy_from_user(ptr, buf, uchunk); 829 result = copy_from_user(ptr, buf, uchunk);
828 kunmap(page); 830 kunmap(page);
829 if (result) { 831 if (result) {
830 result = (result < 0) ? result : -EIO; 832 result = -EFAULT;
831 goto out; 833 goto out;
832 } 834 }
833 ubytes -= uchunk; 835 ubytes -= uchunk;
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image,
882 kexec_flush_icache_page(page); 884 kexec_flush_icache_page(page);
883 kunmap(page); 885 kunmap(page);
884 if (result) { 886 if (result) {
885 result = (result < 0) ? result : -EIO; 887 result = -EFAULT;
886 goto out; 888 goto out;
887 } 889 }
888 ubytes -= uchunk; 890 ubytes -= uchunk;
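
All three kexec hunks converge on the same convention: copy_from_user() returns the number of bytes it failed to copy, never a negative errno, so any nonzero result is mapped to -EFAULT. A minimal sketch of that convention (the helper name is illustrative):

#include <linux/uaccess.h>
#include <linux/errno.h>

static int pull_from_user(void *dst, const void __user *src, size_t len)
{
        if (copy_from_user(dst, src, len))
                return -EFAULT;         /* partial copy: a fault, not -EIO */
        return 0;
}
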
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 35edbe22e9a9..6b5580c57644 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * A generic kernel FIFO implementation. 2 * A generic kernel FIFO implementation
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> 4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
6 * 5 *
7 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -11,7 +10,7 @@
11 * 10 *
12 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details. 14 * GNU General Public License for more details.
16 * 15 *
17 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
@@ -24,422 +23,588 @@
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/err.h> 25#include <linux/err.h>
27#include <linux/kfifo.h>
28#include <linux/log2.h> 26#include <linux/log2.h>
29#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
30 29
31static void _kfifo_init(struct kfifo *fifo, void *buffer, 30/*
32 unsigned int size) 31 * internal helper to calculate the unused elements in a fifo
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
39
40/**
41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
43 * @buffer: the preallocated buffer to be used.
44 * @size: the size of the internal buffer, this has to be a power of 2.
45 *
46 */ 32 */
47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) 33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
48{ 34{
49 /* size must be a power of 2 */ 35 return (fifo->mask + 1) - (fifo->in - fifo->out);
50 BUG_ON(!is_power_of_2(size));
51
52 _kfifo_init(fifo, buffer, size);
53} 36}
54EXPORT_SYMBOL(kfifo_init);
55 37
56/** 38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
57 * kfifo_alloc - allocates a new FIFO internal buffer 39 size_t esize, gfp_t gfp_mask)
58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
61 *
62 * This function dynamically allocates a new fifo internal buffer
63 *
64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */
68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 40{
70 unsigned char *buffer;
71
72 /* 41 /*
73 * round up to the next power of 2, since our 'let the indices 42 * round down to the next power of 2, since our 'let the indices
74 * wrap' technique works only in this case. 43 * wrap' technique works only in this case.
75 */ 44 */
76 if (!is_power_of_2(size)) { 45 if (!is_power_of_2(size))
77 BUG_ON(size > 0x80000000); 46 size = rounddown_pow_of_two(size);
78 size = roundup_pow_of_two(size); 47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
79 } 56 }
80 57
81 buffer = kmalloc(size, gfp_mask); 58 fifo->data = kmalloc(size * esize, gfp_mask);
82 if (!buffer) { 59
83 _kfifo_init(fifo, NULL, 0); 60 if (!fifo->data) {
61 fifo->mask = 0;
84 return -ENOMEM; 62 return -ENOMEM;
85 } 63 }
86 64 fifo->mask = size - 1;
87 _kfifo_init(fifo, buffer, size);
88 65
89 return 0; 66 return 0;
90} 67}
91EXPORT_SYMBOL(kfifo_alloc); 68EXPORT_SYMBOL(__kfifo_alloc);
92 69
93/** 70void __kfifo_free(struct __kfifo *fifo)
94 * kfifo_free - frees the FIFO internal buffer
95 * @fifo: the fifo to be freed.
96 */
97void kfifo_free(struct kfifo *fifo)
98{ 71{
99 kfree(fifo->buffer); 72 kfree(fifo->data);
100 _kfifo_init(fifo, NULL, 0); 73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
101} 78}
102EXPORT_SYMBOL(kfifo_free); 79EXPORT_SYMBOL(__kfifo_free);
103 80
104/** 81int __kfifo_init(struct __kfifo *fifo, void *buffer,
105 * kfifo_skip - skip output data 82 unsigned int size, size_t esize)
106 * @fifo: the fifo to be used.
107 * @len: number of bytes to skip
108 */
109void kfifo_skip(struct kfifo *fifo, unsigned int len)
110{ 83{
111 if (len < kfifo_len(fifo)) { 84 size /= esize;
112 __kfifo_add_out(fifo, len); 85
113 return; 86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
114 } 97 }
115 kfifo_reset_out(fifo); 98 fifo->mask = size - 1;
99
100 return 0;
116} 101}
117EXPORT_SYMBOL(kfifo_skip); 102EXPORT_SYMBOL(__kfifo_init);
118 103
119static inline void __kfifo_in_data(struct kfifo *fifo, 104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
120 const void *from, unsigned int len, unsigned int off) 105 unsigned int len, unsigned int off)
121{ 106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
122 unsigned int l; 109 unsigned int l;
123 110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
124 /* 121 /*
125 * Ensure that we sample the fifo->out index -before- we 122 * make sure that the data in the fifo is up to date before
126 * start putting bytes into the kfifo. 123 * incrementing the fifo->in index counter
127 */ 124 */
125 smp_wmb();
126}
128 127
129 smp_mb(); 128unsigned int __kfifo_in(struct __kfifo *fifo,
130 129 const void *buf, unsigned int len)
131 off = __kfifo_off(fifo, fifo->in + off); 130{
131 unsigned int l;
132 132
133 /* first put the data starting from fifo->in to buffer end */ 133 l = kfifo_unused(fifo);
134 l = min(len, fifo->size - off); 134 if (len > l)
135 memcpy(fifo->buffer + off, from, l); 135 len = l;
136 136
137 /* then put the rest (if any) at the beginning of the buffer */ 137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 memcpy(fifo->buffer, from + l, len - l); 138 fifo->in += len;
139 return len;
139} 140}
141EXPORT_SYMBOL(__kfifo_in);
140 142
141static inline void __kfifo_out_data(struct kfifo *fifo, 143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
142 void *to, unsigned int len, unsigned int off) 144 unsigned int len, unsigned int off)
143{ 145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
144 unsigned int l; 148 unsigned int l;
145 149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
146 /* 160 /*
147 * Ensure that we sample the fifo->in index -before- we 161 * make sure that the data is copied before
148 * start removing bytes from the kfifo. 162 * incrementing the fifo->out index counter
149 */ 163 */
164 smp_wmb();
165}
150 166
151 smp_rmb(); 167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
152 171
153 off = __kfifo_off(fifo, fifo->out + off); 172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
154 175
155 /* first get the data from fifo->out until the end of the buffer */ 176 kfifo_copy_out(fifo, buf, len, fifo->out);
156 l = min(len, fifo->size - off); 177 return len;
157 memcpy(to, fifo->buffer + off, l); 178}
179EXPORT_SYMBOL(__kfifo_out_peek);
158 180
159 /* then get the rest (if any) from the beginning of the buffer */ 181unsigned int __kfifo_out(struct __kfifo *fifo,
160 memcpy(to + l, fifo->buffer, len - l); 182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
161} 187}
188EXPORT_SYMBOL(__kfifo_out);
162 189
163static inline int __kfifo_from_user_data(struct kfifo *fifo, 190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off, 191 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout) 192 unsigned int *copied)
166{ 193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
167 unsigned int l; 196 unsigned int l;
168 int ret; 197 unsigned long ret;
169 198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
170 /* 215 /*
171 * Ensure that we sample the fifo->out index -before- we 216 * make sure that the data in the fifo is up to date before
172 * start putting bytes into the kfifo. 217 * incrementing the fifo->in index counter
173 */ 218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
174 224
175 smp_mb(); 225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
176 232
177 off = __kfifo_off(fifo, fifo->in + off); 233 if (esize != 1)
234 len /= esize;
178 235
179 /* first put the data starting from fifo->in to buffer end */ 236 l = kfifo_unused(fifo);
180 l = min(len, fifo->size - off); 237 if (len > l)
181 ret = copy_from_user(fifo->buffer + off, from, l); 238 len = l;
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
187 239
188 /* then put the rest (if any) at the beginning of the buffer */ 240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
189 ret = copy_from_user(fifo->buffer, from + l, len - l); 241 if (unlikely(ret)) {
190 *lenout += ret ? ret : len - l; 242 len -= ret;
191 return ret ? -EFAULT : 0; 243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
192} 248}
249EXPORT_SYMBOL(__kfifo_from_user);
193 250
194static inline int __kfifo_to_user_data(struct kfifo *fifo, 251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout) 252 unsigned int len, unsigned int off, unsigned int *copied)
196{ 253{
197 unsigned int l; 254 unsigned int l;
198 int ret; 255 unsigned long ret;
199 256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
200 /* 275 /*
201 * Ensure that we sample the fifo->in index -before- we 276 * make sure that the data is copied before
202 * start removing bytes from the kfifo. 277 * incrementing the fifo->out index counter
203 */ 278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
204 284
205 smp_rmb(); 285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
206 292
207 off = __kfifo_off(fifo, fifo->out + off); 293 if (esize != 1)
294 len /= esize;
208 295
209 /* first get the data from fifo->out until the end of the buffer */ 296 l = fifo->in - fifo->out;
210 l = min(len, fifo->size - off); 297 if (len > l)
211 ret = copy_to_user(to, fifo->buffer + off, l); 298 len = l;
212 *lenout = l; 299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
213 if (unlikely(ret)) { 300 if (unlikely(ret)) {
214 *lenout -= ret; 301 len -= ret;
215 return -EFAULT; 302 err = -EFAULT;
216 } 303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
217 309
218 /* then get the rest (if any) from the beginning of the buffer */ 310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
219 len -= l; 311 int nents, unsigned int len)
220 ret = copy_to_user(to + l, fifo->buffer, len); 312{
221 if (unlikely(ret)) { 313 int n;
222 *lenout += len - ret; 314 unsigned int l;
223 return -EFAULT; 315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
224 } 344 }
225 *lenout += len; 345 sg_set_page(sgl, page, len, off);
226 return 0; 346 return n + 1;
227} 347}
228 348
229unsigned int __kfifo_in_n(struct kfifo *fifo, 349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
230 const void *from, unsigned int len, unsigned int recsize) 350 int nents, unsigned int len, unsigned int off)
231{ 351{
232 if (kfifo_avail(fifo) < len + recsize) 352 unsigned int size = fifo->mask + 1;
233 return len + 1; 353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
234 356
235 __kfifo_in_data(fifo, from, len, recsize); 357 off &= fifo->mask;
236 return 0; 358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 if (n)
369 sg_mark_end(sgl + n - 1);
370 return n;
237} 371}
238EXPORT_SYMBOL(__kfifo_in_n);
239 372
240/** 373unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
241 * kfifo_in - puts some data into the FIFO 374 struct scatterlist *sgl, int nents, unsigned int len)
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{ 375{
256 len = min(kfifo_avail(fifo), len); 376 unsigned int l;
257 377
258 __kfifo_in_data(fifo, from, len, 0); 378 l = kfifo_unused(fifo);
259 __kfifo_add_in(fifo, len); 379 if (len > l)
260 return len; 380 len = l;
381
382 return setup_sgl(fifo, sgl, nents, len, fifo->in);
261} 383}
262EXPORT_SYMBOL(kfifo_in); 384EXPORT_SYMBOL(__kfifo_dma_in_prepare);
263 385
264unsigned int __kfifo_in_generic(struct kfifo *fifo, 386unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize) 387 struct scatterlist *sgl, int nents, unsigned int len)
266{ 388{
267 return __kfifo_in_rec(fifo, from, len, recsize); 389 unsigned int l;
390
391 l = fifo->in - fifo->out;
392 if (len > l)
393 len = l;
394
395 return setup_sgl(fifo, sgl, nents, len, fifo->out);
268} 396}
269EXPORT_SYMBOL(__kfifo_in_generic); 397EXPORT_SYMBOL(__kfifo_dma_out_prepare);
270 398
271unsigned int __kfifo_out_n(struct kfifo *fifo, 399unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
272 void *to, unsigned int len, unsigned int recsize)
273{ 400{
274 if (kfifo_len(fifo) < len + recsize) 401 unsigned int max = (1 << (recsize << 3)) - 1;
275 return len;
276 402
277 __kfifo_out_data(fifo, to, len, recsize); 403 if (len > max)
278 __kfifo_add_out(fifo, len + recsize); 404 return max;
279 return 0; 405 return len;
280} 406}
281EXPORT_SYMBOL(__kfifo_out_n);
282 407
283/** 408#define __KFIFO_PEEK(data, out, mask) \
284 * kfifo_out - gets some data from the FIFO 409 ((data)[(out) & (mask)])
285 * @fifo: the fifo to be used. 410/*
286 * @to: where the data must be copied. 411 * __kfifo_peek_n internal helper function for determining the length of
287 * @len: the size of the destination buffer. 412 * the next record in the fifo
288 *
289 * This function copies at most @len bytes from the FIFO into the
290 * @to buffer and returns the number of copied bytes.
291 *
292 * Note that with only one concurrent reader and one concurrent
293 * writer, you don't need extra locking to use these functions.
294 */ 413 */
295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) 414static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
296{ 415{
297 len = min(kfifo_len(fifo), len); 416 unsigned int l;
417 unsigned int mask = fifo->mask;
418 unsigned char *data = fifo->data;
298 419
299 __kfifo_out_data(fifo, to, len, 0); 420 l = __KFIFO_PEEK(data, fifo->out, mask);
300 __kfifo_add_out(fifo, len);
301 421
302 return len; 422 if (--recsize)
423 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
424
425 return l;
303} 426}
304EXPORT_SYMBOL(kfifo_out);
305 427
306/** 428#define __KFIFO_POKE(data, in, mask, val) \
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it 429 ( \
308 * @fifo: the fifo to be used. 430 (data)[(in) & (mask)] = (unsigned char)(val) \
309 * @to: where the data must be copied. 431 )
310 * @len: the size of the destination buffer. 432
311 * @offset: offset into the fifo 433/*
312 * 434 * __kfifo_poke_n internal helper function for storing the length of
313 * This function copies at most @len bytes at @offset from the FIFO 435 * the record into the fifo
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */ 436 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, 437static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
318 unsigned offset)
319{ 438{
320 len = min(kfifo_len(fifo), len + offset); 439 unsigned int mask = fifo->mask;
440 unsigned char *data = fifo->data;
321 441
322 __kfifo_out_data(fifo, to, len, offset); 442 __KFIFO_POKE(data, fifo->in, mask, n);
323 return len; 443
444 if (recsize > 1)
445 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
324} 446}
325EXPORT_SYMBOL(kfifo_out_peek);
326 447
327unsigned int __kfifo_out_generic(struct kfifo *fifo, 448unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{ 449{
331 return __kfifo_out_rec(fifo, to, len, recsize, total); 450 return __kfifo_peek_n(fifo, recsize);
332} 451}
333EXPORT_SYMBOL(__kfifo_out_generic); 452EXPORT_SYMBOL(__kfifo_len_r);
334 453
335unsigned int __kfifo_from_user_n(struct kfifo *fifo, 454unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
336 const void __user *from, unsigned int len, unsigned int recsize) 455 unsigned int len, size_t recsize)
337{ 456{
338 unsigned total; 457 if (len + recsize > kfifo_unused(fifo))
458 return 0;
339 459
340 if (kfifo_avail(fifo) < len + recsize) 460 __kfifo_poke_n(fifo, len, recsize);
341 return len + 1;
342 461
343 __kfifo_from_user_data(fifo, from, len, recsize, &total); 462 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
344 return total; 463 fifo->in += len + recsize;
464 return len;
345} 465}
346EXPORT_SYMBOL(__kfifo_from_user_n); 466EXPORT_SYMBOL(__kfifo_in_r);
347 467
348/** 468static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
349 * kfifo_from_user - puts some data from user space into the FIFO 469 void *buf, unsigned int len, size_t recsize, unsigned int *n)
350 * @fifo: the fifo to be used. 470{
351 * @from: pointer to the data to be added. 471 *n = __kfifo_peek_n(fifo, recsize);
352 * @len: the length of the data to be added. 472
353 * @total: the actual returned data length. 473 if (len > *n)
354 * 474 len = *n;
355 * This function copies at most @len bytes from the @from into the 475
356 * FIFO depending and returns -EFAULT/0. 476 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
357 * 477 return len;
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
371} 478}
372EXPORT_SYMBOL(kfifo_from_user);
373 479
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo, 480unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
375 const void __user *from, unsigned int len, unsigned int recsize) 481 unsigned int len, size_t recsize)
376{ 482{
377 return __kfifo_from_user_rec(fifo, from, len, recsize); 483 unsigned int n;
484
485 if (fifo->in == fifo->out)
486 return 0;
487
488 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
378} 489}
379EXPORT_SYMBOL(__kfifo_from_user_generic); 490EXPORT_SYMBOL(__kfifo_out_peek_r);
380 491
381unsigned int __kfifo_to_user_n(struct kfifo *fifo, 492unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
382 void __user *to, unsigned int len, unsigned int reclen, 493 unsigned int len, size_t recsize)
383 unsigned int recsize)
384{ 494{
385 unsigned int ret, total; 495 unsigned int n;
386 496
387 if (kfifo_len(fifo) < reclen + recsize) 497 if (fifo->in == fifo->out)
388 return len; 498 return 0;
389 499
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); 500 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
501 fifo->out += n + recsize;
502 return len;
503}
504EXPORT_SYMBOL(__kfifo_out_r);
391 505
392 if (likely(ret == 0)) 506void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
393 __kfifo_add_out(fifo, reclen + recsize); 507{
508 unsigned int n;
394 509
395 return total; 510 n = __kfifo_peek_n(fifo, recsize);
511 fifo->out += n + recsize;
396} 512}
397EXPORT_SYMBOL(__kfifo_to_user_n); 513EXPORT_SYMBOL(__kfifo_skip_r);
398 514
399/** 515int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
400 * kfifo_to_user - gets data from the FIFO and write it to user space 516 unsigned long len, unsigned int *copied, size_t recsize)
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{ 517{
415 int ret; 518 unsigned long ret;
416 len = min(kfifo_len(fifo), len); 519
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); 520 len = __kfifo_max_r(len, recsize);
418 __kfifo_add_out(fifo, *lenout); 521
419 return ret; 522 if (len + recsize > kfifo_unused(fifo)) {
523 *copied = 0;
524 return 0;
525 }
526
527 __kfifo_poke_n(fifo, len, recsize);
528
529 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
530 if (unlikely(ret)) {
531 *copied = 0;
532 return -EFAULT;
533 }
534 fifo->in += len + recsize;
535 return 0;
420} 536}
421EXPORT_SYMBOL(kfifo_to_user); 537EXPORT_SYMBOL(__kfifo_from_user_r);
422 538
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo, 539int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
424 void __user *to, unsigned int len, unsigned int recsize, 540 unsigned long len, unsigned int *copied, size_t recsize)
425 unsigned int *total)
426{ 541{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total); 542 unsigned long ret;
543 unsigned int n;
544
545 if (fifo->in == fifo->out) {
546 *copied = 0;
547 return 0;
548 }
549
550 n = __kfifo_peek_n(fifo, recsize);
551 if (len > n)
552 len = n;
553
554 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
555 if (unlikely(ret)) {
556 *copied = 0;
557 return -EFAULT;
558 }
559 fifo->out += n + recsize;
560 return 0;
428} 561}
429EXPORT_SYMBOL(__kfifo_to_user_generic); 562EXPORT_SYMBOL(__kfifo_to_user_r);
430 563
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) 564unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
565 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
432{ 566{
433 if (recsize == 0) 567 if (!nents)
434 return kfifo_avail(fifo); 568 BUG();
435 569
436 return __kfifo_peek_n(fifo, recsize); 570 len = __kfifo_max_r(len, recsize);
571
572 if (len + recsize > kfifo_unused(fifo))
573 return 0;
574
575 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
437} 576}
438EXPORT_SYMBOL(__kfifo_peek_generic); 577EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
439 578
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) 579void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
580 unsigned int len, size_t recsize)
441{ 581{
442 __kfifo_skip_rec(fifo, recsize); 582 len = __kfifo_max_r(len, recsize);
583 __kfifo_poke_n(fifo, len, recsize);
584 fifo->in += len + recsize;
443} 585}
444EXPORT_SYMBOL(__kfifo_skip_generic); 586EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
445 587
588unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
589 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
590{
591 if (!nents)
592 BUG();
593
594 len = __kfifo_max_r(len, recsize);
595
596 if (len + recsize > fifo->in - fifo->out)
597 return 0;
598
599 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
600}
601EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
602
603void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
604{
605 unsigned int len;
606
607 len = __kfifo_peek_n(fifo, recsize);
608 fifo->out += len + recsize;
609}
610EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
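
The rewritten kfifo.c above exports untyped __kfifo_* helpers that the type-safe macros in <linux/kfifo.h> wrap, so callers normally never touch struct __kfifo directly. A short usage sketch of that macro layer, assuming the 2.6.36-era kfifo.h; the fifo name, element type and size are illustrative:

#include <linux/kfifo.h>

/* a statically allocated fifo of 128 unsigned chars (size must be a power of two) */
static DEFINE_KFIFO(rx_fifo, unsigned char, 128);

static unsigned int stash_bytes(const unsigned char *buf, unsigned int len)
{
        /* copies at most the free space, returns how many elements went in */
        return kfifo_in(&rx_fifo, buf, len);
}

static unsigned int fetch_bytes(unsigned char *buf, unsigned int len)
{
        /* returns the number of elements actually copied out */
        return kfifo_out(&rx_fifo, buf, len);
}
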
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6e9b19667a8d..9cd0591c96a2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data)
153 goto fail; 153 goto fail;
154 } 154 }
155 155
156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp);
157 159
158 /* Exec failed? */ 160 /* Exec failed? */
159fail: 161fail:
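
The extra casts above exist because C, unlike C++, has no implicit conversion from char ** to const char *const *, so the const-qualified kernel_execve() prototype forces them at this call site. A standalone C illustration of the rule; the function names are made up:

#include <stddef.h>

static void takes_const_vector(const char *const *argv)
{
        (void)argv;
}

static void caller(void)
{
        char *argv[] = { "modprobe", "-q", "--", "example", NULL };

        /* takes_const_vector(argv); would draw an incompatible-pointer diagnostic */
        takes_const_vector((const char *const *)argv);  /* as in the hunk above */
}
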
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
18#include <linux/freezer.h>
17#include <trace/events/sched.h> 19#include <trace/events/sched.h>
18 20
19static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
35 37
36struct kthread { 38struct kthread {
37 int should_stop; 39 int should_stop;
40 void *data;
38 struct completion exited; 41 struct completion exited;
39}; 42};
40 43
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
54} 57}
55EXPORT_SYMBOL(kthread_should_stop); 58EXPORT_SYMBOL(kthread_should_stop);
56 59
60/**
61 * kthread_data - return data value specified on kthread creation
62 * @task: kthread task in question
63 *
64 * Return the data value specified when kthread @task was created.
65 * The caller is responsible for ensuring the validity of @task when
66 * calling this function.
67 */
68void *kthread_data(struct task_struct *task)
69{
70 return to_kthread(task)->data;
71}
72
57static int kthread(void *_create) 73static int kthread(void *_create)
58{ 74{
59 /* Copy data: it's on kthread's stack */ 75 /* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
64 int ret; 80 int ret;
65 81
66 self.should_stop = 0; 82 self.should_stop = 0;
83 self.data = data;
67 init_completion(&self.exited); 84 init_completion(&self.exited);
68 current->vfork_done = &self.exited; 85 current->vfork_done = &self.exited;
69 86
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
247 264
248 return 0; 265 return 0;
249} 266}
267
268/**
269 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker
271 *
272 * This function can be used as @threadfn to kthread_create() or
273 * kthread_run() with @worker_ptr argument pointing to an initialized
274 * kthread_worker. The started kthread will process work_list until
275 * the it is stopped with kthread_stop(). A kthread can also call
276 * this function directly after extra initialization.
277 *
278 * Different kthreads can be used for the same kthread_worker as long
279 * as there's only one kthread attached to it at any given time. A
280 * kthread_worker without an attached kthread simply collects queued
281 * kthread_works.
282 */
283int kthread_worker_fn(void *worker_ptr)
284{
285 struct kthread_worker *worker = worker_ptr;
286 struct kthread_work *work;
287
288 WARN_ON(worker->task);
289 worker->task = current;
290repeat:
291 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
292
293 if (kthread_should_stop()) {
294 __set_current_state(TASK_RUNNING);
295 spin_lock_irq(&worker->lock);
296 worker->task = NULL;
297 spin_unlock_irq(&worker->lock);
298 return 0;
299 }
300
301 work = NULL;
302 spin_lock_irq(&worker->lock);
303 if (!list_empty(&worker->work_list)) {
304 work = list_first_entry(&worker->work_list,
305 struct kthread_work, node);
306 list_del_init(&work->node);
307 }
308 spin_unlock_irq(&worker->lock);
309
310 if (work) {
311 __set_current_state(TASK_RUNNING);
312 work->func(work);
313 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
314 work->done_seq = work->queue_seq;
315 smp_mb(); /* mb worker-b1 paired with flush-b0 */
316 if (atomic_read(&work->flushing))
317 wake_up_all(&work->done);
318 } else if (!freezing(current))
319 schedule();
320
321 try_to_freeze();
322 goto repeat;
323}
324EXPORT_SYMBOL_GPL(kthread_worker_fn);
325
326/**
327 * queue_kthread_work - queue a kthread_work
328 * @worker: target kthread_worker
329 * @work: kthread_work to queue
330 *
331 * Queue @work to work processor @task for async execution. @task
332 * must have been created with kthread_worker_create(). Returns %true
333 * if @work was successfully queued, %false if it was already pending.
334 */
335bool queue_kthread_work(struct kthread_worker *worker,
336 struct kthread_work *work)
337{
338 bool ret = false;
339 unsigned long flags;
340
341 spin_lock_irqsave(&worker->lock, flags);
342 if (list_empty(&work->node)) {
343 list_add_tail(&work->node, &worker->work_list);
344 work->queue_seq++;
345 if (likely(worker->task))
346 wake_up_process(worker->task);
347 ret = true;
348 }
349 spin_unlock_irqrestore(&worker->lock, flags);
350 return ret;
351}
352EXPORT_SYMBOL_GPL(queue_kthread_work);
353
354/**
355 * flush_kthread_work - flush a kthread_work
356 * @work: work to flush
357 *
358 * If @work is queued or executing, wait for it to finish execution.
359 */
360void flush_kthread_work(struct kthread_work *work)
361{
362 int seq = work->queue_seq;
363
364 atomic_inc(&work->flushing);
365
366 /*
367 * mb flush-b0 paired with worker-b1, to make sure either
368 * worker sees the above increment or we see done_seq update.
369 */
370 smp_mb__after_atomic_inc();
371
372 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
373 wait_event(work->done, seq - work->done_seq <= 0);
374 atomic_dec(&work->flushing);
375
376 /*
377 * rmb flush-b1 paired with worker-b0, to make sure our caller
378 * sees every change made by work->func().
379 */
380 smp_mb__after_atomic_dec();
381}
382EXPORT_SYMBOL_GPL(flush_kthread_work);
383
384struct kthread_flush_work {
385 struct kthread_work work;
386 struct completion done;
387};
388
389static void kthread_flush_work_fn(struct kthread_work *work)
390{
391 struct kthread_flush_work *fwork =
392 container_of(work, struct kthread_flush_work, work);
393 complete(&fwork->done);
394}
395
396/**
397 * flush_kthread_worker - flush all current works on a kthread_worker
398 * @worker: worker to flush
399 *
400 * Wait until all currently executing or pending works on @worker are
401 * finished.
402 */
403void flush_kthread_worker(struct kthread_worker *worker)
404{
405 struct kthread_flush_work fwork = {
406 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
407 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
408 };
409
410 queue_kthread_work(worker, &fwork.work);
411 wait_for_completion(&fwork.done);
412}
413EXPORT_SYMBOL_GPL(flush_kthread_worker);
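
kthread.c gains kthread_data() and the kthread_worker machinery above. A usage sketch follows, assuming the matching declarations the same series adds to <linux/kthread.h> (init_kthread_worker(), init_kthread_work(), and friends); the worker, work and thread names are illustrative:

#include <linux/kthread.h>
#include <linux/err.h>

static struct kthread_worker my_worker;
static struct kthread_work my_work;

static void my_work_fn(struct kthread_work *work)
{
        /* runs in the dedicated worker thread's context */
}

static int start_and_use_worker(void)
{
        struct task_struct *thread;

        init_kthread_worker(&my_worker);
        init_kthread_work(&my_work, my_work_fn);

        thread = kthread_run(kthread_worker_fn, &my_worker, "my-worker");
        if (IS_ERR(thread))
                return PTR_ERR(thread);

        queue_kthread_work(&my_worker, &my_work);
        flush_kthread_work(&my_work);   /* wait for my_work_fn() to finish */

        kthread_stop(thread);           /* the worker exits once told to stop */
        return 0;
}
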
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
146 146
147static inline u64 lockstat_clock(void) 147static inline u64 lockstat_clock(void)
148{ 148{
149 return cpu_clock(smp_processor_id()); 149 return local_clock();
150} 150}
151 151
152static int lock_point(unsigned long points[], unsigned long ip) 152static int lock_point(unsigned long points[], unsigned long ip)
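
Lockstat timestamps now come from local_clock(), which reads the current CPU's sched clock with interrupts disabled internally, so the explicit smp_processor_id() plumbing goes away. A small illustrative sketch of timing a call with it, not code from the patch:

#include <linux/sched.h>

static u64 measure_ns(void (*fn)(void))
{
        u64 start = local_clock();

        fn();
        return local_clock() - start;   /* nanoseconds on this CPU's clock */
}
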
diff --git a/kernel/module.c b/kernel/module.c
index 6c562828c85c..d0b5f8db11b4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
1/* 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
@@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb)
110} 110}
111EXPORT_SYMBOL(unregister_module_notifier); 111EXPORT_SYMBOL(unregister_module_notifier);
112 112
113struct load_info {
114 Elf_Ehdr *hdr;
115 unsigned long len;
116 Elf_Shdr *sechdrs;
117 char *secstrings, *strtab;
118 unsigned long *strmap;
119 unsigned long symoffs, stroffs;
120 struct _ddebug *debug;
121 unsigned int num_debug;
122 struct {
123 unsigned int sym, str, mod, vers, info, pcpu;
124 } index;
125};
126
113/* We require a truly strong try_module_get(): 0 means failure due to 127/* We require a truly strong try_module_get(): 0 means failure due to
114 ongoing or failed initialization etc. */ 128 ongoing or failed initialization etc. */
115static inline int strong_try_module_get(struct module *mod) 129static inline int strong_try_module_get(struct module *mod)
@@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code)
140EXPORT_SYMBOL(__module_put_and_exit); 154EXPORT_SYMBOL(__module_put_and_exit);
141 155
142/* Find a module section: 0 means not found. */ 156/* Find a module section: 0 means not found. */
143static unsigned int find_sec(Elf_Ehdr *hdr, 157static unsigned int find_sec(const struct load_info *info, const char *name)
144 Elf_Shdr *sechdrs,
145 const char *secstrings,
146 const char *name)
147{ 158{
148 unsigned int i; 159 unsigned int i;
149 160
150 for (i = 1; i < hdr->e_shnum; i++) 161 for (i = 1; i < info->hdr->e_shnum; i++) {
162 Elf_Shdr *shdr = &info->sechdrs[i];
151 /* Alloc bit cleared means "ignore it." */ 163 /* Alloc bit cleared means "ignore it." */
152 if ((sechdrs[i].sh_flags & SHF_ALLOC) 164 if ((shdr->sh_flags & SHF_ALLOC)
153 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 165 && strcmp(info->secstrings + shdr->sh_name, name) == 0)
154 return i; 166 return i;
167 }
155 return 0; 168 return 0;
156} 169}
157 170
158/* Find a module section, or NULL. */ 171/* Find a module section, or NULL. */
159static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, 172static void *section_addr(const struct load_info *info, const char *name)
160 const char *secstrings, const char *name)
161{ 173{
162 /* Section 0 has sh_addr 0. */ 174 /* Section 0 has sh_addr 0. */
163 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; 175 return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
164} 176}
165 177
166/* Find a module section, or NULL. Fill in number of "objects" in section. */ 178/* Find a module section, or NULL. Fill in number of "objects" in section. */
167static void *section_objs(Elf_Ehdr *hdr, 179static void *section_objs(const struct load_info *info,
168 Elf_Shdr *sechdrs,
169 const char *secstrings,
170 const char *name, 180 const char *name,
171 size_t object_size, 181 size_t object_size,
172 unsigned int *num) 182 unsigned int *num)
173{ 183{
174 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); 184 unsigned int sec = find_sec(info, name);
175 185
176 /* Section 0 has sh_addr 0 and sh_size 0. */ 186 /* Section 0 has sh_addr 0 and sh_size 0. */
177 *num = sechdrs[sec].sh_size / object_size; 187 *num = info->sechdrs[sec].sh_size / object_size;
178 return (void *)sechdrs[sec].sh_addr; 188 return (void *)info->sechdrs[sec].sh_addr;
179} 189}
180 190
181/* Provided by the linker */ 191/* Provided by the linker */
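
The refactor above folds the ELF header, section table and string table into struct load_info, so helpers like find_sec() take a single argument. As a standalone illustration of what find_sec() computes, here is a userspace walk over an ELF file image using <elf.h>; it is not kernel code and omits the SHF_ALLOC filtering the loader applies:

#include <elf.h>
#include <string.h>

/* walk the section headers of a mapped ELF file image; 0 means "not found" */
static unsigned int find_named_section(const Elf64_Ehdr *hdr, const char *name)
{
        const Elf64_Shdr *shdrs =
                (const Elf64_Shdr *)((const char *)hdr + hdr->e_shoff);
        const char *secstrings =
                (const char *)hdr + shdrs[hdr->e_shstrndx].sh_offset;
        unsigned int i;

        for (i = 1; i < hdr->e_shnum; i++) {
                if (strcmp(secstrings + shdrs[i].sh_name, name) == 0)
                        return i;
        }
        return 0;
}
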
@@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
227 unsigned int symnum, void *data), void *data) 237 unsigned int symnum, void *data), void *data)
228{ 238{
229 struct module *mod; 239 struct module *mod;
230 const struct symsearch arr[] = { 240 static const struct symsearch arr[] = {
231 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 241 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
232 NOT_GPL_ONLY, false }, 242 NOT_GPL_ONLY, false },
233 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 243 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod,
392 mod->percpu = __alloc_reserved_percpu(size, align); 402 mod->percpu = __alloc_reserved_percpu(size, align);
393 if (!mod->percpu) { 403 if (!mod->percpu) {
394 printk(KERN_WARNING 404 printk(KERN_WARNING
395 "Could not allocate %lu bytes percpu data\n", size); 405 "%s: Could not allocate %lu bytes percpu data\n",
406 mod->name, size);
396 return -ENOMEM; 407 return -ENOMEM;
397 } 408 }
398 mod->percpu_size = size; 409 mod->percpu_size = size;
@@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod)
404 free_percpu(mod->percpu); 415 free_percpu(mod->percpu);
405} 416}
406 417
407static unsigned int find_pcpusec(Elf_Ehdr *hdr, 418static unsigned int find_pcpusec(struct load_info *info)
408 Elf_Shdr *sechdrs,
409 const char *secstrings)
410{ 419{
411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); 420 return find_sec(info, ".data..percpu");
412} 421}
413 422
414static void percpu_modcopy(struct module *mod, 423static void percpu_modcopy(struct module *mod,
@@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod,
468static inline void percpu_modfree(struct module *mod) 477static inline void percpu_modfree(struct module *mod)
469{ 478{
470} 479}
471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 480static unsigned int find_pcpusec(struct load_info *info)
472 Elf_Shdr *sechdrs,
473 const char *secstrings)
474{ 481{
475 return 0; 482 return 0;
476} 483}
@@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
524EXPORT_TRACEPOINT_SYMBOL(module_get); 531EXPORT_TRACEPOINT_SYMBOL(module_get);
525 532
526/* Init the unload section of the module. */ 533/* Init the unload section of the module. */
527static void module_unload_init(struct module *mod) 534static int module_unload_init(struct module *mod)
528{ 535{
529 int cpu; 536 mod->refptr = alloc_percpu(struct module_ref);
537 if (!mod->refptr)
538 return -ENOMEM;
530 539
531 INIT_LIST_HEAD(&mod->source_list); 540 INIT_LIST_HEAD(&mod->source_list);
532 INIT_LIST_HEAD(&mod->target_list); 541 INIT_LIST_HEAD(&mod->target_list);
533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
537 542
538 /* Hold reference count during initialization. */ 543 /* Hold reference count during initialization. */
539 __this_cpu_write(mod->refptr->incs, 1); 544 __this_cpu_write(mod->refptr->incs, 1);
540 /* Backwards compatibility macros put refcount during init. */ 545 /* Backwards compatibility macros put refcount during init. */
541 mod->waiter = current; 546 mod->waiter = current;
547
548 return 0;
542} 549}
543 550
544/* Does a already use b? */ 551/* Does a already use b? */
@@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod)
618 kfree(use); 625 kfree(use);
619 } 626 }
620 mutex_unlock(&module_mutex); 627 mutex_unlock(&module_mutex);
628
629 free_percpu(mod->refptr);
621} 630}
622 631
623#ifdef CONFIG_MODULE_FORCE_UNLOAD 632#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -891,8 +900,9 @@ int ref_module(struct module *a, struct module *b)
891} 900}
892EXPORT_SYMBOL_GPL(ref_module); 901EXPORT_SYMBOL_GPL(ref_module);
893 902
894static inline void module_unload_init(struct module *mod) 903static inline int module_unload_init(struct module *mod)
895{ 904{
905 return 0;
896} 906}
897#endif /* CONFIG_MODULE_UNLOAD */ 907#endif /* CONFIG_MODULE_UNLOAD */
898 908
@@ -1051,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1051#endif /* CONFIG_MODVERSIONS */ 1061#endif /* CONFIG_MODVERSIONS */
1052 1062
1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */ 1063/* Resolve a symbol for this module. I.e. if we find one, record usage. */
1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1064static const struct kernel_symbol *resolve_symbol(struct module *mod,
1055 unsigned int versindex, 1065 const struct load_info *info,
1056 const char *name, 1066 const char *name,
1057 struct module *mod,
1058 char ownername[]) 1067 char ownername[])
1059{ 1068{
1060 struct module *owner; 1069 struct module *owner;
@@ -1068,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1068 if (!sym) 1077 if (!sym)
1069 goto unlock; 1078 goto unlock;
1070 1079
1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { 1080 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
1081 owner)) {
1072 sym = ERR_PTR(-EINVAL); 1082 sym = ERR_PTR(-EINVAL);
1073 goto getname; 1083 goto getname;
1074 } 1084 }
@@ -1087,21 +1097,20 @@ unlock:
1087 return sym; 1097 return sym;
1088} 1098}
1089 1099
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, 1100static const struct kernel_symbol *
1091 unsigned int versindex, 1101resolve_symbol_wait(struct module *mod,
1092 const char *name, 1102 const struct load_info *info,
1093 struct module *mod) 1103 const char *name)
1094{ 1104{
1095 const struct kernel_symbol *ksym; 1105 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN]; 1106 char owner[MODULE_NAME_LEN];
1097 1107
1098 if (wait_event_interruptible_timeout(module_wq, 1108 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, 1109 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1100 mod, ownername)) || 1110 || PTR_ERR(ksym) != -EBUSY,
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) { 1111 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1112 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername); 1113 mod->name, owner);
1105 } 1114 }
1106 return ksym; 1115 return ksym;
1107} 1116}
@@ -1110,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1110 * /sys/module/foo/sections stuff 1119 * /sys/module/foo/sections stuff
1111 * J. Corbet <corbet@lwn.net> 1120 * J. Corbet <corbet@lwn.net>
1112 */ 1121 */
1113#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1122#ifdef CONFIG_SYSFS
1114 1123
1124#ifdef CONFIG_KALLSYMS
1115static inline bool sect_empty(const Elf_Shdr *sect) 1125static inline bool sect_empty(const Elf_Shdr *sect)
1116{ 1126{
1117 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1127 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1148,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1148 kfree(sect_attrs); 1158 kfree(sect_attrs);
1149} 1159}
1150 1160
1151static void add_sect_attrs(struct module *mod, unsigned int nsect, 1161static void add_sect_attrs(struct module *mod, const struct load_info *info)
1152 char *secstrings, Elf_Shdr *sechdrs)
1153{ 1162{
1154 unsigned int nloaded = 0, i, size[2]; 1163 unsigned int nloaded = 0, i, size[2];
1155 struct module_sect_attrs *sect_attrs; 1164 struct module_sect_attrs *sect_attrs;
@@ -1157,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1157 struct attribute **gattr; 1166 struct attribute **gattr;
1158 1167
1159 /* Count loaded sections and allocate structures */ 1168 /* Count loaded sections and allocate structures */
1160 for (i = 0; i < nsect; i++) 1169 for (i = 0; i < info->hdr->e_shnum; i++)
1161 if (!sect_empty(&sechdrs[i])) 1170 if (!sect_empty(&info->sechdrs[i]))
1162 nloaded++; 1171 nloaded++;
1163 size[0] = ALIGN(sizeof(*sect_attrs) 1172 size[0] = ALIGN(sizeof(*sect_attrs)
1164 + nloaded * sizeof(sect_attrs->attrs[0]), 1173 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1175,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1175 sect_attrs->nsections = 0; 1184 sect_attrs->nsections = 0;
1176 sattr = &sect_attrs->attrs[0]; 1185 sattr = &sect_attrs->attrs[0];
1177 gattr = &sect_attrs->grp.attrs[0]; 1186 gattr = &sect_attrs->grp.attrs[0];
1178 for (i = 0; i < nsect; i++) { 1187 for (i = 0; i < info->hdr->e_shnum; i++) {
1179 if (sect_empty(&sechdrs[i])) 1188 Elf_Shdr *sec = &info->sechdrs[i];
1189 if (sect_empty(sec))
1180 continue; 1190 continue;
1181 sattr->address = sechdrs[i].sh_addr; 1191 sattr->address = sec->sh_addr;
1182 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1192 sattr->name = kstrdup(info->secstrings + sec->sh_name,
1183 GFP_KERNEL); 1193 GFP_KERNEL);
1184 if (sattr->name == NULL) 1194 if (sattr->name == NULL)
1185 goto out; 1195 goto out;
@@ -1247,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1247 kfree(notes_attrs); 1257 kfree(notes_attrs);
1248} 1258}
1249 1259
1250static void add_notes_attrs(struct module *mod, unsigned int nsect, 1260static void add_notes_attrs(struct module *mod, const struct load_info *info)
1251 char *secstrings, Elf_Shdr *sechdrs)
1252{ 1261{
1253 unsigned int notes, loaded, i; 1262 unsigned int notes, loaded, i;
1254 struct module_notes_attrs *notes_attrs; 1263 struct module_notes_attrs *notes_attrs;
@@ -1260,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1260 1269
1261 /* Count notes sections and allocate structures. */ 1270 /* Count notes sections and allocate structures. */
1262 notes = 0; 1271 notes = 0;
1263 for (i = 0; i < nsect; i++) 1272 for (i = 0; i < info->hdr->e_shnum; i++)
1264 if (!sect_empty(&sechdrs[i]) && 1273 if (!sect_empty(&info->sechdrs[i]) &&
1265 (sechdrs[i].sh_type == SHT_NOTE)) 1274 (info->sechdrs[i].sh_type == SHT_NOTE))
1266 ++notes; 1275 ++notes;
1267 1276
1268 if (notes == 0) 1277 if (notes == 0)
@@ -1276,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1276 1285
1277 notes_attrs->notes = notes; 1286 notes_attrs->notes = notes;
1278 nattr = &notes_attrs->attrs[0]; 1287 nattr = &notes_attrs->attrs[0];
1279 for (loaded = i = 0; i < nsect; ++i) { 1288 for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
1280 if (sect_empty(&sechdrs[i])) 1289 if (sect_empty(&info->sechdrs[i]))
1281 continue; 1290 continue;
1282 if (sechdrs[i].sh_type == SHT_NOTE) { 1291 if (info->sechdrs[i].sh_type == SHT_NOTE) {
1283 sysfs_bin_attr_init(nattr); 1292 sysfs_bin_attr_init(nattr);
1284 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1293 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1285 nattr->attr.mode = S_IRUGO; 1294 nattr->attr.mode = S_IRUGO;
1286 nattr->size = sechdrs[i].sh_size; 1295 nattr->size = info->sechdrs[i].sh_size;
1287 nattr->private = (void *) sechdrs[i].sh_addr; 1296 nattr->private = (void *) info->sechdrs[i].sh_addr;
1288 nattr->read = module_notes_read; 1297 nattr->read = module_notes_read;
1289 ++nattr; 1298 ++nattr;
1290 } 1299 }
@@ -1315,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod)
1315 1324
1316#else 1325#else
1317 1326
1318static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1327static inline void add_sect_attrs(struct module *mod,
1319 char *sectstrings, Elf_Shdr *sechdrs) 1328 const struct load_info *info)
1320{ 1329{
1321} 1330}
1322 1331
@@ -1324,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod)
1324{ 1333{
1325} 1334}
1326 1335
1327static inline void add_notes_attrs(struct module *mod, unsigned int nsect, 1336static inline void add_notes_attrs(struct module *mod,
1328 char *sectstrings, Elf_Shdr *sechdrs) 1337 const struct load_info *info)
1329{ 1338{
1330} 1339}
1331 1340
1332static inline void remove_notes_attrs(struct module *mod) 1341static inline void remove_notes_attrs(struct module *mod)
1333{ 1342{
1334} 1343}
1335#endif 1344#endif /* CONFIG_KALLSYMS */
1336 1345
1337#ifdef CONFIG_SYSFS
1338static void add_usage_links(struct module *mod) 1346static void add_usage_links(struct module *mod)
1339{ 1347{
1340#ifdef CONFIG_MODULE_UNLOAD 1348#ifdef CONFIG_MODULE_UNLOAD
@@ -1439,6 +1447,7 @@ out:
1439} 1447}
1440 1448
1441static int mod_sysfs_setup(struct module *mod, 1449static int mod_sysfs_setup(struct module *mod,
1450 const struct load_info *info,
1442 struct kernel_param *kparam, 1451 struct kernel_param *kparam,
1443 unsigned int num_params) 1452 unsigned int num_params)
1444{ 1453{
@@ -1463,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod,
1463 goto out_unreg_param; 1472 goto out_unreg_param;
1464 1473
1465 add_usage_links(mod); 1474 add_usage_links(mod);
1475 add_sect_attrs(mod, info);
1476 add_notes_attrs(mod, info);
1466 1477
1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1478 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1468 return 0; 1479 return 0;
@@ -1479,33 +1490,26 @@ out:
1479 1490
1480static void mod_sysfs_fini(struct module *mod) 1491static void mod_sysfs_fini(struct module *mod)
1481{ 1492{
1493 remove_notes_attrs(mod);
1494 remove_sect_attrs(mod);
1482 kobject_put(&mod->mkobj.kobj); 1495 kobject_put(&mod->mkobj.kobj);
1483} 1496}
1484 1497
1485#else /* CONFIG_SYSFS */ 1498#else /* !CONFIG_SYSFS */
1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491 1499
1492static inline int mod_sysfs_setup(struct module *mod, 1500static int mod_sysfs_setup(struct module *mod,
1501 const struct load_info *info,
1493 struct kernel_param *kparam, 1502 struct kernel_param *kparam,
1494 unsigned int num_params) 1503 unsigned int num_params)
1495{ 1504{
1496 return 0; 1505 return 0;
1497} 1506}
1498 1507
1499static inline int module_add_modinfo_attrs(struct module *mod) 1508static void mod_sysfs_fini(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{ 1509{
1506} 1510}
1507 1511
1508static void mod_sysfs_fini(struct module *mod) 1512static void module_remove_modinfo_attrs(struct module *mod)
1509{ 1513{
1510} 1514}
1511 1515
@@ -1515,7 +1519,7 @@ static void del_usage_links(struct module *mod)
1515 1519
1516#endif /* CONFIG_SYSFS */ 1520#endif /* CONFIG_SYSFS */
1517 1521
1518static void mod_kobject_remove(struct module *mod) 1522static void mod_sysfs_teardown(struct module *mod)
1519{ 1523{
1520 del_usage_links(mod); 1524 del_usage_links(mod);
1521 module_remove_modinfo_attrs(mod); 1525 module_remove_modinfo_attrs(mod);
@@ -1545,9 +1549,7 @@ static void free_module(struct module *mod)
1545 mutex_lock(&module_mutex); 1549 mutex_lock(&module_mutex);
1546 stop_machine(__unlink_module, mod, NULL); 1550 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex); 1551 mutex_unlock(&module_mutex);
1548 remove_notes_attrs(mod); 1552 mod_sysfs_teardown(mod);
1549 remove_sect_attrs(mod);
1550 mod_kobject_remove(mod);
1551 1553
1552 /* Remove dynamic debug info */ 1554 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name); 1555 ddebug_remove_module(mod->name);
@@ -1565,10 +1567,7 @@ static void free_module(struct module *mod)
1565 module_free(mod, mod->module_init); 1567 module_free(mod, mod->module_init);
1566 kfree(mod->args); 1568 kfree(mod->args);
1567 percpu_modfree(mod); 1569 percpu_modfree(mod);
1568#if defined(CONFIG_MODULE_UNLOAD) 1570
1569 if (mod->refptr)
1570 free_percpu(mod->refptr);
1571#endif
1572 /* Free lock-classes: */ 1571 /* Free lock-classes: */
1573 lockdep_free_key_range(mod->module_core, mod->core_size); 1572 lockdep_free_key_range(mod->module_core, mod->core_size);
1574 1573
@@ -1634,25 +1633,23 @@ static int verify_export_symbols(struct module *mod)
1634} 1633}
1635 1634
1636/* Change all symbols so that st_value encodes the pointer directly. */ 1635/* Change all symbols so that st_value encodes the pointer directly. */
1637static int simplify_symbols(Elf_Shdr *sechdrs, 1636static int simplify_symbols(struct module *mod, const struct load_info *info)
1638 unsigned int symindex, 1637{
1639 const char *strtab, 1638 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1640 unsigned int versindex, 1639 Elf_Sym *sym = (void *)symsec->sh_addr;
1641 unsigned int pcpuindex,
1642 struct module *mod)
1643{
1644 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1645 unsigned long secbase; 1640 unsigned long secbase;
1646 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1641 unsigned int i;
1647 int ret = 0; 1642 int ret = 0;
1648 const struct kernel_symbol *ksym; 1643 const struct kernel_symbol *ksym;
1649 1644
1650 for (i = 1; i < n; i++) { 1645 for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
1646 const char *name = info->strtab + sym[i].st_name;
1647
1651 switch (sym[i].st_shndx) { 1648 switch (sym[i].st_shndx) {
1652 case SHN_COMMON: 1649 case SHN_COMMON:
1653 /* We compiled with -fno-common. These are not 1650 /* We compiled with -fno-common. These are not
1654 supposed to happen. */ 1651 supposed to happen. */
1655 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1652 DEBUGP("Common symbol: %s\n", name);
1656 printk("%s: please compile with -fno-common\n", 1653 printk("%s: please compile with -fno-common\n",
1657 mod->name); 1654 mod->name);
1658 ret = -ENOEXEC; 1655 ret = -ENOEXEC;
@@ -1665,9 +1662,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1665 break; 1662 break;
1666 1663
1667 case SHN_UNDEF: 1664 case SHN_UNDEF:
1668 ksym = resolve_symbol_wait(sechdrs, versindex, 1665 ksym = resolve_symbol_wait(mod, info, name);
1669 strtab + sym[i].st_name,
1670 mod);
1671 /* Ok if resolved. */ 1666 /* Ok if resolved. */
1672 if (ksym && !IS_ERR(ksym)) { 1667 if (ksym && !IS_ERR(ksym)) {
1673 sym[i].st_value = ksym->value; 1668 sym[i].st_value = ksym->value;
@@ -1679,17 +1674,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1679 break; 1674 break;
1680 1675
1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1676 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1682 mod->name, strtab + sym[i].st_name, 1677 mod->name, name, PTR_ERR(ksym));
1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT; 1678 ret = PTR_ERR(ksym) ?: -ENOENT;
1685 break; 1679 break;
1686 1680
1687 default: 1681 default:
1688 /* Divert to percpu allocation if a percpu var. */ 1682 /* Divert to percpu allocation if a percpu var. */
1689 if (sym[i].st_shndx == pcpuindex) 1683 if (sym[i].st_shndx == info->index.pcpu)
1690 secbase = (unsigned long)mod_percpu(mod); 1684 secbase = (unsigned long)mod_percpu(mod);
1691 else 1685 else
1692 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1686 secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
1693 sym[i].st_value += secbase; 1687 sym[i].st_value += secbase;
1694 break; 1688 break;
1695 } 1689 }
@@ -1698,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1698 return ret; 1692 return ret;
1699} 1693}
1700 1694
1695static int apply_relocations(struct module *mod, const struct load_info *info)
1696{
1697 unsigned int i;
1698 int err = 0;
1699
1700 /* Now do relocations. */
1701 for (i = 1; i < info->hdr->e_shnum; i++) {
1702 unsigned int infosec = info->sechdrs[i].sh_info;
1703
1704 /* Not a valid relocation section? */
1705 if (infosec >= info->hdr->e_shnum)
1706 continue;
1707
1708 /* Don't bother with non-allocated sections */
1709 if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
1710 continue;
1711
1712 if (info->sechdrs[i].sh_type == SHT_REL)
1713 err = apply_relocate(info->sechdrs, info->strtab,
1714 info->index.sym, i, mod);
1715 else if (info->sechdrs[i].sh_type == SHT_RELA)
1716 err = apply_relocate_add(info->sechdrs, info->strtab,
1717 info->index.sym, i, mod);
1718 if (err < 0)
1719 break;
1720 }
1721 return err;
1722}
1723
1701/* Additional bytes needed by arch in front of individual sections */ 1724/* Additional bytes needed by arch in front of individual sections */
1702unsigned int __weak arch_mod_section_prepend(struct module *mod, 1725unsigned int __weak arch_mod_section_prepend(struct module *mod,
1703 unsigned int section) 1726 unsigned int section)
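The new apply_relocations() helper above only walks the section table; the per-entry work is still delegated to the per-architecture hooks it calls. For orientation, their prototypes as declared in include/linux/moduleloader.h around this kernel version (a reference sketch, not part of this patch):

	/* Apply a SHT_REL relocation section to the (simplified) ELF; returns 0 or -errno. */
	int apply_relocate(Elf_Shdr *sechdrs, const char *strtab,
			   unsigned int symindex, unsigned int relsec,
			   struct module *mod);

	/* Same for SHT_RELA sections, which carry explicit addends. */
	int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
			       unsigned int symindex, unsigned int relsec,
			       struct module *mod);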
@@ -1722,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size,
1722 might -- code, read-only data, read-write data, small data. Tally 1745 might -- code, read-only data, read-write data, small data. Tally
1723 sizes, and place the offsets into sh_entsize fields: high bit means it 1746 sizes, and place the offsets into sh_entsize fields: high bit means it
1724 belongs in init. */ 1747 belongs in init. */
1725static void layout_sections(struct module *mod, 1748static void layout_sections(struct module *mod, struct load_info *info)
1726 const Elf_Ehdr *hdr,
1727 Elf_Shdr *sechdrs,
1728 const char *secstrings)
1729{ 1749{
1730 static unsigned long const masks[][2] = { 1750 static unsigned long const masks[][2] = {
1731 /* NOTE: all executable code must be the first section 1751 /* NOTE: all executable code must be the first section
@@ -1738,21 +1758,22 @@ static void layout_sections(struct module *mod,
1738 }; 1758 };
1739 unsigned int m, i; 1759 unsigned int m, i;
1740 1760
1741 for (i = 0; i < hdr->e_shnum; i++) 1761 for (i = 0; i < info->hdr->e_shnum; i++)
1742 sechdrs[i].sh_entsize = ~0UL; 1762 info->sechdrs[i].sh_entsize = ~0UL;
1743 1763
1744 DEBUGP("Core section allocation order:\n"); 1764 DEBUGP("Core section allocation order:\n");
1745 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1765 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1746 for (i = 0; i < hdr->e_shnum; ++i) { 1766 for (i = 0; i < info->hdr->e_shnum; ++i) {
1747 Elf_Shdr *s = &sechdrs[i]; 1767 Elf_Shdr *s = &info->sechdrs[i];
1768 const char *sname = info->secstrings + s->sh_name;
1748 1769
1749 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1770 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1750 || (s->sh_flags & masks[m][1]) 1771 || (s->sh_flags & masks[m][1])
1751 || s->sh_entsize != ~0UL 1772 || s->sh_entsize != ~0UL
1752 || strstarts(secstrings + s->sh_name, ".init")) 1773 || strstarts(sname, ".init"))
1753 continue; 1774 continue;
1754 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1775 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1755 DEBUGP("\t%s\n", secstrings + s->sh_name); 1776 DEBUGP("\t%s\n", sname);
1756 } 1777 }
1757 if (m == 0) 1778 if (m == 0)
1758 mod->core_text_size = mod->core_size; 1779 mod->core_text_size = mod->core_size;
@@ -1760,17 +1781,18 @@ static void layout_sections(struct module *mod,
1760 1781
1761 DEBUGP("Init section allocation order:\n"); 1782 DEBUGP("Init section allocation order:\n");
1762 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1783 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1763 for (i = 0; i < hdr->e_shnum; ++i) { 1784 for (i = 0; i < info->hdr->e_shnum; ++i) {
1764 Elf_Shdr *s = &sechdrs[i]; 1785 Elf_Shdr *s = &info->sechdrs[i];
1786 const char *sname = info->secstrings + s->sh_name;
1765 1787
1766 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1788 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1767 || (s->sh_flags & masks[m][1]) 1789 || (s->sh_flags & masks[m][1])
1768 || s->sh_entsize != ~0UL 1790 || s->sh_entsize != ~0UL
1769 || !strstarts(secstrings + s->sh_name, ".init")) 1791 || !strstarts(sname, ".init"))
1770 continue; 1792 continue;
1771 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1793 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1772 | INIT_OFFSET_MASK); 1794 | INIT_OFFSET_MASK);
1773 DEBUGP("\t%s\n", secstrings + s->sh_name); 1795 DEBUGP("\t%s\n", sname);
1774 } 1796 }
1775 if (m == 0) 1797 if (m == 0)
1776 mod->init_text_size = mod->init_size; 1798 mod->init_text_size = mod->init_size;
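A note on the sh_entsize trick above: layout_sections() (and layout_symtab() below) stash each section's allocated offset in sh_entsize, with the top bit (INIT_OFFSET_MASK, defined earlier in module.c as roughly 1UL << (BITS_PER_LONG - 1)) marking sections that belong in the init region. The decode side is move_module() later in this patch; condensed, under those assumptions:

	if (shdr->sh_entsize & INIT_OFFSET_MASK)
		dest = mod->module_init + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
	else
		dest = mod->module_core + shdr->sh_entsize;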
@@ -1809,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize)
1809 return string; 1831 return string;
1810} 1832}
1811 1833
1812static char *get_modinfo(Elf_Shdr *sechdrs, 1834static char *get_modinfo(struct load_info *info, const char *tag)
1813 unsigned int info,
1814 const char *tag)
1815{ 1835{
1816 char *p; 1836 char *p;
1817 unsigned int taglen = strlen(tag); 1837 unsigned int taglen = strlen(tag);
1818 unsigned long size = sechdrs[info].sh_size; 1838 Elf_Shdr *infosec = &info->sechdrs[info->index.info];
1839 unsigned long size = infosec->sh_size;
1819 1840
1820 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1841 for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
1821 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1842 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1822 return p + taglen + 1; 1843 return p + taglen + 1;
1823 } 1844 }
1824 return NULL; 1845 return NULL;
1825} 1846}
1826 1847
1827static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1848static void setup_modinfo(struct module *mod, struct load_info *info)
1828 unsigned int infoindex)
1829{ 1849{
1830 struct module_attribute *attr; 1850 struct module_attribute *attr;
1831 int i; 1851 int i;
1832 1852
1833 for (i = 0; (attr = modinfo_attrs[i]); i++) { 1853 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1834 if (attr->setup) 1854 if (attr->setup)
1835 attr->setup(mod, 1855 attr->setup(mod, get_modinfo(info, attr->attr.name));
1836 get_modinfo(sechdrs,
1837 infoindex,
1838 attr->attr.name));
1839 } 1856 }
1840} 1857}
1841 1858
@@ -1876,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value,
1876} 1893}
1877 1894
1878/* As per nm */ 1895/* As per nm */
1879static char elf_type(const Elf_Sym *sym, 1896static char elf_type(const Elf_Sym *sym, const struct load_info *info)
1880 Elf_Shdr *sechdrs,
1881 const char *secstrings,
1882 struct module *mod)
1883{ 1897{
1898 const Elf_Shdr *sechdrs = info->sechdrs;
1899
1884 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1900 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1885 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1901 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1886 return 'v'; 1902 return 'v';
@@ -1910,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym,
1910 else 1926 else
1911 return 'b'; 1927 return 'b';
1912 } 1928 }
1913 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) 1929 if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
1930 ".debug")) {
1914 return 'n'; 1931 return 'n';
1932 }
1915 return '?'; 1933 return '?';
1916} 1934}
1917 1935
@@ -1936,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1936 return true; 1954 return true;
1937} 1955}
1938 1956
1939static unsigned long layout_symtab(struct module *mod, 1957static void layout_symtab(struct module *mod, struct load_info *info)
1940 Elf_Shdr *sechdrs,
1941 unsigned int symindex,
1942 unsigned int strindex,
1943 const Elf_Ehdr *hdr,
1944 const char *secstrings,
1945 unsigned long *pstroffs,
1946 unsigned long *strmap)
1947{ 1958{
1948 unsigned long symoffs; 1959 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
1949 Elf_Shdr *symsect = sechdrs + symindex; 1960 Elf_Shdr *strsect = info->sechdrs + info->index.str;
1950 Elf_Shdr *strsect = sechdrs + strindex;
1951 const Elf_Sym *src; 1961 const Elf_Sym *src;
1952 const char *strtab;
1953 unsigned int i, nsrc, ndst; 1962 unsigned int i, nsrc, ndst;
1954 1963
1955 /* Put symbol section at end of init part of module. */ 1964 /* Put symbol section at end of init part of module. */
1956 symsect->sh_flags |= SHF_ALLOC; 1965 symsect->sh_flags |= SHF_ALLOC;
1957 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 1966 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1958 symindex) | INIT_OFFSET_MASK; 1967 info->index.sym) | INIT_OFFSET_MASK;
1959 DEBUGP("\t%s\n", secstrings + symsect->sh_name); 1968 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
1960 1969
1961 src = (void *)hdr + symsect->sh_offset; 1970 src = (void *)info->hdr + symsect->sh_offset;
1962 nsrc = symsect->sh_size / sizeof(*src); 1971 nsrc = symsect->sh_size / sizeof(*src);
1963 strtab = (void *)hdr + strsect->sh_offset;
1964 for (ndst = i = 1; i < nsrc; ++i, ++src) 1972 for (ndst = i = 1; i < nsrc; ++i, ++src)
1965 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { 1973 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
1966 unsigned int j = src->st_name; 1974 unsigned int j = src->st_name;
1967 1975
1968 while(!__test_and_set_bit(j, strmap) && strtab[j]) 1976 while (!__test_and_set_bit(j, info->strmap)
1977 && info->strtab[j])
1969 ++j; 1978 ++j;
1970 ++ndst; 1979 ++ndst;
1971 } 1980 }
1972 1981
1973 /* Append room for core symbols at end of core part. */ 1982 /* Append room for core symbols at end of core part. */
1974 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 1983 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1975 mod->core_size = symoffs + ndst * sizeof(Elf_Sym); 1984 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
1976 1985
1977 /* Put string table section at end of init part of module. */ 1986 /* Put string table section at end of init part of module. */
1978 strsect->sh_flags |= SHF_ALLOC; 1987 strsect->sh_flags |= SHF_ALLOC;
1979 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 1988 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1980 strindex) | INIT_OFFSET_MASK; 1989 info->index.str) | INIT_OFFSET_MASK;
1981 DEBUGP("\t%s\n", secstrings + strsect->sh_name); 1990 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
1982 1991
1983 /* Append room for core symbols' strings at end of core part. */ 1992 /* Append room for core symbols' strings at end of core part. */
1984 *pstroffs = mod->core_size; 1993 info->stroffs = mod->core_size;
1985 __set_bit(0, strmap); 1994 __set_bit(0, info->strmap);
1986 mod->core_size += bitmap_weight(strmap, strsect->sh_size); 1995 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
1987
1988 return symoffs;
1989} 1996}
1990 1997
1991static void add_kallsyms(struct module *mod, 1998static void add_kallsyms(struct module *mod, const struct load_info *info)
1992 Elf_Shdr *sechdrs,
1993 unsigned int shnum,
1994 unsigned int symindex,
1995 unsigned int strindex,
1996 unsigned long symoffs,
1997 unsigned long stroffs,
1998 const char *secstrings,
1999 unsigned long *strmap)
2000{ 1999{
2001 unsigned int i, ndst; 2000 unsigned int i, ndst;
2002 const Elf_Sym *src; 2001 const Elf_Sym *src;
2003 Elf_Sym *dst; 2002 Elf_Sym *dst;
2004 char *s; 2003 char *s;
2004 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
2005 2005
2006 mod->symtab = (void *)sechdrs[symindex].sh_addr; 2006 mod->symtab = (void *)symsec->sh_addr;
2007 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 2007 mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2008 mod->strtab = (void *)sechdrs[strindex].sh_addr; 2008 /* Make sure we get permanent strtab: don't use info->strtab. */
2009 mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2009 2010
2010 /* Set types up while we still have access to sections. */ 2011 /* Set types up while we still have access to sections. */
2011 for (i = 0; i < mod->num_symtab; i++) 2012 for (i = 0; i < mod->num_symtab; i++)
2012 mod->symtab[i].st_info 2013 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2013 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
2014 2014
2015 mod->core_symtab = dst = mod->module_core + symoffs; 2015 mod->core_symtab = dst = mod->module_core + info->symoffs;
2016 src = mod->symtab; 2016 src = mod->symtab;
2017 *dst = *src; 2017 *dst = *src;
2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2019 if (!is_core_symbol(src, sechdrs, shnum)) 2019 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2020 continue; 2020 continue;
2021 dst[ndst] = *src; 2021 dst[ndst] = *src;
2022 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); 2022 dst[ndst].st_name = bitmap_weight(info->strmap,
2023 dst[ndst].st_name);
2023 ++ndst; 2024 ++ndst;
2024 } 2025 }
2025 mod->core_num_syms = ndst; 2026 mod->core_num_syms = ndst;
2026 2027
2027 mod->core_strtab = s = mod->module_core + stroffs; 2028 mod->core_strtab = s = mod->module_core + info->stroffs;
2028 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) 2029 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2029 if (test_bit(i, strmap)) 2030 if (test_bit(i, info->strmap))
2030 *++s = mod->strtab[i]; 2031 *++s = mod->strtab[i];
2031} 2032}
2032#else 2033#else
2033static inline unsigned long layout_symtab(struct module *mod, 2034static inline void layout_symtab(struct module *mod, struct load_info *info)
2034 Elf_Shdr *sechdrs,
2035 unsigned int symindex,
2036 unsigned int strindex,
2037 const Elf_Ehdr *hdr,
2038 const char *secstrings,
2039 unsigned long *pstroffs,
2040 unsigned long *strmap)
2041{ 2035{
2042 return 0;
2043} 2036}
2044 2037
2045static inline void add_kallsyms(struct module *mod, 2038static void add_kallsyms(struct module *mod, struct load_info *info)
2046 Elf_Shdr *sechdrs,
2047 unsigned int shnum,
2048 unsigned int symindex,
2049 unsigned int strindex,
2050 unsigned long symoffs,
2051 unsigned long stroffs,
2052 const char *secstrings,
2053 const unsigned long *strmap)
2054{ 2039{
2055} 2040}
2056#endif /* CONFIG_KALLSYMS */ 2041#endif /* CONFIG_KALLSYMS */
2057 2042
2058static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) 2043static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2059{ 2044{
2045 if (!debug)
2046 return;
2060#ifdef CONFIG_DYNAMIC_DEBUG 2047#ifdef CONFIG_DYNAMIC_DEBUG
2061 if (ddebug_add_module(debug, num, debug->modname)) 2048 if (ddebug_add_module(debug, num, debug->modname))
2062 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2049 printk(KERN_ERR "dynamic debug error adding module: %s\n",
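The helpers in this and the following hunks share their state through the new struct load_info, whose definition is added earlier in module.c and is outside this excerpt. A sketch reconstructed purely from the fields used in this patch (an approximation for orientation, not the authoritative definition):

	struct load_info {
		Elf_Ehdr *hdr;			/* vmalloc'd copy of the whole module image */
		unsigned long len;
		Elf_Shdr *sechdrs;
		char *secstrings, *strtab;
		unsigned long *strmap;		/* bitmap of strtab bytes kept for core kallsyms */
		unsigned long symoffs, stroffs;
		struct _ddebug *debug;
		unsigned int num_debug;
		struct {
			unsigned int sym, str, mod, vers, info, pcpu;
		} index;
	};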
@@ -2087,65 +2074,47 @@ static void *module_alloc_update_bounds(unsigned long size)
2087} 2074}
2088 2075
2089#ifdef CONFIG_DEBUG_KMEMLEAK 2076#ifdef CONFIG_DEBUG_KMEMLEAK
2090static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2077static void kmemleak_load_module(const struct module *mod,
2091 Elf_Shdr *sechdrs, char *secstrings) 2078 const struct load_info *info)
2092{ 2079{
2093 unsigned int i; 2080 unsigned int i;
2094 2081
2095 /* only scan the sections containing data */ 2082 /* only scan the sections containing data */
2096 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2083 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2097 2084
2098 for (i = 1; i < hdr->e_shnum; i++) { 2085 for (i = 1; i < info->hdr->e_shnum; i++) {
2099 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2086 const char *name = info->secstrings + info->sechdrs[i].sh_name;
2087 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
2100 continue; 2088 continue;
2101 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 2089 if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
2102 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2103 continue; 2090 continue;
2104 2091
2105 kmemleak_scan_area((void *)sechdrs[i].sh_addr, 2092 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
2106 sechdrs[i].sh_size, GFP_KERNEL); 2093 info->sechdrs[i].sh_size, GFP_KERNEL);
2107 } 2094 }
2108} 2095}
2109#else 2096#else
2110static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2097static inline void kmemleak_load_module(const struct module *mod,
2111 Elf_Shdr *sechdrs, char *secstrings) 2098 const struct load_info *info)
2112{ 2099{
2113} 2100}
2114#endif 2101#endif
2115 2102
2116/* Allocate and load the module: note that size of section 0 is always 2103/* Sets info->hdr and info->len. */
2117 zero, and we rely on this for optional sections. */ 2104static int copy_and_check(struct load_info *info,
2118static noinline struct module *load_module(void __user *umod, 2105 const void __user *umod, unsigned long len,
2119 unsigned long len, 2106 const char __user *uargs)
2120 const char __user *uargs)
2121{ 2107{
2108 int err;
2122 Elf_Ehdr *hdr; 2109 Elf_Ehdr *hdr;
2123 Elf_Shdr *sechdrs;
2124 char *secstrings, *args, *modmagic, *strtab = NULL;
2125 char *staging;
2126 unsigned int i;
2127 unsigned int symindex = 0;
2128 unsigned int strindex = 0;
2129 unsigned int modindex, versindex, infoindex, pcpuindex;
2130 struct module *mod;
2131 long err = 0;
2132 void *ptr = NULL; /* Stops spurious gcc warning */
2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
2137 2110
2138 mm_segment_t old_fs;
2139
2140 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2141 umod, len, uargs);
2142 if (len < sizeof(*hdr)) 2111 if (len < sizeof(*hdr))
2143 return ERR_PTR(-ENOEXEC); 2112 return -ENOEXEC;
2144 2113
2145 /* Suck in entire file: we'll want most of it. */ 2114 /* Suck in entire file: we'll want most of it. */
2146 /* vmalloc barfs on "unusual" numbers. Check here */ 2115 /* vmalloc barfs on "unusual" numbers. Check here */
2147 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 2116 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2148 return ERR_PTR(-ENOMEM); 2117 return -ENOMEM;
2149 2118
2150 if (copy_from_user(hdr, umod, len) != 0) { 2119 if (copy_from_user(hdr, umod, len) != 0) {
2151 err = -EFAULT; 2120 err = -EFAULT;
@@ -2153,135 +2122,225 @@ static noinline struct module *load_module(void __user *umod,
2153 } 2122 }
2154 2123
2155 /* Sanity checks against insmoding binaries or wrong arch, 2124 /* Sanity checks against insmoding binaries or wrong arch,
2156 weird elf version */ 2125 weird elf version */
2157 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2126 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2158 || hdr->e_type != ET_REL 2127 || hdr->e_type != ET_REL
2159 || !elf_check_arch(hdr) 2128 || !elf_check_arch(hdr)
2160 || hdr->e_shentsize != sizeof(*sechdrs)) { 2129 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2161 err = -ENOEXEC; 2130 err = -ENOEXEC;
2162 goto free_hdr; 2131 goto free_hdr;
2163 } 2132 }
2164 2133
2165 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 2134 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2166 goto truncated; 2135 err = -ENOEXEC;
2136 goto free_hdr;
2137 }
2167 2138
2168 /* Convenience variables */ 2139 info->hdr = hdr;
2169 sechdrs = (void *)hdr + hdr->e_shoff; 2140 info->len = len;
2170 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 2141 return 0;
2171 sechdrs[0].sh_addr = 0;
2172 2142
2173 for (i = 1; i < hdr->e_shnum; i++) { 2143free_hdr:
2174 if (sechdrs[i].sh_type != SHT_NOBITS 2144 vfree(hdr);
2175 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 2145 return err;
2176 goto truncated; 2146}
2147
2148static void free_copy(struct load_info *info)
2149{
2150 vfree(info->hdr);
2151}
2152
2153static int rewrite_section_headers(struct load_info *info)
2154{
2155 unsigned int i;
2156
2157 /* This should always be true, but let's be sure. */
2158 info->sechdrs[0].sh_addr = 0;
2159
2160 for (i = 1; i < info->hdr->e_shnum; i++) {
2161 Elf_Shdr *shdr = &info->sechdrs[i];
2162 if (shdr->sh_type != SHT_NOBITS
2163 && info->len < shdr->sh_offset + shdr->sh_size) {
2164 printk(KERN_ERR "Module len %lu truncated\n",
2165 info->len);
2166 return -ENOEXEC;
2167 }
2177 2168
2178 /* Mark all sections sh_addr with their address in the 2169 /* Mark all sections sh_addr with their address in the
2179 temporary image. */ 2170 temporary image. */
2180 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 2171 shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
2181 2172
2182 /* Internal symbols and strings. */
2183 if (sechdrs[i].sh_type == SHT_SYMTAB) {
2184 symindex = i;
2185 strindex = sechdrs[i].sh_link;
2186 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
2187 }
2188#ifndef CONFIG_MODULE_UNLOAD 2173#ifndef CONFIG_MODULE_UNLOAD
2189 /* Don't load .exit sections */ 2174 /* Don't load .exit sections */
2190 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 2175 if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
2191 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 2176 shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
2192#endif 2177#endif
2193 } 2178 }
2194 2179
2195 modindex = find_sec(hdr, sechdrs, secstrings, 2180 /* Track but don't keep modinfo and version sections. */
2196 ".gnu.linkonce.this_module"); 2181 info->index.vers = find_sec(info, "__versions");
2197 if (!modindex) { 2182 info->index.info = find_sec(info, ".modinfo");
2183 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2184 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2185 return 0;
2186}
2187
2188/*
2189 * Set up our basic convenience variables (pointers to section headers,
2190 * search for module section index etc), and do some basic section
2191 * verification.
2192 *
2193 * Return the temporary module pointer (we'll replace it with the final
2194 * one when we move the module sections around).
2195 */
2196static struct module *setup_load_info(struct load_info *info)
2197{
2198 unsigned int i;
2199 int err;
2200 struct module *mod;
2201
2202 /* Set up the convenience variables */
2203 info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
2204 info->secstrings = (void *)info->hdr
2205 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2206
2207 err = rewrite_section_headers(info);
2208 if (err)
2209 return ERR_PTR(err);
2210
2211 /* Find internal symbols and strings. */
2212 for (i = 1; i < info->hdr->e_shnum; i++) {
2213 if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
2214 info->index.sym = i;
2215 info->index.str = info->sechdrs[i].sh_link;
2216 info->strtab = (char *)info->hdr
2217 + info->sechdrs[info->index.str].sh_offset;
2218 break;
2219 }
2220 }
2221
2222 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2223 if (!info->index.mod) {
2198 printk(KERN_WARNING "No module found in object\n"); 2224 printk(KERN_WARNING "No module found in object\n");
2199 err = -ENOEXEC; 2225 return ERR_PTR(-ENOEXEC);
2200 goto free_hdr;
2201 } 2226 }
2202 /* This is temporary: point mod into copy of data. */ 2227 /* This is temporary: point mod into copy of data. */
2203 mod = (void *)sechdrs[modindex].sh_addr; 2228 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2204 2229
2205 if (symindex == 0) { 2230 if (info->index.sym == 0) {
2206 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2231 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
2207 mod->name); 2232 mod->name);
2208 err = -ENOEXEC; 2233 return ERR_PTR(-ENOEXEC);
2209 goto free_hdr;
2210 } 2234 }
2211 2235
2212 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 2236 info->index.pcpu = find_pcpusec(info);
2213 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
2214 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
2215
2216 /* Don't keep modinfo and version sections. */
2217 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2218 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2219 2237
2220 /* Check module struct version now, before we try to use module. */ 2238 /* Check module struct version now, before we try to use module. */
2221 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2239 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
2222 err = -ENOEXEC; 2240 return ERR_PTR(-ENOEXEC);
2223 goto free_hdr; 2241
2224 } 2242 return mod;
2243}
2244
2245static int check_modinfo(struct module *mod, struct load_info *info)
2246{
2247 const char *modmagic = get_modinfo(info, "vermagic");
2248 int err;
2225 2249
2226 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2227 /* This is allowed: modprobe --force will invalidate it. */ 2250 /* This is allowed: modprobe --force will invalidate it. */
2228 if (!modmagic) { 2251 if (!modmagic) {
2229 err = try_to_force_load(mod, "bad vermagic"); 2252 err = try_to_force_load(mod, "bad vermagic");
2230 if (err) 2253 if (err)
2231 goto free_hdr; 2254 return err;
2232 } else if (!same_magic(modmagic, vermagic, versindex)) { 2255 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2233 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2256 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
2234 mod->name, modmagic, vermagic); 2257 mod->name, modmagic, vermagic);
2235 err = -ENOEXEC; 2258 return -ENOEXEC;
2236 goto free_hdr;
2237 } 2259 }
2238 2260
2239 staging = get_modinfo(sechdrs, infoindex, "staging"); 2261 if (get_modinfo(info, "staging")) {
2240 if (staging) {
2241 add_taint_module(mod, TAINT_CRAP); 2262 add_taint_module(mod, TAINT_CRAP);
2242 printk(KERN_WARNING "%s: module is from the staging directory," 2263 printk(KERN_WARNING "%s: module is from the staging directory,"
2243 " the quality is unknown, you have been warned.\n", 2264 " the quality is unknown, you have been warned.\n",
2244 mod->name); 2265 mod->name);
2245 } 2266 }
2246 2267
2247 /* Now copy in args */ 2268 /* Set up license info based on the info section */
2248 args = strndup_user(uargs, ~0UL >> 1); 2269 set_license(mod, get_modinfo(info, "license"));
2249 if (IS_ERR(args)) {
2250 err = PTR_ERR(args);
2251 goto free_hdr;
2252 }
2253 2270
2254 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) 2271 return 0;
2255 * sizeof(long), GFP_KERNEL); 2272}
2256 if (!strmap) {
2257 err = -ENOMEM;
2258 goto free_mod;
2259 }
2260 2273
2261 mod->state = MODULE_STATE_COMING; 2274static void find_module_sections(struct module *mod, struct load_info *info)
2275{
2276 mod->kp = section_objs(info, "__param",
2277 sizeof(*mod->kp), &mod->num_kp);
2278 mod->syms = section_objs(info, "__ksymtab",
2279 sizeof(*mod->syms), &mod->num_syms);
2280 mod->crcs = section_addr(info, "__kcrctab");
2281 mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
2282 sizeof(*mod->gpl_syms),
2283 &mod->num_gpl_syms);
2284 mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
2285 mod->gpl_future_syms = section_objs(info,
2286 "__ksymtab_gpl_future",
2287 sizeof(*mod->gpl_future_syms),
2288 &mod->num_gpl_future_syms);
2289 mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
2262 2290
2263 /* Allow arches to frob section contents and sizes. */ 2291#ifdef CONFIG_UNUSED_SYMBOLS
2264 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 2292 mod->unused_syms = section_objs(info, "__ksymtab_unused",
2265 if (err < 0) 2293 sizeof(*mod->unused_syms),
2266 goto free_mod; 2294 &mod->num_unused_syms);
2295 mod->unused_crcs = section_addr(info, "__kcrctab_unused");
2296 mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
2297 sizeof(*mod->unused_gpl_syms),
2298 &mod->num_unused_gpl_syms);
2299 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2300#endif
2301#ifdef CONFIG_CONSTRUCTORS
2302 mod->ctors = section_objs(info, ".ctors",
2303 sizeof(*mod->ctors), &mod->num_ctors);
2304#endif
2267 2305
2268 if (pcpuindex) { 2306#ifdef CONFIG_TRACEPOINTS
2269 /* We have a special allocation for this section. */ 2307 mod->tracepoints = section_objs(info, "__tracepoints",
2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, 2308 sizeof(*mod->tracepoints),
2271 sechdrs[pcpuindex].sh_addralign); 2309 &mod->num_tracepoints);
2272 if (err) 2310#endif
2273 goto free_mod; 2311#ifdef CONFIG_EVENT_TRACING
2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2312 mod->trace_events = section_objs(info, "_ftrace_events",
2275 } 2313 sizeof(*mod->trace_events),
2276 /* Keep this around for failure path. */ 2314 &mod->num_trace_events);
2277 percpu = mod_percpu(mod); 2315 /*
2316 * This section contains pointers to allocated objects in the trace
2317 * code and not scanning it leads to false positives.
2318 */
2319 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2320 mod->num_trace_events, GFP_KERNEL);
2321#endif
2322#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2323 /* sechdrs[0].sh_size is always zero */
2324 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
2325 sizeof(*mod->ftrace_callsites),
2326 &mod->num_ftrace_callsites);
2327#endif
2278 2328
2279 /* Determine total sizes, and put offsets in sh_entsize. For now 2329 mod->extable = section_objs(info, "__ex_table",
2280 this is done generically; there doesn't appear to be any 2330 sizeof(*mod->extable), &mod->num_exentries);
2281 special cases for the architectures. */ 2331
2282 layout_sections(mod, hdr, sechdrs, secstrings); 2332 if (section_addr(info, "__obsparm"))
2283 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, 2333 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2284 secstrings, &stroffs, strmap); 2334 mod->name);
2335
2336 info->debug = section_objs(info, "__verbose",
2337 sizeof(*info->debug), &info->num_debug);
2338}
2339
2340static int move_module(struct module *mod, struct load_info *info)
2341{
2342 int i;
2343 void *ptr;
2285 2344
2286 /* Do the allocs. */ 2345 /* Do the allocs. */
2287 ptr = module_alloc_update_bounds(mod->core_size); 2346 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2291,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod,
2291 * leak. 2350 * leak.
2292 */ 2351 */
2293 kmemleak_not_leak(ptr); 2352 kmemleak_not_leak(ptr);
2294 if (!ptr) { 2353 if (!ptr)
2295 err = -ENOMEM; 2354 return -ENOMEM;
2296 goto free_percpu; 2355
2297 }
2298 memset(ptr, 0, mod->core_size); 2356 memset(ptr, 0, mod->core_size);
2299 mod->module_core = ptr; 2357 mod->module_core = ptr;
2300 2358
@@ -2307,50 +2365,40 @@ static noinline struct module *load_module(void __user *umod,
2307 */ 2365 */
2308 kmemleak_ignore(ptr); 2366 kmemleak_ignore(ptr);
2309 if (!ptr && mod->init_size) { 2367 if (!ptr && mod->init_size) {
2310 err = -ENOMEM; 2368 module_free(mod, mod->module_core);
2311 goto free_core; 2369 return -ENOMEM;
2312 } 2370 }
2313 memset(ptr, 0, mod->init_size); 2371 memset(ptr, 0, mod->init_size);
2314 mod->module_init = ptr; 2372 mod->module_init = ptr;
2315 2373
2316 /* Transfer each section which specifies SHF_ALLOC */ 2374 /* Transfer each section which specifies SHF_ALLOC */
2317 DEBUGP("final section addresses:\n"); 2375 DEBUGP("final section addresses:\n");
2318 for (i = 0; i < hdr->e_shnum; i++) { 2376 for (i = 0; i < info->hdr->e_shnum; i++) {
2319 void *dest; 2377 void *dest;
2378 Elf_Shdr *shdr = &info->sechdrs[i];
2320 2379
2321 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2380 if (!(shdr->sh_flags & SHF_ALLOC))
2322 continue; 2381 continue;
2323 2382
2324 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 2383 if (shdr->sh_entsize & INIT_OFFSET_MASK)
2325 dest = mod->module_init 2384 dest = mod->module_init
2326 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 2385 + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
2327 else 2386 else
2328 dest = mod->module_core + sechdrs[i].sh_entsize; 2387 dest = mod->module_core + shdr->sh_entsize;
2329 2388
2330 if (sechdrs[i].sh_type != SHT_NOBITS) 2389 if (shdr->sh_type != SHT_NOBITS)
2331 memcpy(dest, (void *)sechdrs[i].sh_addr, 2390 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2332 sechdrs[i].sh_size);
2333 /* Update sh_addr to point to copy in image. */ 2391 /* Update sh_addr to point to copy in image. */
2334 sechdrs[i].sh_addr = (unsigned long)dest; 2392 shdr->sh_addr = (unsigned long)dest;
2335 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 2393 DEBUGP("\t0x%lx %s\n",
2336 } 2394 shdr->sh_addr, info->secstrings + shdr->sh_name);
2337 /* Module has been moved. */
2338 mod = (void *)sechdrs[modindex].sh_addr;
2339 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2340
2341#if defined(CONFIG_MODULE_UNLOAD)
2342 mod->refptr = alloc_percpu(struct module_ref);
2343 if (!mod->refptr) {
2344 err = -ENOMEM;
2345 goto free_init;
2346 } 2395 }
2347#endif
2348 /* Now we've moved module, initialize linked lists, etc. */
2349 module_unload_init(mod);
2350 2396
2351 /* Set up license info based on the info section */ 2397 return 0;
2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2398}
2353 2399
2400static int check_module_license_and_versions(struct module *mod)
2401{
2354 /* 2402 /*
2355 * ndiswrapper is under GPL by itself, but loads proprietary modules. 2403 * ndiswrapper is under GPL by itself, but loads proprietary modules.
2356 * Don't use add_taint_module(), as it would prevent ndiswrapper from 2404 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2363,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod,
2363 if (strcmp(mod->name, "driverloader") == 0) 2411 if (strcmp(mod->name, "driverloader") == 0)
2364 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2412 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2365 2413
2366 /* Set up MODINFO_ATTR fields */
2367 setup_modinfo(mod, sechdrs, infoindex);
2368
2369 /* Fix up syms, so that st_value is a pointer to location. */
2370 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
2371 mod);
2372 if (err < 0)
2373 goto cleanup;
2374
2375 /* Now we've got everything in the final locations, we can
2376 * find optional sections. */
2377 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2378 sizeof(*mod->kp), &mod->num_kp);
2379 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2380 sizeof(*mod->syms), &mod->num_syms);
2381 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2382 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2383 sizeof(*mod->gpl_syms),
2384 &mod->num_gpl_syms);
2385 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2386 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2387 "__ksymtab_gpl_future",
2388 sizeof(*mod->gpl_future_syms),
2389 &mod->num_gpl_future_syms);
2390 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2391 "__kcrctab_gpl_future");
2392
2393#ifdef CONFIG_UNUSED_SYMBOLS
2394 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2395 "__ksymtab_unused",
2396 sizeof(*mod->unused_syms),
2397 &mod->num_unused_syms);
2398 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2399 "__kcrctab_unused");
2400 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2401 "__ksymtab_unused_gpl",
2402 sizeof(*mod->unused_gpl_syms),
2403 &mod->num_unused_gpl_syms);
2404 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2405 "__kcrctab_unused_gpl");
2406#endif
2407#ifdef CONFIG_CONSTRUCTORS
2408 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2409 sizeof(*mod->ctors), &mod->num_ctors);
2410#endif
2411
2412#ifdef CONFIG_TRACEPOINTS
2413 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2414 "__tracepoints",
2415 sizeof(*mod->tracepoints),
2416 &mod->num_tracepoints);
2417#endif
2418#ifdef CONFIG_EVENT_TRACING
2419 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2420 "_ftrace_events",
2421 sizeof(*mod->trace_events),
2422 &mod->num_trace_events);
2423 /*
2424 * This section contains pointers to allocated objects in the trace
2425 * code and not scanning it leads to false positives.
2426 */
2427 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2428 mod->num_trace_events, GFP_KERNEL);
2429#endif
2430#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2431 /* sechdrs[0].sh_size is always zero */
2432 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2433 "__mcount_loc",
2434 sizeof(*mod->ftrace_callsites),
2435 &mod->num_ftrace_callsites);
2436#endif
2437#ifdef CONFIG_MODVERSIONS 2414#ifdef CONFIG_MODVERSIONS
2438 if ((mod->num_syms && !mod->crcs) 2415 if ((mod->num_syms && !mod->crcs)
2439 || (mod->num_gpl_syms && !mod->gpl_crcs) 2416 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2443,56 +2420,16 @@ static noinline struct module *load_module(void __user *umod,
2443 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2420 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2444#endif 2421#endif
2445 ) { 2422 ) {
2446 err = try_to_force_load(mod, 2423 return try_to_force_load(mod,
2447 "no versions for exported symbols"); 2424 "no versions for exported symbols");
2448 if (err)
2449 goto cleanup;
2450 } 2425 }
2451#endif 2426#endif
2427 return 0;
2428}
2452 2429
2453 /* Now do relocations. */ 2430static void flush_module_icache(const struct module *mod)
2454 for (i = 1; i < hdr->e_shnum; i++) { 2431{
2455 const char *strtab = (char *)sechdrs[strindex].sh_addr; 2432 mm_segment_t old_fs;
2456 unsigned int info = sechdrs[i].sh_info;
2457
2458 /* Not a valid relocation section? */
2459 if (info >= hdr->e_shnum)
2460 continue;
2461
2462 /* Don't bother with non-allocated sections */
2463 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
2464 continue;
2465
2466 if (sechdrs[i].sh_type == SHT_REL)
2467 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
2468 else if (sechdrs[i].sh_type == SHT_RELA)
2469 err = apply_relocate_add(sechdrs, strtab, symindex, i,
2470 mod);
2471 if (err < 0)
2472 goto cleanup;
2473 }
2474
2475 /* Set up and sort exception table */
2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2477 sizeof(*mod->extable), &mod->num_exentries);
2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2479
2480 /* Finally, copy percpu area over. */
2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2482 sechdrs[pcpuindex].sh_size);
2483
2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2485 symoffs, stroffs, secstrings, strmap);
2486 kfree(strmap);
2487 strmap = NULL;
2488
2489 if (!mod->taints)
2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2491 sizeof(*debug), &num_debug);
2492
2493 err = module_finalize(hdr, sechdrs, mod);
2494 if (err < 0)
2495 goto cleanup;
2496 2433
2497 /* flush the icache in correct context */ 2434 /* flush the icache in correct context */
2498 old_fs = get_fs(); 2435 old_fs = get_fs();
@@ -2511,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod,
2511 (unsigned long)mod->module_core + mod->core_size); 2448 (unsigned long)mod->module_core + mod->core_size);
2512 2449
2513 set_fs(old_fs); 2450 set_fs(old_fs);
2451}
2514 2452
2515 mod->args = args; 2453static struct module *layout_and_allocate(struct load_info *info)
2516 if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) 2454{
2517 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2455 /* Module within temporary copy. */
2518 mod->name); 2456 struct module *mod;
2457 Elf_Shdr *pcpusec;
2458 int err;
2459
2460 mod = setup_load_info(info);
2461 if (IS_ERR(mod))
2462 return mod;
2463
2464 err = check_modinfo(mod, info);
2465 if (err)
2466 return ERR_PTR(err);
2467
2468 /* Allow arches to frob section contents and sizes. */
2469 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2470 info->secstrings, mod);
2471 if (err < 0)
2472 goto out;
2473
2474 pcpusec = &info->sechdrs[info->index.pcpu];
2475 if (pcpusec->sh_size) {
2476 /* We have a special allocation for this section. */
2477 err = percpu_modalloc(mod,
2478 pcpusec->sh_size, pcpusec->sh_addralign);
2479 if (err)
2480 goto out;
2481 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2482 }
2483
2484 /* Determine total sizes, and put offsets in sh_entsize. For now
2485 this is done generically; there doesn't appear to be any
2486 special cases for the architectures. */
2487 layout_sections(mod, info);
2488
2489 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2490 * sizeof(long), GFP_KERNEL);
2491 if (!info->strmap) {
2492 err = -ENOMEM;
2493 goto free_percpu;
2494 }
2495 layout_symtab(mod, info);
2496
2497 /* Allocate and move to the final place */
2498 err = move_module(mod, info);
2499 if (err)
2500 goto free_strmap;
2501
2502 /* Module has been copied to its final place now: return it. */
2503 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2504 kmemleak_load_module(mod, info);
2505 return mod;
2506
2507free_strmap:
2508 kfree(info->strmap);
2509free_percpu:
2510 percpu_modfree(mod);
2511out:
2512 return ERR_PTR(err);
2513}
2514
2515/* mod is no longer valid after this! */
2516static void module_deallocate(struct module *mod, struct load_info *info)
2517{
2518 kfree(info->strmap);
2519 percpu_modfree(mod);
2520 module_free(mod, mod->module_init);
2521 module_free(mod, mod->module_core);
2522}
2523
2524static int post_relocation(struct module *mod, const struct load_info *info)
2525{
2526 /* Sort exception table now relocations are done. */
2527 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2528
2529 /* Copy relocated percpu area over. */
2530 percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
2531 info->sechdrs[info->index.pcpu].sh_size);
2532
2533 /* Setup kallsyms-specific fields. */
2534 add_kallsyms(mod, info);
2535
2536 /* Arch-specific module finalizing. */
2537 return module_finalize(info->hdr, info->sechdrs, mod);
2538}
2539
2540/* Allocate and load the module: note that size of section 0 is always
2541 zero, and we rely on this for optional sections. */
2542static struct module *load_module(void __user *umod,
2543 unsigned long len,
2544 const char __user *uargs)
2545{
2546 struct load_info info = { NULL, };
2547 struct module *mod;
2548 long err;
2549
2550 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2551 umod, len, uargs);
2552
2553 /* Copy in the blobs from userspace, check they are vaguely sane. */
2554 err = copy_and_check(&info, umod, len, uargs);
2555 if (err)
2556 return ERR_PTR(err);
2557
2558 /* Figure out module layout, and allocate all the memory. */
2559 mod = layout_and_allocate(&info);
2560 if (IS_ERR(mod)) {
2561 err = PTR_ERR(mod);
2562 goto free_copy;
2563 }
2564
2565 /* Now module is in final location, initialize linked lists, etc. */
2566 err = module_unload_init(mod);
2567 if (err)
2568 goto free_module;
2569
2570 /* Now we've got everything in the final locations, we can
2571 * find optional sections. */
2572 find_module_sections(mod, &info);
2573
2574 err = check_module_license_and_versions(mod);
2575 if (err)
2576 goto free_unload;
2577
2578 /* Set up MODINFO_ATTR fields */
2579 setup_modinfo(mod, &info);
2580
2581 /* Fix up syms, so that st_value is a pointer to location. */
2582 err = simplify_symbols(mod, &info);
2583 if (err < 0)
2584 goto free_modinfo;
2585
2586 err = apply_relocations(mod, &info);
2587 if (err < 0)
2588 goto free_modinfo;
2589
2590 err = post_relocation(mod, &info);
2591 if (err < 0)
2592 goto free_modinfo;
2593
2594 flush_module_icache(mod);
2595
2596 /* Now copy in args */
2597 mod->args = strndup_user(uargs, ~0UL >> 1);
2598 if (IS_ERR(mod->args)) {
2599 err = PTR_ERR(mod->args);
2600 goto free_arch_cleanup;
2601 }
2602
2603 /* Mark state as coming so strong_try_module_get() ignores us. */
2604 mod->state = MODULE_STATE_COMING;
2519 2605
2520 /* Now sew it into the lists so we can get lockdep and oops 2606 /* Now sew it into the lists so we can get lockdep and oops
2521 * info during argument parsing. No one should access us, since 2607
@@ -2530,8 +2616,9 @@ static noinline struct module *load_module(void __user *umod,
2530 goto unlock; 2616 goto unlock;
2531 } 2617 }
2532 2618
2533 if (debug) 2619 /* This has to be done once we're sure module name is unique. */
2534 dynamic_debug_setup(debug, num_debug); 2620 if (!mod->taints)
2621 dynamic_debug_setup(info.debug, info.num_debug);
2535 2622
2536 /* Find duplicate symbols */ 2623 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod); 2624 err = verify_export_symbols(mod);
@@ -2541,23 +2628,22 @@ static noinline struct module *load_module(void __user *umod,
2541 list_add_rcu(&mod->list, &modules); 2628 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex); 2629 mutex_unlock(&module_mutex);
2543 2630
2631 /* Module is ready to execute: parsing args may do that. */
2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2632 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2545 if (err < 0) 2633 if (err < 0)
2546 goto unlink; 2634 goto unlink;
2547 2635
2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2636 /* Link in to sysfs. */
2637 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
2549 if (err < 0) 2638 if (err < 0)
2550 goto unlink; 2639 goto unlink;
2551 2640
2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2641 /* Get rid of temporary copy and strmap. */
2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2642 kfree(info.strmap);
2554 2643 free_copy(&info);
2555 /* Get rid of temporary copy */
2556 vfree(hdr);
2557
2558 trace_module_load(mod);
2559 2644
2560 /* Done! */ 2645 /* Done! */
2646 trace_module_load(mod);
2561 return mod; 2647 return mod;
2562 2648
2563 unlink: 2649 unlink:
@@ -2565,35 +2651,23 @@ static noinline struct module *load_module(void __user *umod,
2565 /* Unlink carefully: kallsyms could be walking list. */ 2651 /* Unlink carefully: kallsyms could be walking list. */
2566 list_del_rcu(&mod->list); 2652 list_del_rcu(&mod->list);
2567 ddebug: 2653 ddebug:
2568 dynamic_debug_remove(debug); 2654 if (!mod->taints)
2655 dynamic_debug_remove(info.debug);
2569 unlock: 2656 unlock:
2570 mutex_unlock(&module_mutex); 2657 mutex_unlock(&module_mutex);
2571 synchronize_sched(); 2658 synchronize_sched();
2659 kfree(mod->args);
2660 free_arch_cleanup:
2572 module_arch_cleanup(mod); 2661 module_arch_cleanup(mod);
2573 cleanup: 2662 free_modinfo:
2574 free_modinfo(mod); 2663 free_modinfo(mod);
2664 free_unload:
2575 module_unload_free(mod); 2665 module_unload_free(mod);
2576#if defined(CONFIG_MODULE_UNLOAD) 2666 free_module:
2577 free_percpu(mod->refptr); 2667 module_deallocate(mod, &info);
2578 free_init: 2668 free_copy:
2579#endif 2669 free_copy(&info);
2580 module_free(mod, mod->module_init);
2581 free_core:
2582 module_free(mod, mod->module_core);
2583 /* mod will be freed with core. Don't access it beyond this line! */
2584 free_percpu:
2585 free_percpu(percpu);
2586 free_mod:
2587 kfree(args);
2588 kfree(strmap);
2589 free_hdr:
2590 vfree(hdr);
2591 return ERR_PTR(err); 2670 return ERR_PTR(err);
2592
2593 truncated:
2594 printk(KERN_ERR "Module len %lu truncated\n", len);
2595 err = -ENOEXEC;
2596 goto free_hdr;
2597} 2671}
2598 2672
2599/* Call module constructors. */ 2673/* Call module constructors. */
diff --git a/kernel/padata.c b/kernel/padata.c
index fdd8ae609ce3..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sysfs.h>
29#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
30 31
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
32#define MAX_OBJ_NUM 1000 33#define MAX_OBJ_NUM 1000
33 34
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 36{
36 int cpu, target_cpu; 37 int cpu, target_cpu;
37 38
38 target_cpu = cpumask_first(pd->cpumask); 39 target_cpu = cpumask_first(pd->cpumask.pcpu);
39 for (cpu = 0; cpu < cpu_index; cpu++) 40 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask); 41 target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
41 42
42 return target_cpu; 43 return target_cpu;
43} 44}
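The pd->cpumask.pcpu and pd->cpumask.cbcpu accesses above rely on the cpumask split made in include/linux/padata.h by the same series. Approximately (shown only for orientation; the authoritative definition lives in the header, not this file):

	struct padata_cpumask {
		cpumask_var_t	pcpu;	/* CPUs that run the parallel workers */
		cpumask_var_t	cbcpu;	/* CPUs that run the serial callbacks */
	};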
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
53 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
55 */ 56 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
57 58
58 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
59} 60}
60 61
61static void padata_parallel_worker(struct work_struct *work) 62static void padata_parallel_worker(struct work_struct *parallel_work)
62{ 63{
63 struct padata_queue *queue; 64 struct padata_parallel_queue *pqueue;
64 struct parallel_data *pd; 65 struct parallel_data *pd;
65 struct padata_instance *pinst; 66 struct padata_instance *pinst;
66 LIST_HEAD(local_list); 67 LIST_HEAD(local_list);
67 68
68 local_bh_disable(); 69 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork); 70 pqueue = container_of(parallel_work,
70 pd = queue->pd; 71 struct padata_parallel_queue, work);
72 pd = pqueue->pd;
71 pinst = pd->pinst; 73 pinst = pd->pinst;
72 74
73 spin_lock(&queue->parallel.lock); 75 spin_lock(&pqueue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list); 76 list_replace_init(&pqueue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock); 77 spin_unlock(&pqueue->parallel.lock);
76 78
77 while (!list_empty(&local_list)) { 79 while (!list_empty(&local_list)) {
78 struct padata_priv *padata; 80 struct padata_priv *padata;
@@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work)
94 * @pinst: padata instance 96 * @pinst: padata instance
95 * @padata: object to be parallelized 97 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on, 98 * @cb_cpu: cpu the serialization callback function will run on,
 97 * must be in the cpumask of padata. 99 * must be in the serial cpumask of padata (i.e. cpumask.cbcpu).
98 * 100 *
99 * The parallelization callback function will run with BHs off. 101 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel 102 * Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu) 106 struct padata_priv *padata, int cb_cpu)
105{ 107{
106 int target_cpu, err; 108 int target_cpu, err;
107 struct padata_queue *queue; 109 struct padata_parallel_queue *queue;
108 struct parallel_data *pd; 110 struct parallel_data *pd;
109 111
110 rcu_read_lock_bh(); 112 rcu_read_lock_bh();
111 113
112 pd = rcu_dereference(pinst->pd); 114 pd = rcu_dereference(pinst->pd);
113 115
114 err = 0; 116 err = -EINVAL;
115 if (!(pinst->flags & PADATA_INIT)) 117 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
118 goto out;
119
120 if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
116 goto out; 121 goto out;
117 122
118 err = -EBUSY; 123 err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) 127 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out; 128 goto out;
124 129
125 err = -EINVAL; 130 err = 0;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt); 131 atomic_inc(&pd->refcnt);
131 padata->pd = pd; 132 padata->pd = pd;
132 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr); 138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138 139
139 target_cpu = padata_cpu_hash(padata); 140 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
141 142
142 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list); 144 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock); 145 spin_unlock(&queue->parallel.lock);
145 146
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork); 147 queue_work_on(target_cpu, pinst->wq, &queue->work);
147 148
148out: 149out:
149 rcu_read_unlock_bh(); 150 rcu_read_unlock_bh();
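For reference, a minimal sketch of how a caller drives the reworked submission path. The my_request structure and the my_parallel/my_serial callbacks are hypothetical; only the padata_* calls and the rule that @cb_cpu must now be in cpumask.cbcpu come from this hunk.

/* Sketch only: submitting one object through padata_do_parallel(). */
struct my_request {
	struct padata_priv padata;	/* must be embedded in the object */
	/* ... request payload ... */
};

static void my_parallel(struct padata_priv *padata)
{
	/* heavy work; runs with BHs off on a CPU from cpumask.pcpu */
	padata_do_serial(padata);	/* hand back for ordered completion */
}

static void my_serial(struct padata_priv *padata)
{
	/* runs in submission order on the cb_cpu passed below */
}

static int my_submit(struct padata_instance *pinst,
		     struct my_request *req, int cb_cpu)
{
	req->padata.parallel = my_parallel;
	req->padata.serial   = my_serial;

	/* -EINVAL if cb_cpu is not in cpumask.cbcpu or the instance is
	 * not started/valid; -EBUSY if too many objects are in flight */
	return padata_do_parallel(pinst, &req->padata, cb_cpu);
}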
@@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel);
171 */ 172 */
172static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
173{ 174{
174 int cpu, num_cpus, empty, calc_seq_nr; 175 int cpu, num_cpus;
175 int seq_nr, next_nr, overrun, next_overrun; 176 int next_nr, next_index;
176 struct padata_queue *queue, *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
177 struct padata_priv *padata; 178 struct padata_priv *padata;
178 struct padata_list *reorder; 179 struct padata_list *reorder;
179 180
180 empty = 0; 181 num_cpus = cpumask_weight(pd->cpumask.pcpu);
181 next_nr = -1;
182 next_overrun = 0;
183 next_queue = NULL;
184
185 num_cpus = cpumask_weight(pd->cpumask);
186
187 for_each_cpu(cpu, pd->cpumask) {
188 queue = per_cpu_ptr(pd->queue, cpu);
189 reorder = &queue->reorder;
190
191 /*
192 * Calculate the seq_nr of the object that should be
193 * next in this reorder queue.
194 */
195 overrun = 0;
196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
197 + queue->cpu_index;
198 182
199 if (unlikely(calc_seq_nr > pd->max_seq_nr)) { 183 /*
200 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; 184 * Calculate the percpu reorder queue and the sequence
201 overrun = 1; 185 * number of the next object.
202 } 186 */
203 187 next_nr = pd->processed;
204 if (!list_empty(&reorder->list)) { 188 next_index = next_nr % num_cpus;
205 padata = list_entry(reorder->list.next, 189 cpu = padata_index_to_cpu(pd, next_index);
206 struct padata_priv, list); 190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
207 191
208 seq_nr = padata->seq_nr; 192 if (unlikely(next_nr > pd->max_seq_nr)) {
209 BUG_ON(calc_seq_nr != seq_nr); 193 next_nr = next_nr - pd->max_seq_nr - 1;
210 } else { 194 next_index = next_nr % num_cpus;
211 seq_nr = calc_seq_nr; 195 cpu = padata_index_to_cpu(pd, next_index);
212 empty++; 196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
213 } 197 pd->processed = 0;
214
215 if (next_nr < 0 || seq_nr < next_nr
216 || (next_overrun && !overrun)) {
217 next_nr = seq_nr;
218 next_overrun = overrun;
219 next_queue = queue;
220 }
221 } 198 }
222 199
223 padata = NULL; 200 padata = NULL;
224 201
225 if (empty == num_cpus)
226 goto out;
227
228 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
229 203
230 if (!list_empty(&reorder->list)) { 204 if (!list_empty(&reorder->list)) {
231 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
232 struct padata_priv, list); 206 struct padata_priv, list);
233 207
234 if (unlikely(next_overrun)) { 208 BUG_ON(next_nr != padata->seq_nr);
235 for_each_cpu(cpu, pd->cpumask) {
236 queue = per_cpu_ptr(pd->queue, cpu);
237 atomic_set(&queue->num_obj, 0);
238 }
239 }
240 209
241 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
242 list_del_init(&padata->list); 211 list_del_init(&padata->list);
243 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
244 spin_unlock(&reorder->lock); 213 spin_unlock(&reorder->lock);
245 214
246 atomic_inc(&next_queue->num_obj); 215 pd->processed++;
247 216
248 goto out; 217 goto out;
249 } 218 }
250 219
251 queue = per_cpu_ptr(pd->queue, smp_processor_id()); 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) { 221 if (queue->cpu_index == next_queue->cpu_index) {
253 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
254 goto out; 223 goto out;
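The new reorder logic replaces the per-queue scan with round-robin arithmetic on pd->processed. A condensed restatement, purely illustrative and based only on the fields used in this hunk:

/* With cpumask.pcpu = {2, 5, 7} (num_cpus = 3) and pd->processed = 7,
 * next_index = 7 % 3 = 1, so the next object in sequence is expected on
 * CPU 5 -- the queue that receives sequence numbers of the form 3n + 1. */
static int next_reorder_cpu(struct parallel_data *pd)
{
	int num_cpus = cpumask_weight(pd->cpumask.pcpu);

	return padata_index_to_cpu(pd, pd->processed % num_cpus);
}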
@@ -262,7 +231,7 @@ out:
262static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
263{ 232{
264 struct padata_priv *padata; 233 struct padata_priv *padata;
265 struct padata_queue *queue; 234 struct padata_serial_queue *squeue;
266 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
267 236
268 /* 237 /*
@@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
301 return; 270 return;
302 } 271 }
303 272
304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
305 274
306 spin_lock(&queue->serial.lock); 275 spin_lock(&squeue->serial.lock);
307 list_add_tail(&padata->list, &queue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
308 spin_unlock(&queue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
309 278
310 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
311 } 280 }
312 281
313 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
@@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg)
333 padata_reorder(pd); 302 padata_reorder(pd);
334} 303}
335 304
336static void padata_serial_worker(struct work_struct *work) 305static void padata_serial_worker(struct work_struct *serial_work)
337{ 306{
338 struct padata_queue *queue; 307 struct padata_serial_queue *squeue;
339 struct parallel_data *pd; 308 struct parallel_data *pd;
340 LIST_HEAD(local_list); 309 LIST_HEAD(local_list);
341 310
342 local_bh_disable(); 311 local_bh_disable();
343 queue = container_of(work, struct padata_queue, swork); 312 squeue = container_of(serial_work, struct padata_serial_queue, work);
344 pd = queue->pd; 313 pd = squeue->pd;
345 314
346 spin_lock(&queue->serial.lock); 315 spin_lock(&squeue->serial.lock);
347 list_replace_init(&queue->serial.list, &local_list); 316 list_replace_init(&squeue->serial.list, &local_list);
348 spin_unlock(&queue->serial.lock); 317 spin_unlock(&squeue->serial.lock);
349 318
350 while (!list_empty(&local_list)) { 319 while (!list_empty(&local_list)) {
351 struct padata_priv *padata; 320 struct padata_priv *padata;
@@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
372void padata_do_serial(struct padata_priv *padata) 341void padata_do_serial(struct padata_priv *padata)
373{ 342{
374 int cpu; 343 int cpu;
375 struct padata_queue *queue; 344 struct padata_parallel_queue *pqueue;
376 struct parallel_data *pd; 345 struct parallel_data *pd;
377 346
378 pd = padata->pd; 347 pd = padata->pd;
379 348
380 cpu = get_cpu(); 349 cpu = get_cpu();
381 queue = per_cpu_ptr(pd->queue, cpu); 350 pqueue = per_cpu_ptr(pd->pqueue, cpu);
382 351
383 spin_lock(&queue->reorder.lock); 352 spin_lock(&pqueue->reorder.lock);
384 atomic_inc(&pd->reorder_objects); 353 atomic_inc(&pd->reorder_objects);
385 list_add_tail(&padata->list, &queue->reorder.list); 354 list_add_tail(&padata->list, &pqueue->reorder.list);
386 spin_unlock(&queue->reorder.lock); 355 spin_unlock(&pqueue->reorder.lock);
387 356
388 put_cpu(); 357 put_cpu();
389 358
@@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata)
391} 360}
392EXPORT_SYMBOL(padata_do_serial); 361EXPORT_SYMBOL(padata_do_serial);
393 362
394/* Allocate and initialize the internal cpumask dependend resources. */ 363static int padata_setup_cpumasks(struct parallel_data *pd,
395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 364 const struct cpumask *pcpumask,
396 const struct cpumask *cpumask) 365 const struct cpumask *cbcpumask)
397{ 366{
398 int cpu, cpu_index, num_cpus; 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
399 struct padata_queue *queue; 368 return -ENOMEM;
400 struct parallel_data *pd;
401
402 cpu_index = 0;
403 369
404 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
405 if (!pd) 371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
406 goto err; 372 free_cpumask_var(pd->cpumask.cbcpu);
373 return -ENOMEM;
374 }
407 375
408 pd->queue = alloc_percpu(struct padata_queue); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
409 if (!pd->queue) 377 return 0;
410 goto err_free_pd; 378}
411 379
412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 380static void __padata_list_init(struct padata_list *pd_list)
413 goto err_free_queue; 381{
382 INIT_LIST_HEAD(&pd_list->list);
383 spin_lock_init(&pd_list->lock);
384}
414 385
415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask); 386/* Initialize all percpu queues used by serial workers */
387static void padata_init_squeues(struct parallel_data *pd)
388{
389 int cpu;
390 struct padata_serial_queue *squeue;
416 391
417 for_each_cpu(cpu, pd->cpumask) { 392 for_each_cpu(cpu, pd->cpumask.cbcpu) {
418 queue = per_cpu_ptr(pd->queue, cpu); 393 squeue = per_cpu_ptr(pd->squeue, cpu);
394 squeue->pd = pd;
395 __padata_list_init(&squeue->serial);
396 INIT_WORK(&squeue->work, padata_serial_worker);
397 }
398}
419 399
420 queue->pd = pd; 400/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd)
402{
403 int cpu_index, num_cpus, cpu;
404 struct padata_parallel_queue *pqueue;
421 405
422 queue->cpu_index = cpu_index; 406 cpu_index = 0;
407 for_each_cpu(cpu, pd->cpumask.pcpu) {
408 pqueue = per_cpu_ptr(pd->pqueue, cpu);
409 pqueue->pd = pd;
410 pqueue->cpu_index = cpu_index;
423 cpu_index++; 411 cpu_index++;
424 412
425 INIT_LIST_HEAD(&queue->reorder.list); 413 __padata_list_init(&pqueue->reorder);
426 INIT_LIST_HEAD(&queue->parallel.list); 414 __padata_list_init(&pqueue->parallel);
427 INIT_LIST_HEAD(&queue->serial.list); 415 INIT_WORK(&pqueue->work, padata_parallel_worker);
428 spin_lock_init(&queue->reorder.lock); 416 atomic_set(&pqueue->num_obj, 0);
429 spin_lock_init(&queue->parallel.lock);
430 spin_lock_init(&queue->serial.lock);
431
432 INIT_WORK(&queue->pwork, padata_parallel_worker);
433 INIT_WORK(&queue->swork, padata_serial_worker);
434 atomic_set(&queue->num_obj, 0);
435 } 417 }
436 418
437 num_cpus = cpumask_weight(pd->cpumask); 419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421}
422
423/* Allocate and initialize the internal cpumask dependent resources. */
424static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
425 const struct cpumask *pcpumask,
426 const struct cpumask *cbcpumask)
427{
428 struct parallel_data *pd;
439 429
430 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
431 if (!pd)
432 goto err;
433
434 pd->pqueue = alloc_percpu(struct padata_parallel_queue);
435 if (!pd->pqueue)
436 goto err_free_pd;
437
438 pd->squeue = alloc_percpu(struct padata_serial_queue);
439 if (!pd->squeue)
440 goto err_free_pqueue;
441 if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
442 goto err_free_squeue;
443
444 padata_init_pqueues(pd);
445 padata_init_squeues(pd);
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
441 atomic_set(&pd->seq_nr, -1); 447 atomic_set(&pd->seq_nr, -1);
442 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
@@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
446 452
447 return pd; 453 return pd;
448 454
449err_free_queue: 455err_free_squeue:
450 free_percpu(pd->queue); 456 free_percpu(pd->squeue);
457err_free_pqueue:
458 free_percpu(pd->pqueue);
451err_free_pd: 459err_free_pd:
452 kfree(pd); 460 kfree(pd);
453err: 461err:
@@ -456,8 +464,10 @@ err:
456 464
457static void padata_free_pd(struct parallel_data *pd) 465static void padata_free_pd(struct parallel_data *pd)
458{ 466{
459 free_cpumask_var(pd->cpumask); 467 free_cpumask_var(pd->cpumask.pcpu);
460 free_percpu(pd->queue); 468 free_cpumask_var(pd->cpumask.cbcpu);
469 free_percpu(pd->pqueue);
470 free_percpu(pd->squeue);
461 kfree(pd); 471 kfree(pd);
462} 472}
463 473
@@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd)
465static void padata_flush_queues(struct parallel_data *pd) 475static void padata_flush_queues(struct parallel_data *pd)
466{ 476{
467 int cpu; 477 int cpu;
468 struct padata_queue *queue; 478 struct padata_parallel_queue *pqueue;
479 struct padata_serial_queue *squeue;
469 480
470 for_each_cpu(cpu, pd->cpumask) { 481 for_each_cpu(cpu, pd->cpumask.pcpu) {
471 queue = per_cpu_ptr(pd->queue, cpu); 482 pqueue = per_cpu_ptr(pd->pqueue, cpu);
472 flush_work(&queue->pwork); 483 flush_work(&pqueue->work);
473 } 484 }
474 485
475 del_timer_sync(&pd->timer); 486 del_timer_sync(&pd->timer);
@@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd)
477 if (atomic_read(&pd->reorder_objects)) 488 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd); 489 padata_reorder(pd);
479 490
480 for_each_cpu(cpu, pd->cpumask) { 491 for_each_cpu(cpu, pd->cpumask.cbcpu) {
481 queue = per_cpu_ptr(pd->queue, cpu); 492 squeue = per_cpu_ptr(pd->squeue, cpu);
482 flush_work(&queue->swork); 493 flush_work(&squeue->work);
483 } 494 }
484 495
485 BUG_ON(atomic_read(&pd->refcnt) != 0); 496 BUG_ON(atomic_read(&pd->refcnt) != 0);
486} 497}
487 498
499static void __padata_start(struct padata_instance *pinst)
500{
501 pinst->flags |= PADATA_INIT;
502}
503
504static void __padata_stop(struct padata_instance *pinst)
505{
506 if (!(pinst->flags & PADATA_INIT))
507 return;
508
509 pinst->flags &= ~PADATA_INIT;
510
511 synchronize_rcu();
512
513 get_online_cpus();
514 padata_flush_queues(pinst->pd);
515 put_online_cpus();
516}
517
488/* Replace the internal control structure with a new one. */ 518/* Replace the internal control structure with a new one. */
489static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
490 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
491{ 521{
492 struct parallel_data *pd_old = pinst->pd; 522 struct parallel_data *pd_old = pinst->pd;
523 int notification_mask = 0;
493 524
494 pinst->flags |= PADATA_RESET; 525 pinst->flags |= PADATA_RESET;
495 526
@@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
497 528
498 synchronize_rcu(); 529 synchronize_rcu();
499 530
531 if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
532 notification_mask |= PADATA_CPU_PARALLEL;
533 if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
534 notification_mask |= PADATA_CPU_SERIAL;
535
500 padata_flush_queues(pd_old); 536 padata_flush_queues(pd_old);
501 padata_free_pd(pd_old); 537 padata_free_pd(pd_old);
502 538
539 if (notification_mask)
540 blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
541 notification_mask,
542 &pd_new->cpumask);
543
503 pinst->flags &= ~PADATA_RESET; 544 pinst->flags &= ~PADATA_RESET;
504} 545}
505 546
506/** 547/**
507 * padata_set_cpumask - set the cpumask that padata should use 548 * padata_register_cpumask_notifier - Registers a notifier that will be called
 549 * if either the pcpu or cbcpu cpumask, or both, change.
508 * 550 *
 509 * @pinst: padata instance 551 * @pinst: A pointer to padata instance
510 * @cpumask: the cpumask to use 552 * @nblock: A pointer to notifier block.
511 */ 553 */
512int padata_set_cpumask(struct padata_instance *pinst, 554int padata_register_cpumask_notifier(struct padata_instance *pinst,
513 cpumask_var_t cpumask) 555 struct notifier_block *nblock)
514{ 556{
557 return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
558 nblock);
559}
560EXPORT_SYMBOL(padata_register_cpumask_notifier);
561
562/**
563 * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
564 * registered earlier using padata_register_cpumask_notifier
565 *
566 * @pinst: A pointer to data instance.
 567 * @nblock: A pointer to notifier block.
568 */
569int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
570 struct notifier_block *nblock)
571{
572 return blocking_notifier_chain_unregister(
573 &pinst->cpumask_change_notifier,
574 nblock);
575}
576EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
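A sketch of a client reacting to the new notifications. The callback body is hypothetical; the notifier data is assumed to be the struct padata_cpumask introduced alongside this patch, and @val carries the PADATA_CPU_PARALLEL/PADATA_CPU_SERIAL bits set up in padata_replace() above.

static int my_cpumask_notify(struct notifier_block *self,
			     unsigned long val, void *data)
{
	struct padata_cpumask *new_masks = data;	/* assumption: see above */

	if (val & PADATA_CPU_PARALLEL)
		pr_debug("padata: %u parallel cpus now\n",
			 cpumask_weight(new_masks->pcpu));
	if (val & PADATA_CPU_SERIAL)
		pr_debug("padata: %u serial cpus now\n",
			 cpumask_weight(new_masks->cbcpu));

	return NOTIFY_OK;
}

static struct notifier_block my_nblock = {
	.notifier_call = my_cpumask_notify,
};

/* at init time: padata_register_cpumask_notifier(pinst, &my_nblock); */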
577
578
579/* If cpumask contains no active cpu, we mark the instance as invalid. */
580static bool padata_validate_cpumask(struct padata_instance *pinst,
581 const struct cpumask *cpumask)
582{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
584 pinst->flags |= PADATA_INVALID;
585 return false;
586 }
587
588 pinst->flags &= ~PADATA_INVALID;
589 return true;
590}
591
592static int __padata_set_cpumasks(struct padata_instance *pinst,
593 cpumask_var_t pcpumask,
594 cpumask_var_t cbcpumask)
595{
596 int valid;
515 struct parallel_data *pd; 597 struct parallel_data *pd;
516 int err = 0; 598
599 valid = padata_validate_cpumask(pinst, pcpumask);
600 if (!valid) {
601 __padata_stop(pinst);
602 goto out_replace;
603 }
604
605 valid = padata_validate_cpumask(pinst, cbcpumask);
606 if (!valid)
607 __padata_stop(pinst);
608
609out_replace:
610 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
611 if (!pd)
612 return -ENOMEM;
613
614 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
615 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
616
617 padata_replace(pinst, pd);
618
619 if (valid)
620 __padata_start(pinst);
621
622 return 0;
623}
624
625/**
626 * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
627 * one is used by parallel workers and the second one
 628 * by the workers doing serialization.
629 *
630 * @pinst: padata instance
631 * @pcpumask: the cpumask to use for parallel workers
 632 * @cbcpumask: the cpumask to use for serial workers
633 */
634int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
635 cpumask_var_t cbcpumask)
636{
637 int err;
517 638
518 mutex_lock(&pinst->lock); 639 mutex_lock(&pinst->lock);
640 get_online_cpus();
519 641
642 err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
643
644 put_online_cpus();
645 mutex_unlock(&pinst->lock);
646
647 return err;
648
649}
650EXPORT_SYMBOL(padata_set_cpumasks);
651
652/**
 653 * padata_set_cpumask: Sets the cpumask specified by @cpumask_type to the
 654 * value of @cpumask.
655 *
656 * @pinst: padata instance
 657 * @cpumask_type: either PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL,
 658 * selecting the serial or parallel cpumask respectively.
659 * @cpumask: the cpumask to use
660 */
661int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
662 cpumask_var_t cpumask)
663{
664 struct cpumask *serial_mask, *parallel_mask;
665 int err = -EINVAL;
666
667 mutex_lock(&pinst->lock);
520 get_online_cpus(); 668 get_online_cpus();
521 669
522 pd = padata_alloc_pd(pinst, cpumask); 670 switch (cpumask_type) {
523 if (!pd) { 671 case PADATA_CPU_PARALLEL:
524 err = -ENOMEM; 672 serial_mask = pinst->cpumask.cbcpu;
525 goto out; 673 parallel_mask = cpumask;
674 break;
675 case PADATA_CPU_SERIAL:
676 parallel_mask = pinst->cpumask.pcpu;
677 serial_mask = cpumask;
678 break;
679 default:
680 goto out;
526 } 681 }
527 682
528 cpumask_copy(pinst->cpumask, cpumask); 683 err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
529
530 padata_replace(pinst, pd);
531 684
532out: 685out:
533 put_online_cpus(); 686 put_online_cpus();
534
535 mutex_unlock(&pinst->lock); 687 mutex_unlock(&pinst->lock);
536 688
537 return err; 689 return err;
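A short sketch of both new setters; the my_* wrappers are hypothetical, and the masks are assumed to be caller-allocated cpumask_var_t values (both functions copy them, so the caller may free them afterwards).

/* Update only the parallel cpumask; PADATA_CPU_SERIAL targets cbcpu. */
static int my_set_parallel_mask(struct padata_instance *pinst,
				cpumask_var_t new_mask)
{
	return padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, new_mask);
}

/* Replace both cpumasks in one step. */
static int my_set_both_masks(struct padata_instance *pinst,
			     cpumask_var_t pcpu, cpumask_var_t cbcpu)
{
	return padata_set_cpumasks(pinst, pcpu, cbcpu);
}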
@@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
543 struct parallel_data *pd; 695 struct parallel_data *pd;
544 696
545 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
546 pd = padata_alloc_pd(pinst, pinst->cpumask); 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu);
547 if (!pd) 700 if (!pd)
548 return -ENOMEM; 701 return -ENOMEM;
549 702
550 padata_replace(pinst, pd); 703 padata_replace(pinst, pd);
704
705 if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
706 padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
707 __padata_start(pinst);
551 } 708 }
552 709
553 return 0; 710 return 0;
554} 711}
555 712
556/** 713 /**
 557 * padata_add_cpu - add a cpu to the padata cpumask 714 * padata_add_cpu - add a cpu to one or both (parallel and serial)
715 * padata cpumasks.
558 * 716 *
559 * @pinst: padata instance 717 * @pinst: padata instance
560 * @cpu: cpu to add 718 * @cpu: cpu to add
 719 * @mask: bitmask of flags specifying to which cpumask @cpu should be added.
720 * The @mask may be any combination of the following flags:
721 * PADATA_CPU_SERIAL - serial cpumask
722 * PADATA_CPU_PARALLEL - parallel cpumask
561 */ 723 */
562int padata_add_cpu(struct padata_instance *pinst, int cpu) 724
725int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
563{ 726{
564 int err; 727 int err;
565 728
729 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
730 return -EINVAL;
731
566 mutex_lock(&pinst->lock); 732 mutex_lock(&pinst->lock);
567 733
568 get_online_cpus(); 734 get_online_cpus();
569 cpumask_set_cpu(cpu, pinst->cpumask); 735 if (mask & PADATA_CPU_SERIAL)
736 cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
737 if (mask & PADATA_CPU_PARALLEL)
738 cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
739
570 err = __padata_add_cpu(pinst, cpu); 740 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus(); 741 put_online_cpus();
572 742
@@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
578 748
579static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) 749static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
580{ 750{
581 struct parallel_data *pd; 751 struct parallel_data *pd = NULL;
582 752
583 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 753 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
584 pd = padata_alloc_pd(pinst, pinst->cpumask); 754
755 if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
756 !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
757 __padata_stop(pinst);
758
759 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
760 pinst->cpumask.cbcpu);
585 if (!pd) 761 if (!pd)
586 return -ENOMEM; 762 return -ENOMEM;
587 763
@@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
591 return 0; 767 return 0;
592} 768}
593 769
594/** 770 /**
 595 * padata_remove_cpu - remove a cpu from the padata cpumask 771 * padata_remove_cpu - remove a cpu from one or both (serial and parallel)
772 * padata cpumasks.
596 * 773 *
597 * @pinst: padata instance 774 * @pinst: padata instance
598 * @cpu: cpu to remove 775 * @cpu: cpu to remove
776 * @mask: bitmask specifying from which cpumask @cpu should be removed
777 * The @mask may be any combination of the following flags:
778 * PADATA_CPU_SERIAL - serial cpumask
779 * PADATA_CPU_PARALLEL - parallel cpumask
599 */ 780 */
600int padata_remove_cpu(struct padata_instance *pinst, int cpu) 781int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
601{ 782{
602 int err; 783 int err;
603 784
785 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
786 return -EINVAL;
787
604 mutex_lock(&pinst->lock); 788 mutex_lock(&pinst->lock);
605 789
606 get_online_cpus(); 790 get_online_cpus();
607 cpumask_clear_cpu(cpu, pinst->cpumask); 791 if (mask & PADATA_CPU_SERIAL)
792 cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
793 if (mask & PADATA_CPU_PARALLEL)
794 cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
795
608 err = __padata_remove_cpu(pinst, cpu); 796 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus(); 797 put_online_cpus();
610 798
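Sketch of the extended hotplug-style helpers; the wrapper names are hypothetical, while the mask flags and the -EINVAL rule come from the hunks above.

/* Add a CPU to both cpumasks of the instance. */
static int my_grow(struct padata_instance *pinst, int cpu)
{
	return padata_add_cpu(pinst, cpu,
			      PADATA_CPU_PARALLEL | PADATA_CPU_SERIAL);
}

/* Drop a CPU from the parallel cpumask only; passing neither flag
 * would return -EINVAL. */
static int my_shrink_parallel(struct padata_instance *pinst, int cpu)
{
	return padata_remove_cpu(pinst, cpu, PADATA_CPU_PARALLEL);
}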
@@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu);
619 * 807 *
620 * @pinst: padata instance to start 808 * @pinst: padata instance to start
621 */ 809 */
622void padata_start(struct padata_instance *pinst) 810int padata_start(struct padata_instance *pinst)
623{ 811{
812 int err = 0;
813
624 mutex_lock(&pinst->lock); 814 mutex_lock(&pinst->lock);
625 pinst->flags |= PADATA_INIT; 815
816 if (pinst->flags & PADATA_INVALID)
 817 err = -EINVAL;
818
819 __padata_start(pinst);
820
626 mutex_unlock(&pinst->lock); 821 mutex_unlock(&pinst->lock);
822
823 return err;
627} 824}
628EXPORT_SYMBOL(padata_start); 825EXPORT_SYMBOL(padata_start);
629 826
@@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start);
635void padata_stop(struct padata_instance *pinst) 832void padata_stop(struct padata_instance *pinst)
636{ 833{
637 mutex_lock(&pinst->lock); 834 mutex_lock(&pinst->lock);
638 pinst->flags &= ~PADATA_INIT; 835 __padata_stop(pinst);
639 mutex_unlock(&pinst->lock); 836 mutex_unlock(&pinst->lock);
640} 837}
641EXPORT_SYMBOL(padata_stop); 838EXPORT_SYMBOL(padata_stop);
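Since padata_start() now reports -EINVAL when a cpumask was left without an active CPU (PADATA_INVALID), callers are expected to check its return value; a minimal, hypothetical wrapper:

static int my_enable(struct padata_instance *pinst)
{
	int err = padata_start(pinst);

	if (err)
		pr_err("padata: instance has an invalid cpumask\n");

	return err;
}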
642 839
643#ifdef CONFIG_HOTPLUG_CPU 840#ifdef CONFIG_HOTPLUG_CPU
841
842static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
843{
844 return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
845 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
846}
847
848
644static int padata_cpu_callback(struct notifier_block *nfb, 849static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
646{ 851{
@@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
653 switch (action) { 858 switch (action) {
654 case CPU_ONLINE: 859 case CPU_ONLINE:
655 case CPU_ONLINE_FROZEN: 860 case CPU_ONLINE_FROZEN:
656 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 861 if (!pinst_has_cpu(pinst, cpu))
657 break; 862 break;
658 mutex_lock(&pinst->lock); 863 mutex_lock(&pinst->lock);
659 err = __padata_add_cpu(pinst, cpu); 864 err = __padata_add_cpu(pinst, cpu);
@@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
664 869
665 case CPU_DOWN_PREPARE: 870 case CPU_DOWN_PREPARE:
666 case CPU_DOWN_PREPARE_FROZEN: 871 case CPU_DOWN_PREPARE_FROZEN:
667 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 872 if (!pinst_has_cpu(pinst, cpu))
668 break; 873 break;
669 mutex_lock(&pinst->lock); 874 mutex_lock(&pinst->lock);
670 err = __padata_remove_cpu(pinst, cpu); 875 err = __padata_remove_cpu(pinst, cpu);
@@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
675 880
676 case CPU_UP_CANCELED: 881 case CPU_UP_CANCELED:
677 case CPU_UP_CANCELED_FROZEN: 882 case CPU_UP_CANCELED_FROZEN:
678 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 883 if (!pinst_has_cpu(pinst, cpu))
679 break; 884 break;
680 mutex_lock(&pinst->lock); 885 mutex_lock(&pinst->lock);
681 __padata_remove_cpu(pinst, cpu); 886 __padata_remove_cpu(pinst, cpu);
@@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
683 888
684 case CPU_DOWN_FAILED: 889 case CPU_DOWN_FAILED:
685 case CPU_DOWN_FAILED_FROZEN: 890 case CPU_DOWN_FAILED_FROZEN:
686 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 891 if (!pinst_has_cpu(pinst, cpu))
687 break; 892 break;
688 mutex_lock(&pinst->lock); 893 mutex_lock(&pinst->lock);
689 __padata_add_cpu(pinst, cpu); 894 __padata_add_cpu(pinst, cpu);
@@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb,
694} 899}
695#endif 900#endif
696 901
902static void __padata_free(struct padata_instance *pinst)
903{
904#ifdef CONFIG_HOTPLUG_CPU
905 unregister_hotcpu_notifier(&pinst->cpu_notifier);
906#endif
907
908 padata_stop(pinst);
909 padata_free_pd(pinst->pd);
910 free_cpumask_var(pinst->cpumask.pcpu);
911 free_cpumask_var(pinst->cpumask.cbcpu);
912 kfree(pinst);
913}
914
915#define kobj2pinst(_kobj) \
916 container_of(_kobj, struct padata_instance, kobj)
917#define attr2pentry(_attr) \
918 container_of(_attr, struct padata_sysfs_entry, attr)
919
920static void padata_sysfs_release(struct kobject *kobj)
921{
922 struct padata_instance *pinst = kobj2pinst(kobj);
923 __padata_free(pinst);
924}
925
926struct padata_sysfs_entry {
927 struct attribute attr;
928 ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
929 ssize_t (*store)(struct padata_instance *, struct attribute *,
930 const char *, size_t);
931};
932
933static ssize_t show_cpumask(struct padata_instance *pinst,
934 struct attribute *attr, char *buf)
935{
936 struct cpumask *cpumask;
937 ssize_t len;
938
939 mutex_lock(&pinst->lock);
940 if (!strcmp(attr->name, "serial_cpumask"))
941 cpumask = pinst->cpumask.cbcpu;
942 else
943 cpumask = pinst->cpumask.pcpu;
944
945 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
946 nr_cpu_ids);
947 if (PAGE_SIZE - len < 2)
948 len = -EINVAL;
949 else
950 len += sprintf(buf + len, "\n");
951
952 mutex_unlock(&pinst->lock);
953 return len;
954}
955
956static ssize_t store_cpumask(struct padata_instance *pinst,
957 struct attribute *attr,
958 const char *buf, size_t count)
959{
960 cpumask_var_t new_cpumask;
961 ssize_t ret;
962 int mask_type;
963
964 if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
965 return -ENOMEM;
966
967 ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
968 nr_cpumask_bits);
969 if (ret < 0)
970 goto out;
971
972 mask_type = !strcmp(attr->name, "serial_cpumask") ?
973 PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
974 ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
975 if (!ret)
976 ret = count;
977
978out:
979 free_cpumask_var(new_cpumask);
980 return ret;
981}
982
983#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
984 static struct padata_sysfs_entry _name##_attr = \
985 __ATTR(_name, 0644, _show_name, _store_name)
986#define PADATA_ATTR_RO(_name, _show_name) \
987 static struct padata_sysfs_entry _name##_attr = \
988 __ATTR(_name, 0400, _show_name, NULL)
989
990PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
991PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
992
993/*
994 * Padata sysfs provides the following objects:
995 * serial_cpumask [RW] - cpumask for serial workers
996 * parallel_cpumask [RW] - cpumask for parallel workers
997 */
998static struct attribute *padata_default_attrs[] = {
999 &serial_cpumask_attr.attr,
1000 &parallel_cpumask_attr.attr,
1001 NULL,
1002};
1003
1004static ssize_t padata_sysfs_show(struct kobject *kobj,
1005 struct attribute *attr, char *buf)
1006{
1007 struct padata_instance *pinst;
1008 struct padata_sysfs_entry *pentry;
1009 ssize_t ret = -EIO;
1010
1011 pinst = kobj2pinst(kobj);
1012 pentry = attr2pentry(attr);
1013 if (pentry->show)
1014 ret = pentry->show(pinst, attr, buf);
1015
1016 return ret;
1017}
1018
1019static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
1020 const char *buf, size_t count)
1021{
1022 struct padata_instance *pinst;
1023 struct padata_sysfs_entry *pentry;
1024 ssize_t ret = -EIO;
1025
1026 pinst = kobj2pinst(kobj);
1027 pentry = attr2pentry(attr);
1028 if (pentry->store)
1029 ret = pentry->store(pinst, attr, buf, count);
1030
1031 return ret;
1032}
1033
1034static const struct sysfs_ops padata_sysfs_ops = {
1035 .show = padata_sysfs_show,
1036 .store = padata_sysfs_store,
1037};
1038
1039static struct kobj_type padata_attr_type = {
1040 .sysfs_ops = &padata_sysfs_ops,
1041 .default_attrs = padata_default_attrs,
1042 .release = padata_sysfs_release,
1043};
1044
697/** 1045/**
698 * padata_alloc - allocate and initialize a padata instance 1046 * padata_alloc_possible - Allocate and initialize padata instance.
1047 * Use the cpu_possible_mask for serial and
1048 * parallel workers.
699 * 1049 *
700 * @cpumask: cpumask that padata uses for parallelization
701 * @wq: workqueue to use for the allocated padata instance 1050 * @wq: workqueue to use for the allocated padata instance
702 */ 1051 */
703struct padata_instance *padata_alloc(const struct cpumask *cpumask, 1052struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
704 struct workqueue_struct *wq) 1053{
1054 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1055}
1056EXPORT_SYMBOL(padata_alloc_possible);
1057
1058/**
1059 * padata_alloc - allocate and initialize a padata instance and specify
1060 * cpumasks for serial and parallel workers.
1061 *
1062 * @wq: workqueue to use for the allocated padata instance
1063 * @pcpumask: cpumask that will be used for padata parallelization
1064 * @cbcpumask: cpumask that will be used for padata serialization
1065 */
1066struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1067 const struct cpumask *pcpumask,
1068 const struct cpumask *cbcpumask)
705{ 1069{
706 struct padata_instance *pinst; 1070 struct padata_instance *pinst;
707 struct parallel_data *pd; 1071 struct parallel_data *pd = NULL;
708 1072
709 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); 1073 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
710 if (!pinst) 1074 if (!pinst)
711 goto err; 1075 goto err;
712 1076
713 get_online_cpus(); 1077 get_online_cpus();
714 1078 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
715 pd = padata_alloc_pd(pinst, cpumask);
716 if (!pd)
717 goto err_free_inst; 1079 goto err_free_inst;
1080 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
1081 free_cpumask_var(pinst->cpumask.pcpu);
1082 goto err_free_inst;
1083 }
1084 if (!padata_validate_cpumask(pinst, pcpumask) ||
1085 !padata_validate_cpumask(pinst, cbcpumask))
1086 goto err_free_masks;
718 1087
719 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) 1088 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
720 goto err_free_pd; 1089 if (!pd)
1090 goto err_free_masks;
721 1091
722 rcu_assign_pointer(pinst->pd, pd); 1092 rcu_assign_pointer(pinst->pd, pd);
723 1093
724 pinst->wq = wq; 1094 pinst->wq = wq;
725 1095
726 cpumask_copy(pinst->cpumask, cpumask); 1096 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
1097 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
727 1098
728 pinst->flags = 0; 1099 pinst->flags = 0;
729 1100
@@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
735 1106
736 put_online_cpus(); 1107 put_online_cpus();
737 1108
1109 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1110 kobject_init(&pinst->kobj, &padata_attr_type);
738 mutex_init(&pinst->lock); 1111 mutex_init(&pinst->lock);
739 1112
740 return pinst; 1113 return pinst;
741 1114
742err_free_pd: 1115err_free_masks:
743 padata_free_pd(pd); 1116 free_cpumask_var(pinst->cpumask.pcpu);
1117 free_cpumask_var(pinst->cpumask.cbcpu);
744err_free_inst: 1118err_free_inst:
745 kfree(pinst); 1119 kfree(pinst);
746 put_online_cpus(); 1120 put_online_cpus();
@@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc);
756 */ 1130 */
757void padata_free(struct padata_instance *pinst) 1131void padata_free(struct padata_instance *pinst)
758{ 1132{
759 padata_stop(pinst); 1133 kobject_put(&pinst->kobj);
760
761 synchronize_rcu();
762
763#ifdef CONFIG_HOTPLUG_CPU
764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
770 padata_free_pd(pinst->pd);
771 free_cpumask_var(pinst->cpumask);
772 kfree(pinst);
773} 1134}
774EXPORT_SYMBOL(padata_free); 1135EXPORT_SYMBOL(padata_free);
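Putting the reworked allocation, start/stop and free paths together, a hedged end-to-end sketch; my_setup()/my_teardown() and the error-handling style are illustrative, and the padata_* calls match the signatures in this file.

static struct padata_instance *my_setup(struct workqueue_struct *my_wq)
{
	struct padata_instance *pinst;

	/* both worker types start out on cpu_possible_mask */
	pinst = padata_alloc_possible(my_wq);
	if (!pinst)
		return NULL;

	if (padata_start(pinst)) {	/* may fail with -EINVAL now */
		padata_free(pinst);	/* drops the sysfs kobject */
		return NULL;
	}

	return pinst;
}

static void my_teardown(struct padata_instance *pinst)
{
	padata_stop(pinst);
	padata_free(pinst);	/* final kobject_put(); the release
				 * callback performs the actual teardown */
}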
diff --git a/kernel/panic.c b/kernel/panic.c
index 3b16cd93fa7d..4c13b1a88ebb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,9 @@
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h> 25#include <linux/dmi.h>
26 26
27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18
29
27int panic_on_oops; 30int panic_on_oops;
28static unsigned long tainted_mask; 31static unsigned long tainted_mask;
29static int pause_on_oops; 32static int pause_on_oops;
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 39
37EXPORT_SYMBOL(panic_notifier_list); 40EXPORT_SYMBOL(panic_notifier_list);
38 41
39/* Returns how long it waited in ms */ 42static long no_blink(int state)
40long (*panic_blink)(long time);
41EXPORT_SYMBOL(panic_blink);
42
43static void panic_blink_one_second(void)
44{ 43{
45 static long i = 0, end; 44 return 0;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67} 45}
68 46
47/* Returns how long it waited in ms */
48long (*panic_blink)(int state);
49EXPORT_SYMBOL(panic_blink);
50
69/** 51/**
70 * panic - halt the system 52 * panic - halt the system
71 * @fmt: The text string to print 53 * @fmt: The text string to print
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...)
78{ 60{
79 static char buf[1024]; 61 static char buf[1024];
80 va_list args; 62 va_list args;
81 long i; 63 long i, i_next = 0;
64 int state = 0;
82 65
83 /* 66 /*
84 * It's possible to come here directly from a panic-assertion and 67 * It's possible to come here directly from a panic-assertion and
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...)
117 100
118 bust_spinlocks(0); 101 bust_spinlocks(0);
119 102
103 if (!panic_blink)
104 panic_blink = no_blink;
105
120 if (panic_timeout > 0) { 106 if (panic_timeout > 0) {
121 /* 107 /*
122 * Delay timeout seconds before rebooting the machine. 108 * Delay timeout seconds before rebooting the machine.
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...)
124 */ 110 */
125 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 111 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
126 112
127 for (i = 0; i < panic_timeout; i++) { 113 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
128 touch_nmi_watchdog(); 114 touch_nmi_watchdog();
129 panic_blink_one_second(); 115 if (i >= i_next) {
116 i += panic_blink(state ^= 1);
117 i_next = i + 3600 / PANIC_BLINK_SPD;
118 }
119 mdelay(PANIC_TIMER_STEP);
130 } 120 }
131 /* 121 /*
132 * This will not be a clean reboot, with everything 122 * This will not be a clean reboot, with everything
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...)
152 } 142 }
153#endif 143#endif
154 local_irq_enable(); 144 local_irq_enable();
155 while (1) { 145 for (i = 0; ; i += PANIC_TIMER_STEP) {
156 touch_softlockup_watchdog(); 146 touch_softlockup_watchdog();
157 panic_blink_one_second(); 147 if (i >= i_next) {
148 i += panic_blink(state ^= 1);
149 i_next = i + 3600 / PANIC_BLINK_SPD;
150 }
151 mdelay(PANIC_TIMER_STEP);
158 } 152 }
159} 153}
160 154
@@ -344,7 +338,7 @@ static int init_oops_id(void)
344} 338}
345late_initcall(init_oops_id); 339late_initcall(init_oops_id);
346 340
347static void print_oops_end_marker(void) 341void print_oops_end_marker(void)
348{ 342{
349 init_oops_id(); 343 init_oops_id();
350 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 344 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
diff --git a/kernel/params.c b/kernel/params.c
index 0b30ecd53a52..08107d181758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,6 +31,42 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock);
36
37/* This just allows us to keep track of which parameters are kmalloced. */
38struct kmalloced_param {
39 struct list_head list;
40 char val[];
41};
42static LIST_HEAD(kmalloced_params);
43
44static void *kmalloc_parameter(unsigned int size)
45{
46 struct kmalloced_param *p;
47
48 p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
49 if (!p)
50 return NULL;
51
52 list_add(&p->list, &kmalloced_params);
53 return p->val;
54}
55
56/* Does nothing if parameter wasn't kmalloced above. */
57static void maybe_kfree_parameter(void *param)
58{
59 struct kmalloced_param *p;
60
61 list_for_each_entry(p, &kmalloced_params, list) {
62 if (p->val == param) {
63 list_del(&p->list);
64 kfree(p);
65 break;
66 }
67 }
68}
69
34static inline char dash2underscore(char c) 70static inline char dash2underscore(char c)
35{ 71{
36 if (c == '-') 72 if (c == '-')
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname)
49 85
50static int parse_one(char *param, 86static int parse_one(char *param,
51 char *val, 87 char *val,
52 struct kernel_param *params, 88 const struct kernel_param *params,
53 unsigned num_params, 89 unsigned num_params,
54 int (*handle_unknown)(char *param, char *val)) 90 int (*handle_unknown)(char *param, char *val))
55{ 91{
56 unsigned int i; 92 unsigned int i;
93 int err;
57 94
58 /* Find parameter */ 95 /* Find parameter */
59 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
60 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
 98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL;
61 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
62 params[i].set); 102 params[i].ops->set);
63 return params[i].set(val, &params[i]); 103 mutex_lock(&param_lock);
104 err = params[i].ops->set(val, &params[i]);
105 mutex_unlock(&param_lock);
106 return err;
64 } 107 }
65 } 108 }
66 109
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val)
128/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 171/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
129int parse_args(const char *name, 172int parse_args(const char *name,
130 char *args, 173 char *args,
131 struct kernel_param *params, 174 const struct kernel_param *params,
132 unsigned num, 175 unsigned num,
133 int (*unknown)(char *param, char *val)) 176 int (*unknown)(char *param, char *val))
134{ 177{
@@ -176,22 +219,29 @@ int parse_args(const char *name,
176 219
177/* Lazy bastard, eh? */ 220/* Lazy bastard, eh? */
178#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 221#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
179 int param_set_##name(const char *val, struct kernel_param *kp) \ 222 int param_set_##name(const char *val, const struct kernel_param *kp) \
180 { \ 223 { \
181 tmptype l; \ 224 tmptype l; \
182 int ret; \ 225 int ret; \
183 \ 226 \
184 if (!val) return -EINVAL; \
185 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
186 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret == -EINVAL || ((type)l != l)) \
187 return -EINVAL; \ 229 return -EINVAL; \
188 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
189 return 0; \ 231 return 0; \
190 } \ 232 } \
191 int param_get_##name(char *buffer, struct kernel_param *kp) \ 233 int param_get_##name(char *buffer, const struct kernel_param *kp) \
192 { \ 234 { \
193 return sprintf(buffer, format, *((type *)kp->arg)); \ 235 return sprintf(buffer, format, *((type *)kp->arg)); \
194 } 236 } \
237 struct kernel_param_ops param_ops_##name = { \
238 .set = param_set_##name, \
239 .get = param_get_##name, \
240 }; \
241 EXPORT_SYMBOL(param_set_##name); \
242 EXPORT_SYMBOL(param_get_##name); \
243 EXPORT_SYMBOL(param_ops_##name)
244
195 245
196STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 246STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
197STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 247STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
201STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 251STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
202STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 252STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
203 253
204int param_set_charp(const char *val, struct kernel_param *kp) 254int param_set_charp(const char *val, const struct kernel_param *kp)
205{ 255{
206 if (!val) {
207 printk(KERN_ERR "%s: string parameter expected\n",
208 kp->name);
209 return -EINVAL;
210 }
211
212 if (strlen(val) > 1024) { 256 if (strlen(val) > 1024) {
213 printk(KERN_ERR "%s: string parameter too long\n", 257 printk(KERN_ERR "%s: string parameter too long\n",
214 kp->name); 258 kp->name);
215 return -ENOSPC; 259 return -ENOSPC;
216 } 260 }
217 261
218 /* This is a hack. We can't need to strdup in early boot, and we 262 maybe_kfree_parameter(*(char **)kp->arg);
263
264 /* This is a hack. We can't kmalloc in early boot, and we
219 * don't need to; this mangled commandline is preserved. */ 265 * don't need to; this mangled commandline is preserved. */
220 if (slab_is_available()) { 266 if (slab_is_available()) {
221 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 267 *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
222 if (!*(char **)kp->arg) 268 if (!*(char **)kp->arg)
223 return -ENOMEM; 269 return -ENOMEM;
270 strcpy(*(char **)kp->arg, val);
224 } else 271 } else
225 *(const char **)kp->arg = val; 272 *(const char **)kp->arg = val;
226 273
227 return 0; 274 return 0;
228} 275}
276EXPORT_SYMBOL(param_set_charp);
229 277
230int param_get_charp(char *buffer, struct kernel_param *kp) 278int param_get_charp(char *buffer, const struct kernel_param *kp)
231{ 279{
232 return sprintf(buffer, "%s", *((char **)kp->arg)); 280 return sprintf(buffer, "%s", *((char **)kp->arg));
233} 281}
282EXPORT_SYMBOL(param_get_charp);
283
284static void param_free_charp(void *arg)
285{
286 maybe_kfree_parameter(*((char **)arg));
287}
288
289struct kernel_param_ops param_ops_charp = {
290 .set = param_set_charp,
291 .get = param_get_charp,
292 .free = param_free_charp,
293};
294EXPORT_SYMBOL(param_ops_charp);
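A sketch of the new ops-based pattern for a custom parameter; my_value, my_set/my_get and the module_param_cb() hookup comment are assumptions, while the const-qualified signatures and the kernel_param_ops layout come from this patch.

static int my_value;

static int my_set(const char *val, const struct kernel_param *kp)
{
	/* a missing "=value" is now rejected centrally in parse_one()
	 * for non-bool parameters, so val can be parsed directly here */
	return param_set_int(val, kp);
}

static int my_get(char *buffer, const struct kernel_param *kp)
{
	return param_get_int(buffer, kp);
}

static struct kernel_param_ops my_param_ops = {
	.set = my_set,
	.get = my_get,
	/* .free is only needed when .set allocates memory */
};

/* assumed hookup, via the helper added alongside this rework:
 * module_param_cb(my_value, &my_param_ops, &my_value, 0644); */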
234 295
235/* Actually could be a bool or an int, for historical reasons. */ 296/* Actually could be a bool or an int, for historical reasons. */
236int param_set_bool(const char *val, struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
237{ 298{
238 bool v; 299 bool v;
239 300
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp)
258 *(int *)kp->arg = v; 319 *(int *)kp->arg = v;
259 return 0; 320 return 0;
260} 321}
322EXPORT_SYMBOL(param_set_bool);
261 323
262int param_get_bool(char *buffer, struct kernel_param *kp) 324int param_get_bool(char *buffer, const struct kernel_param *kp)
263{ 325{
264 bool val; 326 bool val;
265 if (kp->flags & KPARAM_ISBOOL) 327 if (kp->flags & KPARAM_ISBOOL)
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
270 /* Y and N chosen as being relatively non-coder friendly */ 332 /* Y and N chosen as being relatively non-coder friendly */
271 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 333 return sprintf(buffer, "%c", val ? 'Y' : 'N');
272} 334}
335EXPORT_SYMBOL(param_get_bool);
336
337struct kernel_param_ops param_ops_bool = {
338 .set = param_set_bool,
339 .get = param_get_bool,
340};
341EXPORT_SYMBOL(param_ops_bool);
273 342
274/* This one must be bool. */ 343/* This one must be bool. */
275int param_set_invbool(const char *val, struct kernel_param *kp) 344int param_set_invbool(const char *val, const struct kernel_param *kp)
276{ 345{
277 int ret; 346 int ret;
278 bool boolval; 347 bool boolval;
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
285 *(bool *)kp->arg = !boolval; 354 *(bool *)kp->arg = !boolval;
286 return ret; 355 return ret;
287} 356}
357EXPORT_SYMBOL(param_set_invbool);
288 358
289int param_get_invbool(char *buffer, struct kernel_param *kp) 359int param_get_invbool(char *buffer, const struct kernel_param *kp)
290{ 360{
291 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); 361 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
292} 362}
363EXPORT_SYMBOL(param_get_invbool);
364
365struct kernel_param_ops param_ops_invbool = {
366 .set = param_set_invbool,
367 .get = param_get_invbool,
368};
369EXPORT_SYMBOL(param_ops_invbool);
293 370
294/* We break the rule and mangle the string. */ 371/* We break the rule and mangle the string. */
295static int param_array(const char *name, 372static int param_array(const char *name,
296 const char *val, 373 const char *val,
297 unsigned int min, unsigned int max, 374 unsigned int min, unsigned int max,
298 void *elem, int elemsize, 375 void *elem, int elemsize,
299 int (*set)(const char *, struct kernel_param *kp), 376 int (*set)(const char *, const struct kernel_param *kp),
300 u16 flags, 377 u16 flags,
301 unsigned int *num) 378 unsigned int *num)
302{ 379{
@@ -309,12 +386,6 @@ static int param_array(const char *name,
309 kp.arg = elem; 386 kp.arg = elem;
310 kp.flags = flags; 387 kp.flags = flags;
311 388
312 /* No equals sign? */
313 if (!val) {
314 printk(KERN_ERR "%s: expects arguments\n", name);
315 return -EINVAL;
316 }
317
318 *num = 0; 389 *num = 0;
319 /* We expect a comma-separated list of values. */ 390 /* We expect a comma-separated list of values. */
320 do { 391 do {
@@ -330,6 +401,7 @@ static int param_array(const char *name,
330 /* nul-terminate and parse */ 401 /* nul-terminate and parse */
331 save = val[len]; 402 save = val[len];
332 ((char *)val)[len] = '\0'; 403 ((char *)val)[len] = '\0';
404 BUG_ON(!mutex_is_locked(&param_lock));
333 ret = set(val, &kp); 405 ret = set(val, &kp);
334 406
335 if (ret != 0) 407 if (ret != 0)
@@ -347,17 +419,17 @@ static int param_array(const char *name,
347 return 0; 419 return 0;
348} 420}
349 421
350int param_array_set(const char *val, struct kernel_param *kp) 422static int param_array_set(const char *val, const struct kernel_param *kp)
351{ 423{
352 const struct kparam_array *arr = kp->arr; 424 const struct kparam_array *arr = kp->arr;
353 unsigned int temp_num; 425 unsigned int temp_num;
354 426
355 return param_array(kp->name, val, 1, arr->max, arr->elem, 427 return param_array(kp->name, val, 1, arr->max, arr->elem,
356 arr->elemsize, arr->set, kp->flags, 428 arr->elemsize, arr->ops->set, kp->flags,
357 arr->num ?: &temp_num); 429 arr->num ?: &temp_num);
358} 430}
359 431
360int param_array_get(char *buffer, struct kernel_param *kp) 432static int param_array_get(char *buffer, const struct kernel_param *kp)
361{ 433{
362 int i, off, ret; 434 int i, off, ret;
363 const struct kparam_array *arr = kp->arr; 435 const struct kparam_array *arr = kp->arr;
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp)
368 if (i) 440 if (i)
369 buffer[off++] = ','; 441 buffer[off++] = ',';
370 p.arg = arr->elem + arr->elemsize * i; 442 p.arg = arr->elem + arr->elemsize * i;
371 ret = arr->get(buffer + off, &p); 443 BUG_ON(!mutex_is_locked(&param_lock));
444 ret = arr->ops->get(buffer + off, &p);
372 if (ret < 0) 445 if (ret < 0)
373 return ret; 446 return ret;
374 off += ret; 447 off += ret;
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp)
377 return off; 450 return off;
378} 451}
379 452
380int param_set_copystring(const char *val, struct kernel_param *kp) 453static void param_array_free(void *arg)
454{
455 unsigned int i;
456 const struct kparam_array *arr = arg;
457
458 if (arr->ops->free)
459 for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
460 arr->ops->free(arr->elem + arr->elemsize * i);
461}
462
463struct kernel_param_ops param_array_ops = {
464 .set = param_array_set,
465 .get = param_array_get,
466 .free = param_array_free,
467};
468EXPORT_SYMBOL(param_array_ops);
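One practical consequence of the new .free hook: charp array elements obtained through kmalloc_parameter() are now released when the owning module's parameters are destroyed. A hypothetical array parameter that benefits from this:

static char *my_names[8];
static int my_names_count;
module_param_array(my_names, charp, &my_names_count, 0444);
MODULE_PARM_DESC(my_names,
		 "up to 8 names; kmalloced copies are freed via param_array_free()");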
469
470int param_set_copystring(const char *val, const struct kernel_param *kp)
381{ 471{
382 const struct kparam_string *kps = kp->str; 472 const struct kparam_string *kps = kp->str;
383 473
384 if (!val) {
385 printk(KERN_ERR "%s: missing param set value\n", kp->name);
386 return -EINVAL;
387 }
388 if (strlen(val)+1 > kps->maxlen) { 474 if (strlen(val)+1 > kps->maxlen) {
389 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 475 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
390 kp->name, kps->maxlen-1); 476 kp->name, kps->maxlen-1);
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
393 strcpy(kps->string, val); 479 strcpy(kps->string, val);
394 return 0; 480 return 0;
395} 481}
482EXPORT_SYMBOL(param_set_copystring);
396 483
397int param_get_string(char *buffer, struct kernel_param *kp) 484int param_get_string(char *buffer, const struct kernel_param *kp)
398{ 485{
399 const struct kparam_string *kps = kp->str; 486 const struct kparam_string *kps = kp->str;
400 return strlcpy(buffer, kps->string, kps->maxlen); 487 return strlcpy(buffer, kps->string, kps->maxlen);
401} 488}
489EXPORT_SYMBOL(param_get_string);
490
491struct kernel_param_ops param_ops_string = {
492 .set = param_set_copystring,
493 .get = param_get_string,
494};
495EXPORT_SYMBOL(param_ops_string);
402 496
403/* sysfs output in /sys/modules/XYZ/parameters/ */ 497/* sysfs output in /sys/modules/XYZ/parameters/ */
404#define to_module_attr(n) container_of(n, struct module_attribute, attr) 498#define to_module_attr(n) container_of(n, struct module_attribute, attr)
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[];
409struct param_attribute 503struct param_attribute
410{ 504{
411 struct module_attribute mattr; 505 struct module_attribute mattr;
412 struct kernel_param *param; 506 const struct kernel_param *param;
413}; 507};
414 508
415struct module_param_attrs 509struct module_param_attrs
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
428 int count; 522 int count;
429 struct param_attribute *attribute = to_param_attr(mattr); 523 struct param_attribute *attribute = to_param_attr(mattr);
430 524
431 if (!attribute->param->get) 525 if (!attribute->param->ops->get)
432 return -EPERM; 526 return -EPERM;
433 527
434 count = attribute->param->get(buf, attribute->param); 528 mutex_lock(&param_lock);
529 count = attribute->param->ops->get(buf, attribute->param);
530 mutex_unlock(&param_lock);
435 if (count > 0) { 531 if (count > 0) {
436 strcat(buf, "\n"); 532 strcat(buf, "\n");
437 ++count; 533 ++count;
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
447 int err; 543 int err;
448 struct param_attribute *attribute = to_param_attr(mattr); 544 struct param_attribute *attribute = to_param_attr(mattr);
449 545
450 if (!attribute->param->set) 546 if (!attribute->param->ops->set)
451 return -EPERM; 547 return -EPERM;
452 548
453 err = attribute->param->set(buf, attribute->param); 549 mutex_lock(&param_lock);
550 err = attribute->param->ops->set(buf, attribute->param);
551 mutex_unlock(&param_lock);
454 if (!err) 552 if (!err)
455 return len; 553 return len;
456 return err; 554 return err;
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
464#endif 562#endif
465 563
466#ifdef CONFIG_SYSFS 564#ifdef CONFIG_SYSFS
565void __kernel_param_lock(void)
566{
567 mutex_lock(&param_lock);
568}
569EXPORT_SYMBOL(__kernel_param_lock);
570
571void __kernel_param_unlock(void)
572{
573 mutex_unlock(&param_lock);
574}
575EXPORT_SYMBOL(__kernel_param_unlock);
576
467/* 577/*
468 * add_sysfs_param - add a parameter to sysfs 578 * add_sysfs_param - add a parameter to sysfs
469 * @mk: struct module_kobject 579 * @mk: struct module_kobject
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
475 * if there's an error. 585 * if there's an error.
476 */ 586 */
477static __modinit int add_sysfs_param(struct module_kobject *mk, 587static __modinit int add_sysfs_param(struct module_kobject *mk,
478 struct kernel_param *kp, 588 const struct kernel_param *kp,
479 const char *name) 589 const char *name)
480{ 590{
481 struct module_param_attrs *new; 591 struct module_param_attrs *new;
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk)
557 * /sys/module/[mod->name]/parameters/ 667 * /sys/module/[mod->name]/parameters/
558 */ 668 */
559int module_param_sysfs_setup(struct module *mod, 669int module_param_sysfs_setup(struct module *mod,
560 struct kernel_param *kparam, 670 const struct kernel_param *kparam,
561 unsigned int num_params) 671 unsigned int num_params)
562{ 672{
563 int i, err; 673 int i, err;
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod)
602 712
603void destroy_params(const struct kernel_param *params, unsigned num) 713void destroy_params(const struct kernel_param *params, unsigned num)
604{ 714{
605 /* FIXME: This should free kmalloced charp parameters. It doesn't. */ 715 unsigned int i;
716
717 for (i = 0; i < num; i++)
718 if (params[i].ops->free)
719 params[i].ops->free(params[i].arg);
606} 720}
607 721
608static void __init kernel_add_sysfs_param(const char *name, 722static void __init kernel_add_sysfs_param(const char *name,
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void)
768subsys_initcall(param_sysfs_init); 882subsys_initcall(param_sysfs_init);
769 883
770#endif /* CONFIG_SYSFS */ 884#endif /* CONFIG_SYSFS */
771
772EXPORT_SYMBOL(param_set_byte);
773EXPORT_SYMBOL(param_get_byte);
774EXPORT_SYMBOL(param_set_short);
775EXPORT_SYMBOL(param_get_short);
776EXPORT_SYMBOL(param_set_ushort);
777EXPORT_SYMBOL(param_get_ushort);
778EXPORT_SYMBOL(param_set_int);
779EXPORT_SYMBOL(param_get_int);
780EXPORT_SYMBOL(param_set_uint);
781EXPORT_SYMBOL(param_get_uint);
782EXPORT_SYMBOL(param_set_long);
783EXPORT_SYMBOL(param_get_long);
784EXPORT_SYMBOL(param_set_ulong);
785EXPORT_SYMBOL(param_get_ulong);
786EXPORT_SYMBOL(param_set_charp);
787EXPORT_SYMBOL(param_get_charp);
788EXPORT_SYMBOL(param_set_bool);
789EXPORT_SYMBOL(param_get_bool);
790EXPORT_SYMBOL(param_set_invbool);
791EXPORT_SYMBOL(param_get_invbool);
792EXPORT_SYMBOL(param_array_set);
793EXPORT_SYMBOL(param_array_get);
794EXPORT_SYMBOL(param_set_copystring);
795EXPORT_SYMBOL(param_get_string);
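
The params.c hunks above replace the per-parameter set/get function pointers with a shared struct kernel_param_ops table (see param_array_ops and param_ops_string), serialize sysfs reads and writes through param_lock, and make destroy_params() call ops->free. As a minimal sketch of the ops-table pattern, the following userspace model mirrors the callback signatures shown in the diff; the struct names, atoi-based parsing, and main() harness are illustrative assumptions, not kernel code, and the registration macros and param_lock itself live outside these hunks.

    #include <stdio.h>
    #include <stdlib.h>

    struct kparam;

    /* Shared ops table, modelled on kernel_param_ops in the hunks above. */
    struct kparam_ops {
    	int (*set)(const char *val, const struct kparam *kp);
    	int (*get)(char *buffer, const struct kparam *kp);
    	void (*free)(void *arg);
    };

    struct kparam {
    	const char *name;
    	void *arg;                       /* points at the variable backing the parameter */
    	const struct kparam_ops *ops;
    };

    static int set_int(const char *val, const struct kparam *kp)
    {
    	*(int *)kp->arg = atoi(val);     /* the kernel parses strictly; atoi keeps the sketch short */
    	return 0;
    }

    static int get_int(char *buffer, const struct kparam *kp)
    {
    	return sprintf(buffer, "%d", *(int *)kp->arg);
    }

    static const struct kparam_ops param_ops_int_sketch = {
    	.set = set_int,
    	.get = get_int,
    };

    int main(void)
    {
    	int threshold = 0;
    	struct kparam kp = { "threshold", &threshold, &param_ops_int_sketch };
    	char buf[32];

    	kp.ops->set("42", &kp);          /* what param_attr_store() does, under param_lock */
    	kp.ops->get(buf, &kp);           /* what param_attr_show() does, under param_lock */
    	printf("%s = %s\n", kp.name, buf);
    	return 0;
    }

One consequence visible in the diff is that the long EXPORT_SYMBOL list at the end of the file disappears: exporting the ops tables (and the few remaining helpers) next to their definitions replaces the per-function exports.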
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..403d1804b198 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 214
215static inline u64 perf_clock(void) 215static inline u64 perf_clock(void)
216{ 216{
217 return cpu_clock(raw_smp_processor_id()); 217 return local_clock();
218} 218}
219 219
220/* 220/*
@@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
675 struct perf_event *event, *partial_group = NULL; 675 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 676 const struct pmu *pmu = group_event->pmu;
677 bool txn = false; 677 bool txn = false;
678 int ret;
679 678
680 if (group_event->state == PERF_EVENT_STATE_OFF) 679 if (group_event->state == PERF_EVENT_STATE_OFF)
681 return 0; 680 return 0;
@@ -703,14 +702,8 @@ group_sched_in(struct perf_event *group_event,
703 } 702 }
704 } 703 }
705 704
706 if (!txn) 705 if (!txn || !pmu->commit_txn(pmu))
707 return 0;
708
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0; 706 return 0;
713 }
714 707
715group_error: 708group_error:
716 /* 709 /*
@@ -1155,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
1155 * In order to keep per-task stats reliable we need to flip the event 1148 * In order to keep per-task stats reliable we need to flip the event
1156 * values when we flip the contexts. 1149 * values when we flip the contexts.
1157 */ 1150 */
1158 value = atomic64_read(&next_event->count); 1151 value = local64_read(&next_event->count);
1159 value = atomic64_xchg(&event->count, value); 1152 value = local64_xchg(&event->count, value);
1160 atomic64_set(&next_event->count, value); 1153 local64_set(&next_event->count, value);
1161 1154
1162 swap(event->total_time_enabled, next_event->total_time_enabled); 1155 swap(event->total_time_enabled, next_event->total_time_enabled);
1163 swap(event->total_time_running, next_event->total_time_running); 1156 swap(event->total_time_running, next_event->total_time_running);
@@ -1547,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1547 1540
1548 hwc->sample_period = sample_period; 1541 hwc->sample_period = sample_period;
1549 1542
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1543 if (local64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable(); 1544 perf_disable();
1552 perf_event_stop(event); 1545 perf_event_stop(event);
1553 atomic64_set(&hwc->period_left, 0); 1546 local64_set(&hwc->period_left, 0);
1554 perf_event_start(event); 1547 perf_event_start(event);
1555 perf_enable(); 1548 perf_enable();
1556 } 1549 }
@@ -1591,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1591 1584
1592 perf_disable(); 1585 perf_disable();
1593 event->pmu->read(event); 1586 event->pmu->read(event);
1594 now = atomic64_read(&event->count); 1587 now = local64_read(&event->count);
1595 delta = now - hwc->freq_count_stamp; 1588 delta = now - hwc->freq_count_stamp;
1596 hwc->freq_count_stamp = now; 1589 hwc->freq_count_stamp = now;
1597 1590
@@ -1743,6 +1736,11 @@ static void __perf_event_read(void *info)
1743 event->pmu->read(event); 1736 event->pmu->read(event);
1744} 1737}
1745 1738
1739static inline u64 perf_event_count(struct perf_event *event)
1740{
1741 return local64_read(&event->count) + atomic64_read(&event->child_count);
1742}
1743
1746static u64 perf_event_read(struct perf_event *event) 1744static u64 perf_event_read(struct perf_event *event)
1747{ 1745{
1748 /* 1746 /*
@@ -1762,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event)
1762 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1760 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1763 } 1761 }
1764 1762
1765 return atomic64_read(&event->count); 1763 return perf_event_count(event);
1766} 1764}
1767 1765
1768/* 1766/*
@@ -1883,7 +1881,7 @@ static void free_event_rcu(struct rcu_head *head)
1883} 1881}
1884 1882
1885static void perf_pending_sync(struct perf_event *event); 1883static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data); 1884static void perf_buffer_put(struct perf_buffer *buffer);
1887 1885
1888static void free_event(struct perf_event *event) 1886static void free_event(struct perf_event *event)
1889{ 1887{
@@ -1891,7 +1889,7 @@ static void free_event(struct perf_event *event)
1891 1889
1892 if (!event->parent) { 1890 if (!event->parent) {
1893 atomic_dec(&nr_events); 1891 atomic_dec(&nr_events);
1894 if (event->attr.mmap) 1892 if (event->attr.mmap || event->attr.mmap_data)
1895 atomic_dec(&nr_mmap_events); 1893 atomic_dec(&nr_mmap_events);
1896 if (event->attr.comm) 1894 if (event->attr.comm)
1897 atomic_dec(&nr_comm_events); 1895 atomic_dec(&nr_comm_events);
@@ -1899,9 +1897,9 @@ static void free_event(struct perf_event *event)
1899 atomic_dec(&nr_task_events); 1897 atomic_dec(&nr_task_events);
1900 } 1898 }
1901 1899
1902 if (event->data) { 1900 if (event->buffer) {
1903 perf_mmap_data_put(event->data); 1901 perf_buffer_put(event->buffer);
1904 event->data = NULL; 1902 event->buffer = NULL;
1905 } 1903 }
1906 1904
1907 if (event->destroy) 1905 if (event->destroy)
@@ -2126,13 +2124,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2126static unsigned int perf_poll(struct file *file, poll_table *wait) 2124static unsigned int perf_poll(struct file *file, poll_table *wait)
2127{ 2125{
2128 struct perf_event *event = file->private_data; 2126 struct perf_event *event = file->private_data;
2129 struct perf_mmap_data *data; 2127 struct perf_buffer *buffer;
2130 unsigned int events = POLL_HUP; 2128 unsigned int events = POLL_HUP;
2131 2129
2132 rcu_read_lock(); 2130 rcu_read_lock();
2133 data = rcu_dereference(event->data); 2131 buffer = rcu_dereference(event->buffer);
2134 if (data) 2132 if (buffer)
2135 events = atomic_xchg(&data->poll, 0); 2133 events = atomic_xchg(&buffer->poll, 0);
2136 rcu_read_unlock(); 2134 rcu_read_unlock();
2137 2135
2138 poll_wait(file, &event->waitq, wait); 2136 poll_wait(file, &event->waitq, wait);
@@ -2143,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
2143static void perf_event_reset(struct perf_event *event) 2141static void perf_event_reset(struct perf_event *event)
2144{ 2142{
2145 (void)perf_event_read(event); 2143 (void)perf_event_read(event);
2146 atomic64_set(&event->count, 0); 2144 local64_set(&event->count, 0);
2147 perf_event_update_userpage(event); 2145 perf_event_update_userpage(event);
2148} 2146}
2149 2147
@@ -2342,14 +2340,14 @@ static int perf_event_index(struct perf_event *event)
2342void perf_event_update_userpage(struct perf_event *event) 2340void perf_event_update_userpage(struct perf_event *event)
2343{ 2341{
2344 struct perf_event_mmap_page *userpg; 2342 struct perf_event_mmap_page *userpg;
2345 struct perf_mmap_data *data; 2343 struct perf_buffer *buffer;
2346 2344
2347 rcu_read_lock(); 2345 rcu_read_lock();
2348 data = rcu_dereference(event->data); 2346 buffer = rcu_dereference(event->buffer);
2349 if (!data) 2347 if (!buffer)
2350 goto unlock; 2348 goto unlock;
2351 2349
2352 userpg = data->user_page; 2350 userpg = buffer->user_page;
2353 2351
2354 /* 2352 /*
2355 * Disable preemption so as to not let the corresponding user-space 2353 * Disable preemption so as to not let the corresponding user-space
@@ -2359,9 +2357,9 @@ void perf_event_update_userpage(struct perf_event *event)
2359 ++userpg->lock; 2357 ++userpg->lock;
2360 barrier(); 2358 barrier();
2361 userpg->index = perf_event_index(event); 2359 userpg->index = perf_event_index(event);
2362 userpg->offset = atomic64_read(&event->count); 2360 userpg->offset = perf_event_count(event);
2363 if (event->state == PERF_EVENT_STATE_ACTIVE) 2361 if (event->state == PERF_EVENT_STATE_ACTIVE)
2364 userpg->offset -= atomic64_read(&event->hw.prev_count); 2362 userpg->offset -= local64_read(&event->hw.prev_count);
2365 2363
2366 userpg->time_enabled = event->total_time_enabled + 2364 userpg->time_enabled = event->total_time_enabled +
2367 atomic64_read(&event->child_total_time_enabled); 2365 atomic64_read(&event->child_total_time_enabled);
@@ -2376,6 +2374,25 @@ unlock:
2376 rcu_read_unlock(); 2374 rcu_read_unlock();
2377} 2375}
2378 2376
2377static unsigned long perf_data_size(struct perf_buffer *buffer);
2378
2379static void
2380perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2381{
2382 long max_size = perf_data_size(buffer);
2383
2384 if (watermark)
2385 buffer->watermark = min(max_size, watermark);
2386
2387 if (!buffer->watermark)
2388 buffer->watermark = max_size / 2;
2389
2390 if (flags & PERF_BUFFER_WRITABLE)
2391 buffer->writable = 1;
2392
2393 atomic_set(&buffer->refcount, 1);
2394}
2395
2379#ifndef CONFIG_PERF_USE_VMALLOC 2396#ifndef CONFIG_PERF_USE_VMALLOC
2380 2397
2381/* 2398/*
@@ -2383,15 +2400,15 @@ unlock:
2383 */ 2400 */
2384 2401
2385static struct page * 2402static struct page *
2386perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2403perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2387{ 2404{
2388 if (pgoff > data->nr_pages) 2405 if (pgoff > buffer->nr_pages)
2389 return NULL; 2406 return NULL;
2390 2407
2391 if (pgoff == 0) 2408 if (pgoff == 0)
2392 return virt_to_page(data->user_page); 2409 return virt_to_page(buffer->user_page);
2393 2410
2394 return virt_to_page(data->data_pages[pgoff - 1]); 2411 return virt_to_page(buffer->data_pages[pgoff - 1]);
2395} 2412}
2396 2413
2397static void *perf_mmap_alloc_page(int cpu) 2414static void *perf_mmap_alloc_page(int cpu)
@@ -2407,42 +2424,44 @@ static void *perf_mmap_alloc_page(int cpu)
2407 return page_address(page); 2424 return page_address(page);
2408} 2425}
2409 2426
2410static struct perf_mmap_data * 2427static struct perf_buffer *
2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2428perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2412{ 2429{
2413 struct perf_mmap_data *data; 2430 struct perf_buffer *buffer;
2414 unsigned long size; 2431 unsigned long size;
2415 int i; 2432 int i;
2416 2433
2417 size = sizeof(struct perf_mmap_data); 2434 size = sizeof(struct perf_buffer);
2418 size += nr_pages * sizeof(void *); 2435 size += nr_pages * sizeof(void *);
2419 2436
2420 data = kzalloc(size, GFP_KERNEL); 2437 buffer = kzalloc(size, GFP_KERNEL);
2421 if (!data) 2438 if (!buffer)
2422 goto fail; 2439 goto fail;
2423 2440
2424 data->user_page = perf_mmap_alloc_page(event->cpu); 2441 buffer->user_page = perf_mmap_alloc_page(cpu);
2425 if (!data->user_page) 2442 if (!buffer->user_page)
2426 goto fail_user_page; 2443 goto fail_user_page;
2427 2444
2428 for (i = 0; i < nr_pages; i++) { 2445 for (i = 0; i < nr_pages; i++) {
2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu); 2446 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2430 if (!data->data_pages[i]) 2447 if (!buffer->data_pages[i])
2431 goto fail_data_pages; 2448 goto fail_data_pages;
2432 } 2449 }
2433 2450
2434 data->nr_pages = nr_pages; 2451 buffer->nr_pages = nr_pages;
2452
2453 perf_buffer_init(buffer, watermark, flags);
2435 2454
2436 return data; 2455 return buffer;
2437 2456
2438fail_data_pages: 2457fail_data_pages:
2439 for (i--; i >= 0; i--) 2458 for (i--; i >= 0; i--)
2440 free_page((unsigned long)data->data_pages[i]); 2459 free_page((unsigned long)buffer->data_pages[i]);
2441 2460
2442 free_page((unsigned long)data->user_page); 2461 free_page((unsigned long)buffer->user_page);
2443 2462
2444fail_user_page: 2463fail_user_page:
2445 kfree(data); 2464 kfree(buffer);
2446 2465
2447fail: 2466fail:
2448 return NULL; 2467 return NULL;
@@ -2456,17 +2475,17 @@ static void perf_mmap_free_page(unsigned long addr)
2456 __free_page(page); 2475 __free_page(page);
2457} 2476}
2458 2477
2459static void perf_mmap_data_free(struct perf_mmap_data *data) 2478static void perf_buffer_free(struct perf_buffer *buffer)
2460{ 2479{
2461 int i; 2480 int i;
2462 2481
2463 perf_mmap_free_page((unsigned long)data->user_page); 2482 perf_mmap_free_page((unsigned long)buffer->user_page);
2464 for (i = 0; i < data->nr_pages; i++) 2483 for (i = 0; i < buffer->nr_pages; i++)
2465 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2484 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2466 kfree(data); 2485 kfree(buffer);
2467} 2486}
2468 2487
2469static inline int page_order(struct perf_mmap_data *data) 2488static inline int page_order(struct perf_buffer *buffer)
2470{ 2489{
2471 return 0; 2490 return 0;
2472} 2491}
@@ -2479,18 +2498,18 @@ static inline int page_order(struct perf_mmap_data *data)
2479 * Required for architectures that have d-cache aliasing issues. 2498 * Required for architectures that have d-cache aliasing issues.
2480 */ 2499 */
2481 2500
2482static inline int page_order(struct perf_mmap_data *data) 2501static inline int page_order(struct perf_buffer *buffer)
2483{ 2502{
2484 return data->page_order; 2503 return buffer->page_order;
2485} 2504}
2486 2505
2487static struct page * 2506static struct page *
2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2507perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2489{ 2508{
2490 if (pgoff > (1UL << page_order(data))) 2509 if (pgoff > (1UL << page_order(buffer)))
2491 return NULL; 2510 return NULL;
2492 2511
2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2512 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2494} 2513}
2495 2514
2496static void perf_mmap_unmark_page(void *addr) 2515static void perf_mmap_unmark_page(void *addr)
@@ -2500,57 +2519,59 @@ static void perf_mmap_unmark_page(void *addr)
2500 page->mapping = NULL; 2519 page->mapping = NULL;
2501} 2520}
2502 2521
2503static void perf_mmap_data_free_work(struct work_struct *work) 2522static void perf_buffer_free_work(struct work_struct *work)
2504{ 2523{
2505 struct perf_mmap_data *data; 2524 struct perf_buffer *buffer;
2506 void *base; 2525 void *base;
2507 int i, nr; 2526 int i, nr;
2508 2527
2509 data = container_of(work, struct perf_mmap_data, work); 2528 buffer = container_of(work, struct perf_buffer, work);
2510 nr = 1 << page_order(data); 2529 nr = 1 << page_order(buffer);
2511 2530
2512 base = data->user_page; 2531 base = buffer->user_page;
2513 for (i = 0; i < nr + 1; i++) 2532 for (i = 0; i < nr + 1; i++)
2514 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2533 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2515 2534
2516 vfree(base); 2535 vfree(base);
2517 kfree(data); 2536 kfree(buffer);
2518} 2537}
2519 2538
2520static void perf_mmap_data_free(struct perf_mmap_data *data) 2539static void perf_buffer_free(struct perf_buffer *buffer)
2521{ 2540{
2522 schedule_work(&data->work); 2541 schedule_work(&buffer->work);
2523} 2542}
2524 2543
2525static struct perf_mmap_data * 2544static struct perf_buffer *
2526perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2545perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2527{ 2546{
2528 struct perf_mmap_data *data; 2547 struct perf_buffer *buffer;
2529 unsigned long size; 2548 unsigned long size;
2530 void *all_buf; 2549 void *all_buf;
2531 2550
2532 size = sizeof(struct perf_mmap_data); 2551 size = sizeof(struct perf_buffer);
2533 size += sizeof(void *); 2552 size += sizeof(void *);
2534 2553
2535 data = kzalloc(size, GFP_KERNEL); 2554 buffer = kzalloc(size, GFP_KERNEL);
2536 if (!data) 2555 if (!buffer)
2537 goto fail; 2556 goto fail;
2538 2557
2539 INIT_WORK(&data->work, perf_mmap_data_free_work); 2558 INIT_WORK(&buffer->work, perf_buffer_free_work);
2540 2559
2541 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2560 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2542 if (!all_buf) 2561 if (!all_buf)
2543 goto fail_all_buf; 2562 goto fail_all_buf;
2544 2563
2545 data->user_page = all_buf; 2564 buffer->user_page = all_buf;
2546 data->data_pages[0] = all_buf + PAGE_SIZE; 2565 buffer->data_pages[0] = all_buf + PAGE_SIZE;
2547 data->page_order = ilog2(nr_pages); 2566 buffer->page_order = ilog2(nr_pages);
2548 data->nr_pages = 1; 2567 buffer->nr_pages = 1;
2568
2569 perf_buffer_init(buffer, watermark, flags);
2549 2570
2550 return data; 2571 return buffer;
2551 2572
2552fail_all_buf: 2573fail_all_buf:
2553 kfree(data); 2574 kfree(buffer);
2554 2575
2555fail: 2576fail:
2556 return NULL; 2577 return NULL;
@@ -2558,15 +2579,15 @@ fail:
2558 2579
2559#endif 2580#endif
2560 2581
2561static unsigned long perf_data_size(struct perf_mmap_data *data) 2582static unsigned long perf_data_size(struct perf_buffer *buffer)
2562{ 2583{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data)); 2584 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2564} 2585}
2565 2586
2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2587static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2567{ 2588{
2568 struct perf_event *event = vma->vm_file->private_data; 2589 struct perf_event *event = vma->vm_file->private_data;
2569 struct perf_mmap_data *data; 2590 struct perf_buffer *buffer;
2570 int ret = VM_FAULT_SIGBUS; 2591 int ret = VM_FAULT_SIGBUS;
2571 2592
2572 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2593 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2576,14 +2597,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2576 } 2597 }
2577 2598
2578 rcu_read_lock(); 2599 rcu_read_lock();
2579 data = rcu_dereference(event->data); 2600 buffer = rcu_dereference(event->buffer);
2580 if (!data) 2601 if (!buffer)
2581 goto unlock; 2602 goto unlock;
2582 2603
2583 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2604 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2584 goto unlock; 2605 goto unlock;
2585 2606
2586 vmf->page = perf_mmap_to_page(data, vmf->pgoff); 2607 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2587 if (!vmf->page) 2608 if (!vmf->page)
2588 goto unlock; 2609 goto unlock;
2589 2610
@@ -2598,52 +2619,35 @@ unlock:
2598 return ret; 2619 return ret;
2599} 2620}
2600 2621
2601static void 2622static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2602perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2603{
2604 long max_size = perf_data_size(data);
2605
2606 if (event->attr.watermark) {
2607 data->watermark = min_t(long, max_size,
2608 event->attr.wakeup_watermark);
2609 }
2610
2611 if (!data->watermark)
2612 data->watermark = max_size / 2;
2613
2614 atomic_set(&data->refcount, 1);
2615 rcu_assign_pointer(event->data, data);
2616}
2617
2618static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2619{ 2623{
2620 struct perf_mmap_data *data; 2624 struct perf_buffer *buffer;
2621 2625
2622 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2626 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
2623 perf_mmap_data_free(data); 2627 perf_buffer_free(buffer);
2624} 2628}
2625 2629
2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) 2630static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2627{ 2631{
2628 struct perf_mmap_data *data; 2632 struct perf_buffer *buffer;
2629 2633
2630 rcu_read_lock(); 2634 rcu_read_lock();
2631 data = rcu_dereference(event->data); 2635 buffer = rcu_dereference(event->buffer);
2632 if (data) { 2636 if (buffer) {
2633 if (!atomic_inc_not_zero(&data->refcount)) 2637 if (!atomic_inc_not_zero(&buffer->refcount))
2634 data = NULL; 2638 buffer = NULL;
2635 } 2639 }
2636 rcu_read_unlock(); 2640 rcu_read_unlock();
2637 2641
2638 return data; 2642 return buffer;
2639} 2643}
2640 2644
2641static void perf_mmap_data_put(struct perf_mmap_data *data) 2645static void perf_buffer_put(struct perf_buffer *buffer)
2642{ 2646{
2643 if (!atomic_dec_and_test(&data->refcount)) 2647 if (!atomic_dec_and_test(&buffer->refcount))
2644 return; 2648 return;
2645 2649
2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2650 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2647} 2651}
2648 2652
2649static void perf_mmap_open(struct vm_area_struct *vma) 2653static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2658,16 +2662,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2658 struct perf_event *event = vma->vm_file->private_data; 2662 struct perf_event *event = vma->vm_file->private_data;
2659 2663
2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2664 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2661 unsigned long size = perf_data_size(event->data); 2665 unsigned long size = perf_data_size(event->buffer);
2662 struct user_struct *user = event->mmap_user; 2666 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data; 2667 struct perf_buffer *buffer = event->buffer;
2664 2668
2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2669 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2666 vma->vm_mm->locked_vm -= event->mmap_locked; 2670 vma->vm_mm->locked_vm -= event->mmap_locked;
2667 rcu_assign_pointer(event->data, NULL); 2671 rcu_assign_pointer(event->buffer, NULL);
2668 mutex_unlock(&event->mmap_mutex); 2672 mutex_unlock(&event->mmap_mutex);
2669 2673
2670 perf_mmap_data_put(data); 2674 perf_buffer_put(buffer);
2671 free_uid(user); 2675 free_uid(user);
2672 } 2676 }
2673} 2677}
@@ -2685,11 +2689,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2685 unsigned long user_locked, user_lock_limit; 2689 unsigned long user_locked, user_lock_limit;
2686 struct user_struct *user = current_user(); 2690 struct user_struct *user = current_user();
2687 unsigned long locked, lock_limit; 2691 unsigned long locked, lock_limit;
2688 struct perf_mmap_data *data; 2692 struct perf_buffer *buffer;
2689 unsigned long vma_size; 2693 unsigned long vma_size;
2690 unsigned long nr_pages; 2694 unsigned long nr_pages;
2691 long user_extra, extra; 2695 long user_extra, extra;
2692 int ret = 0; 2696 int ret = 0, flags = 0;
2693 2697
2694 /* 2698 /*
2695 * Don't allow mmap() of inherited per-task counters. This would 2699 * Don't allow mmap() of inherited per-task counters. This would
@@ -2706,7 +2710,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2706 nr_pages = (vma_size / PAGE_SIZE) - 1; 2710 nr_pages = (vma_size / PAGE_SIZE) - 1;
2707 2711
2708 /* 2712 /*
2709 * If we have data pages ensure they're a power-of-two number, so we 2713 * If we have buffer pages ensure they're a power-of-two number, so we
2710 * can do bitmasks instead of modulo. 2714 * can do bitmasks instead of modulo.
2711 */ 2715 */
2712 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 2716 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2720,9 +2724,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2720 2724
2721 WARN_ON_ONCE(event->ctx->parent_ctx); 2725 WARN_ON_ONCE(event->ctx->parent_ctx);
2722 mutex_lock(&event->mmap_mutex); 2726 mutex_lock(&event->mmap_mutex);
2723 if (event->data) { 2727 if (event->buffer) {
2724 if (event->data->nr_pages == nr_pages) 2728 if (event->buffer->nr_pages == nr_pages)
2725 atomic_inc(&event->data->refcount); 2729 atomic_inc(&event->buffer->refcount);
2726 else 2730 else
2727 ret = -EINVAL; 2731 ret = -EINVAL;
2728 goto unlock; 2732 goto unlock;
@@ -2752,17 +2756,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2752 goto unlock; 2756 goto unlock;
2753 } 2757 }
2754 2758
2755 WARN_ON(event->data); 2759 WARN_ON(event->buffer);
2760
2761 if (vma->vm_flags & VM_WRITE)
2762 flags |= PERF_BUFFER_WRITABLE;
2756 2763
2757 data = perf_mmap_data_alloc(event, nr_pages); 2764 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
2758 if (!data) { 2765 event->cpu, flags);
2766 if (!buffer) {
2759 ret = -ENOMEM; 2767 ret = -ENOMEM;
2760 goto unlock; 2768 goto unlock;
2761 } 2769 }
2762 2770 rcu_assign_pointer(event->buffer, buffer);
2763 perf_mmap_data_init(event, data);
2764 if (vma->vm_flags & VM_WRITE)
2765 event->data->writable = 1;
2766 2771
2767 atomic_long_add(user_extra, &user->locked_vm); 2772 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra; 2773 event->mmap_locked = extra;
@@ -2941,11 +2946,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2941 return NULL; 2946 return NULL;
2942} 2947}
2943 2948
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949 2949
2950/* 2950/*
2951 * We assume there is only KVM supporting the callbacks. 2951 * We assume there is only KVM supporting the callbacks.
@@ -2971,15 +2971,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2971/* 2971/*
2972 * Output 2972 * Output
2973 */ 2973 */
2974static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2974static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
2975 unsigned long offset, unsigned long head) 2975 unsigned long offset, unsigned long head)
2976{ 2976{
2977 unsigned long mask; 2977 unsigned long mask;
2978 2978
2979 if (!data->writable) 2979 if (!buffer->writable)
2980 return true; 2980 return true;
2981 2981
2982 mask = perf_data_size(data) - 1; 2982 mask = perf_data_size(buffer) - 1;
2983 2983
2984 offset = (offset - tail) & mask; 2984 offset = (offset - tail) & mask;
2985 head = (head - tail) & mask; 2985 head = (head - tail) & mask;
@@ -2992,7 +2992,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2992 2992
2993static void perf_output_wakeup(struct perf_output_handle *handle) 2993static void perf_output_wakeup(struct perf_output_handle *handle)
2994{ 2994{
2995 atomic_set(&handle->data->poll, POLL_IN); 2995 atomic_set(&handle->buffer->poll, POLL_IN);
2996 2996
2997 if (handle->nmi) { 2997 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 2998 handle->event->pending_wakeup = 1;
@@ -3012,45 +3012,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 */ 3012 */
3013static void perf_output_get_handle(struct perf_output_handle *handle) 3013static void perf_output_get_handle(struct perf_output_handle *handle)
3014{ 3014{
3015 struct perf_mmap_data *data = handle->data; 3015 struct perf_buffer *buffer = handle->buffer;
3016 3016
3017 preempt_disable(); 3017 preempt_disable();
3018 local_inc(&data->nest); 3018 local_inc(&buffer->nest);
3019 handle->wakeup = local_read(&data->wakeup); 3019 handle->wakeup = local_read(&buffer->wakeup);
3020} 3020}
3021 3021
3022static void perf_output_put_handle(struct perf_output_handle *handle) 3022static void perf_output_put_handle(struct perf_output_handle *handle)
3023{ 3023{
3024 struct perf_mmap_data *data = handle->data; 3024 struct perf_buffer *buffer = handle->buffer;
3025 unsigned long head; 3025 unsigned long head;
3026 3026
3027again: 3027again:
3028 head = local_read(&data->head); 3028 head = local_read(&buffer->head);
3029 3029
3030 /* 3030 /*
3031 * IRQ/NMI can happen here, which means we can miss a head update. 3031 * IRQ/NMI can happen here, which means we can miss a head update.
3032 */ 3032 */
3033 3033
3034 if (!local_dec_and_test(&data->nest)) 3034 if (!local_dec_and_test(&buffer->nest))
3035 goto out; 3035 goto out;
3036 3036
3037 /* 3037 /*
3038 * Publish the known good head. Rely on the full barrier implied 3038 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() order the data->head read and this 3039 * by atomic_dec_and_test() order the buffer->head read and this
3040 * write. 3040 * write.
3041 */ 3041 */
3042 data->user_page->data_head = head; 3042 buffer->user_page->data_head = head;
3043 3043
3044 /* 3044 /*
3045 * Now check if we missed an update, rely on the (compiler) 3045 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head. 3046 * barrier in atomic_dec_and_test() to re-read buffer->head.
3047 */ 3047 */
3048 if (unlikely(head != local_read(&data->head))) { 3048 if (unlikely(head != local_read(&buffer->head))) {
3049 local_inc(&data->nest); 3049 local_inc(&buffer->nest);
3050 goto again; 3050 goto again;
3051 } 3051 }
3052 3052
3053 if (handle->wakeup != local_read(&data->wakeup)) 3053 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3054 perf_output_wakeup(handle);
3055 3055
3056 out: 3056 out:
@@ -3070,12 +3070,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3070 buf += size; 3070 buf += size;
3071 handle->size -= size; 3071 handle->size -= size;
3072 if (!handle->size) { 3072 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data; 3073 struct perf_buffer *buffer = handle->buffer;
3074 3074
3075 handle->page++; 3075 handle->page++;
3076 handle->page &= data->nr_pages - 1; 3076 handle->page &= buffer->nr_pages - 1;
3077 handle->addr = data->data_pages[handle->page]; 3077 handle->addr = buffer->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data); 3078 handle->size = PAGE_SIZE << page_order(buffer);
3079 } 3079 }
3080 } while (len); 3080 } while (len);
3081} 3081}
@@ -3084,7 +3084,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3084 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
3085 int nmi, int sample) 3085 int nmi, int sample)
3086{ 3086{
3087 struct perf_mmap_data *data; 3087 struct perf_buffer *buffer;
3088 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
3089 int have_lost; 3089 int have_lost;
3090 struct { 3090 struct {
@@ -3100,19 +3100,19 @@ int perf_output_begin(struct perf_output_handle *handle,
3100 if (event->parent) 3100 if (event->parent)
3101 event = event->parent; 3101 event = event->parent;
3102 3102
3103 data = rcu_dereference(event->data); 3103 buffer = rcu_dereference(event->buffer);
3104 if (!data) 3104 if (!buffer)
3105 goto out; 3105 goto out;
3106 3106
3107 handle->data = data; 3107 handle->buffer = buffer;
3108 handle->event = event; 3108 handle->event = event;
3109 handle->nmi = nmi; 3109 handle->nmi = nmi;
3110 handle->sample = sample; 3110 handle->sample = sample;
3111 3111
3112 if (!data->nr_pages) 3112 if (!buffer->nr_pages)
3113 goto out; 3113 goto out;
3114 3114
3115 have_lost = local_read(&data->lost); 3115 have_lost = local_read(&buffer->lost);
3116 if (have_lost) 3116 if (have_lost)
3117 size += sizeof(lost_event); 3117 size += sizeof(lost_event);
3118 3118
@@ -3124,30 +3124,30 @@ int perf_output_begin(struct perf_output_handle *handle,
3124 * tail pointer. So that all reads will be completed before the 3124 * tail pointer. So that all reads will be completed before the
3125 * write is issued. 3125 * write is issued.
3126 */ 3126 */
3127 tail = ACCESS_ONCE(data->user_page->data_tail); 3127 tail = ACCESS_ONCE(buffer->user_page->data_tail);
3128 smp_rmb(); 3128 smp_rmb();
3129 offset = head = local_read(&data->head); 3129 offset = head = local_read(&buffer->head);
3130 head += size; 3130 head += size;
3131 if (unlikely(!perf_output_space(data, tail, offset, head))) 3131 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3132 goto fail; 3132 goto fail;
3133 } while (local_cmpxchg(&data->head, offset, head) != offset); 3133 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3134 3134
3135 if (head - local_read(&data->wakeup) > data->watermark) 3135 if (head - local_read(&buffer->wakeup) > buffer->watermark)
3136 local_add(data->watermark, &data->wakeup); 3136 local_add(buffer->watermark, &buffer->wakeup);
3137 3137
3138 handle->page = offset >> (PAGE_SHIFT + page_order(data)); 3138 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3139 handle->page &= data->nr_pages - 1; 3139 handle->page &= buffer->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); 3140 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3141 handle->addr = data->data_pages[handle->page]; 3141 handle->addr = buffer->data_pages[handle->page];
3142 handle->addr += handle->size; 3142 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size; 3143 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3144 3144
3145 if (have_lost) { 3145 if (have_lost) {
3146 lost_event.header.type = PERF_RECORD_LOST; 3146 lost_event.header.type = PERF_RECORD_LOST;
3147 lost_event.header.misc = 0; 3147 lost_event.header.misc = 0;
3148 lost_event.header.size = sizeof(lost_event); 3148 lost_event.header.size = sizeof(lost_event);
3149 lost_event.id = event->id; 3149 lost_event.id = event->id;
3150 lost_event.lost = local_xchg(&data->lost, 0); 3150 lost_event.lost = local_xchg(&buffer->lost, 0);
3151 3151
3152 perf_output_put(handle, lost_event); 3152 perf_output_put(handle, lost_event);
3153 } 3153 }
@@ -3155,7 +3155,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3155 return 0; 3155 return 0;
3156 3156
3157fail: 3157fail:
3158 local_inc(&data->lost); 3158 local_inc(&buffer->lost);
3159 perf_output_put_handle(handle); 3159 perf_output_put_handle(handle);
3160out: 3160out:
3161 rcu_read_unlock(); 3161 rcu_read_unlock();
@@ -3166,15 +3166,15 @@ out:
3166void perf_output_end(struct perf_output_handle *handle) 3166void perf_output_end(struct perf_output_handle *handle)
3167{ 3167{
3168 struct perf_event *event = handle->event; 3168 struct perf_event *event = handle->event;
3169 struct perf_mmap_data *data = handle->data; 3169 struct perf_buffer *buffer = handle->buffer;
3170 3170
3171 int wakeup_events = event->attr.wakeup_events; 3171 int wakeup_events = event->attr.wakeup_events;
3172 3172
3173 if (handle->sample && wakeup_events) { 3173 if (handle->sample && wakeup_events) {
3174 int events = local_inc_return(&data->events); 3174 int events = local_inc_return(&buffer->events);
3175 if (events >= wakeup_events) { 3175 if (events >= wakeup_events) {
3176 local_sub(wakeup_events, &data->events); 3176 local_sub(wakeup_events, &buffer->events);
3177 local_inc(&data->wakeup); 3177 local_inc(&buffer->wakeup);
3178 } 3178 }
3179 } 3179 }
3180 3180
@@ -3211,7 +3211,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3211 u64 values[4]; 3211 u64 values[4];
3212 int n = 0; 3212 int n = 0;
3213 3213
3214 values[n++] = atomic64_read(&event->count); 3214 values[n++] = perf_event_count(event);
3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3216 values[n++] = event->total_time_enabled + 3216 values[n++] = event->total_time_enabled +
3217 atomic64_read(&event->child_total_time_enabled); 3217 atomic64_read(&event->child_total_time_enabled);
@@ -3248,7 +3248,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3248 if (leader != event) 3248 if (leader != event)
3249 leader->pmu->read(leader); 3249 leader->pmu->read(leader);
3250 3250
3251 values[n++] = atomic64_read(&leader->count); 3251 values[n++] = perf_event_count(leader);
3252 if (read_format & PERF_FORMAT_ID) 3252 if (read_format & PERF_FORMAT_ID)
3253 values[n++] = primary_event_id(leader); 3253 values[n++] = primary_event_id(leader);
3254 3254
@@ -3260,7 +3260,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3260 if (sub != event) 3260 if (sub != event)
3261 sub->pmu->read(sub); 3261 sub->pmu->read(sub);
3262 3262
3263 values[n++] = atomic64_read(&sub->count); 3263 values[n++] = perf_event_count(sub);
3264 if (read_format & PERF_FORMAT_ID) 3264 if (read_format & PERF_FORMAT_ID)
3265 values[n++] = primary_event_id(sub); 3265 values[n++] = primary_event_id(sub);
3266 3266
@@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event,
3491/* 3491/*
3492 * task tracking -- fork/exit 3492 * task tracking -- fork/exit
3493 * 3493 *
3494 * enabled by: attr.comm | attr.mmap | attr.task 3494 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3495 */ 3495 */
3496 3496
3497struct perf_task_event { 3497struct perf_task_event {
@@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event)
3541 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3541 if (event->cpu != -1 && event->cpu != smp_processor_id())
3542 return 0; 3542 return 0;
3543 3543
3544 if (event->attr.comm || event->attr.mmap || event->attr.task) 3544 if (event->attr.comm || event->attr.mmap ||
3545 event->attr.mmap_data || event->attr.task)
3545 return 1; 3546 return 1;
3546 3547
3547 return 0; 3548 return 0;
@@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event,
3766} 3767}
3767 3768
3768static int perf_event_mmap_match(struct perf_event *event, 3769static int perf_event_mmap_match(struct perf_event *event,
3769 struct perf_mmap_event *mmap_event) 3770 struct perf_mmap_event *mmap_event,
3771 int executable)
3770{ 3772{
3771 if (event->state < PERF_EVENT_STATE_INACTIVE) 3773 if (event->state < PERF_EVENT_STATE_INACTIVE)
3772 return 0; 3774 return 0;
@@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event,
3774 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3776 if (event->cpu != -1 && event->cpu != smp_processor_id())
3775 return 0; 3777 return 0;
3776 3778
3777 if (event->attr.mmap) 3779 if ((!executable && event->attr.mmap_data) ||
3780 (executable && event->attr.mmap))
3778 return 1; 3781 return 1;
3779 3782
3780 return 0; 3783 return 0;
3781} 3784}
3782 3785
3783static void perf_event_mmap_ctx(struct perf_event_context *ctx, 3786static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3784 struct perf_mmap_event *mmap_event) 3787 struct perf_mmap_event *mmap_event,
3788 int executable)
3785{ 3789{
3786 struct perf_event *event; 3790 struct perf_event *event;
3787 3791
3788 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3792 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3789 if (perf_event_mmap_match(event, mmap_event)) 3793 if (perf_event_mmap_match(event, mmap_event, executable))
3790 perf_event_mmap_output(event, mmap_event); 3794 perf_event_mmap_output(event, mmap_event);
3791 } 3795 }
3792} 3796}
@@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3830 if (!vma->vm_mm) { 3834 if (!vma->vm_mm) {
3831 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3835 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3832 goto got_name; 3836 goto got_name;
3837 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
3838 vma->vm_end >= vma->vm_mm->brk) {
3839 name = strncpy(tmp, "[heap]", sizeof(tmp));
3840 goto got_name;
3841 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
3842 vma->vm_end >= vma->vm_mm->start_stack) {
3843 name = strncpy(tmp, "[stack]", sizeof(tmp));
3844 goto got_name;
3833 } 3845 }
3834 3846
3835 name = strncpy(tmp, "//anon", sizeof(tmp)); 3847 name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3846,17 +3858,17 @@ got_name:
3846 3858
3847 rcu_read_lock(); 3859 rcu_read_lock();
3848 cpuctx = &get_cpu_var(perf_cpu_context); 3860 cpuctx = &get_cpu_var(perf_cpu_context);
3849 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3861 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
3850 ctx = rcu_dereference(current->perf_event_ctxp); 3862 ctx = rcu_dereference(current->perf_event_ctxp);
3851 if (ctx) 3863 if (ctx)
3852 perf_event_mmap_ctx(ctx, mmap_event); 3864 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
3853 put_cpu_var(perf_cpu_context); 3865 put_cpu_var(perf_cpu_context);
3854 rcu_read_unlock(); 3866 rcu_read_unlock();
3855 3867
3856 kfree(buf); 3868 kfree(buf);
3857} 3869}
3858 3870
3859void __perf_event_mmap(struct vm_area_struct *vma) 3871void perf_event_mmap(struct vm_area_struct *vma)
3860{ 3872{
3861 struct perf_mmap_event mmap_event; 3873 struct perf_mmap_event mmap_event;
3862 3874
@@ -4018,14 +4030,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
4018 hwc->last_period = hwc->sample_period; 4030 hwc->last_period = hwc->sample_period;
4019 4031
4020again: 4032again:
4021 old = val = atomic64_read(&hwc->period_left); 4033 old = val = local64_read(&hwc->period_left);
4022 if (val < 0) 4034 if (val < 0)
4023 return 0; 4035 return 0;
4024 4036
4025 nr = div64_u64(period + val, period); 4037 nr = div64_u64(period + val, period);
4026 offset = nr * period; 4038 offset = nr * period;
4027 val -= offset; 4039 val -= offset;
4028 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) 4040 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4029 goto again; 4041 goto again;
4030 4042
4031 return nr; 4043 return nr;
@@ -4064,7 +4076,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4064{ 4076{
4065 struct hw_perf_event *hwc = &event->hw; 4077 struct hw_perf_event *hwc = &event->hw;
4066 4078
4067 atomic64_add(nr, &event->count); 4079 local64_add(nr, &event->count);
4068 4080
4069 if (!regs) 4081 if (!regs)
4070 return; 4082 return;
@@ -4075,7 +4087,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4075 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4087 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4076 return perf_swevent_overflow(event, 1, nmi, data, regs); 4088 return perf_swevent_overflow(event, 1, nmi, data, regs);
4077 4089
4078 if (atomic64_add_negative(nr, &hwc->period_left)) 4090 if (local64_add_negative(nr, &hwc->period_left))
4079 return; 4091 return;
4080 4092
4081 perf_swevent_overflow(event, 0, nmi, data, regs); 4093 perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4213,14 +4225,12 @@ int perf_swevent_get_recursion_context(void)
4213} 4225}
4214EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4226EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4215 4227
4216void perf_swevent_put_recursion_context(int rctx) 4228void inline perf_swevent_put_recursion_context(int rctx)
4217{ 4229{
4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4230 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4219 barrier(); 4231 barrier();
4220 cpuctx->recursion[rctx]--; 4232 cpuctx->recursion[rctx]--;
4221} 4233}
4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4223
4224 4234
4225void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4235void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4226 struct pt_regs *regs, u64 addr) 4236 struct pt_regs *regs, u64 addr)
@@ -4368,8 +4378,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4368 u64 now; 4378 u64 now;
4369 4379
4370 now = cpu_clock(cpu); 4380 now = cpu_clock(cpu);
4371 prev = atomic64_xchg(&event->hw.prev_count, now); 4381 prev = local64_xchg(&event->hw.prev_count, now);
4372 atomic64_add(now - prev, &event->count); 4382 local64_add(now - prev, &event->count);
4373} 4383}
4374 4384
4375static int cpu_clock_perf_event_enable(struct perf_event *event) 4385static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4377,7 +4387,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
4377 struct hw_perf_event *hwc = &event->hw; 4387 struct hw_perf_event *hwc = &event->hw;
4378 int cpu = raw_smp_processor_id(); 4388 int cpu = raw_smp_processor_id();
4379 4389
4380 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4390 local64_set(&hwc->prev_count, cpu_clock(cpu));
4381 perf_swevent_start_hrtimer(event); 4391 perf_swevent_start_hrtimer(event);
4382 4392
4383 return 0; 4393 return 0;
@@ -4409,9 +4419,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4409 u64 prev; 4419 u64 prev;
4410 s64 delta; 4420 s64 delta;
4411 4421
4412 prev = atomic64_xchg(&event->hw.prev_count, now); 4422 prev = local64_xchg(&event->hw.prev_count, now);
4413 delta = now - prev; 4423 delta = now - prev;
4414 atomic64_add(delta, &event->count); 4424 local64_add(delta, &event->count);
4415} 4425}
4416 4426
4417static int task_clock_perf_event_enable(struct perf_event *event) 4427static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4421,7 +4431,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
4421 4431
4422 now = event->ctx->time; 4432 now = event->ctx->time;
4423 4433
4424 atomic64_set(&hwc->prev_count, now); 4434 local64_set(&hwc->prev_count, now);
4425 4435
4426 perf_swevent_start_hrtimer(event); 4436 perf_swevent_start_hrtimer(event);
4427 4437
@@ -4601,7 +4611,7 @@ static int perf_tp_event_match(struct perf_event *event,
4601} 4611}
4602 4612
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 4613void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head) 4614 struct pt_regs *regs, struct hlist_head *head, int rctx)
4605{ 4615{
4606 struct perf_sample_data data; 4616 struct perf_sample_data data;
4607 struct perf_event *event; 4617 struct perf_event *event;
@@ -4615,12 +4625,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4615 perf_sample_data_init(&data, addr); 4625 perf_sample_data_init(&data, addr);
4616 data.raw = &raw; 4626 data.raw = &raw;
4617 4627
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4628 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs)) 4629 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs); 4630 perf_swevent_add(event, count, 1, &data, regs);
4622 } 4631 }
4623 rcu_read_unlock(); 4632
4633 perf_swevent_put_recursion_context(rctx);
4624} 4634}
4625EXPORT_SYMBOL_GPL(perf_tp_event); 4635EXPORT_SYMBOL_GPL(perf_tp_event);
4626 4636
@@ -4864,7 +4874,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 hwc->sample_period = 1; 4874 hwc->sample_period = 1;
4865 hwc->last_period = hwc->sample_period; 4875 hwc->last_period = hwc->sample_period;
4866 4876
4867 atomic64_set(&hwc->period_left, hwc->sample_period); 4877 local64_set(&hwc->period_left, hwc->sample_period);
4868 4878
4869 /* 4879 /*
4870 * we currently do not support PERF_FORMAT_GROUP on inherited events 4880 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4913,7 +4923,7 @@ done:
4913 4923
4914 if (!event->parent) { 4924 if (!event->parent) {
4915 atomic_inc(&nr_events); 4925 atomic_inc(&nr_events);
4916 if (event->attr.mmap) 4926 if (event->attr.mmap || event->attr.mmap_data)
4917 atomic_inc(&nr_mmap_events); 4927 atomic_inc(&nr_mmap_events);
4918 if (event->attr.comm) 4928 if (event->attr.comm)
4919 atomic_inc(&nr_comm_events); 4929 atomic_inc(&nr_comm_events);
@@ -5007,7 +5017,7 @@ err_size:
5007static int 5017static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5018perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5009{ 5019{
5010 struct perf_mmap_data *data = NULL, *old_data = NULL; 5020 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5011 int ret = -EINVAL; 5021 int ret = -EINVAL;
5012 5022
5013 if (!output_event) 5023 if (!output_event)
@@ -5037,19 +5047,19 @@ set:
5037 5047
5038 if (output_event) { 5048 if (output_event) {
5039 /* get the buffer we want to redirect to */ 5049 /* get the buffer we want to redirect to */
5040 data = perf_mmap_data_get(output_event); 5050 buffer = perf_buffer_get(output_event);
5041 if (!data) 5051 if (!buffer)
5042 goto unlock; 5052 goto unlock;
5043 } 5053 }
5044 5054
5045 old_data = event->data; 5055 old_buffer = event->buffer;
5046 rcu_assign_pointer(event->data, data); 5056 rcu_assign_pointer(event->buffer, buffer);
5047 ret = 0; 5057 ret = 0;
5048unlock: 5058unlock:
5049 mutex_unlock(&event->mmap_mutex); 5059 mutex_unlock(&event->mmap_mutex);
5050 5060
5051 if (old_data) 5061 if (old_buffer)
5052 perf_mmap_data_put(old_data); 5062 perf_buffer_put(old_buffer);
5053out: 5063out:
5054 return ret; 5064 return ret;
5055} 5065}
@@ -5298,7 +5308,7 @@ inherit_event(struct perf_event *parent_event,
5298 hwc->sample_period = sample_period; 5308 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period; 5309 hwc->last_period = sample_period;
5300 5310
5301 atomic64_set(&hwc->period_left, sample_period); 5311 local64_set(&hwc->period_left, sample_period);
5302 } 5312 }
5303 5313
5304 child_event->overflow_handler = parent_event->overflow_handler; 5314 child_event->overflow_handler = parent_event->overflow_handler;
@@ -5359,12 +5369,12 @@ static void sync_child_event(struct perf_event *child_event,
5359 if (child_event->attr.inherit_stat) 5369 if (child_event->attr.inherit_stat)
5360 perf_event_read_event(child_event, child); 5370 perf_event_read_event(child_event, child);
5361 5371
5362 child_val = atomic64_read(&child_event->count); 5372 child_val = perf_event_count(child_event);
5363 5373
5364 /* 5374 /*
5365 * Add back the child's count to the parent's count: 5375 * Add back the child's count to the parent's count:
5366 */ 5376 */
5367 atomic64_add(child_val, &parent_event->count); 5377 atomic64_add(child_val, &parent_event->child_count);
5368 atomic64_add(child_event->total_time_enabled, 5378 atomic64_add(child_event->total_time_enabled,
5369 &parent_event->child_total_time_enabled); 5379 &parent_event->child_total_time_enabled);
5370 atomic64_add(child_event->total_time_running, 5380 atomic64_add(child_event->total_time_running,
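
The perf_event.c hunks rename struct perf_mmap_data to struct perf_buffer and move the event's own counter from atomic64_t to local64_t, while totals from exiting children are now accumulated in a separate child_count and folded back in through the new perf_event_count() helper. The sketch below is a simplified, single-threaded userspace model of that bookkeeping split; the struct and helper names echo the diff, but the types only stand in for the kernel's local64_t/atomic64_t and this is not the real implementation.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdatomic.h>

    struct event {
    	uint64_t count;               /* local64_t in the kernel: touched only by the owning context,
    	                               * so a plain integer suffices in this single-threaded sketch */
    	_Atomic uint64_t child_count; /* atomic64_t in the kernel: children add from other contexts */
    };

    /* Mirrors perf_event_count() from the diff: own count plus child totals. */
    static uint64_t event_count(struct event *e)
    {
    	return e->count + atomic_load(&e->child_count);
    }

    /* Mirrors what sync_child_event() now does: fold the child's total into
     * child_count rather than into the parent's own count. */
    static void child_exits(struct event *parent, uint64_t child_total)
    {
    	atomic_fetch_add(&parent->child_count, child_total);
    }

    int main(void)
    {
    	struct event parent = { .count = 100, .child_count = 0 };

    	child_exits(&parent, 40);
    	child_exits(&parent, 60);
    	printf("total = %llu\n", (unsigned long long)event_count(&parent)); /* 200 */
    	return 0;
    }

The design point the split illustrates: fast-path updates to the event's own count no longer need cross-CPU atomicity, while the comparatively rare child-exit accumulation keeps it.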
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c132d26..d55c6fb8d087 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid)
122 atomic_inc(&map->nr_free); 122 atomic_inc(&map->nr_free);
123} 123}
124 124
125/*
126 * If we started walking pids at 'base', is 'a' seen before 'b'?
127 */
128static int pid_before(int base, int a, int b)
129{
130 /*
131 * This is the same as saying
132 *
133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
134 * and that mapping orders 'a' and 'b' with respect to 'base'.
135 */
136 return (unsigned)(a - base) < (unsigned)(b - base);
137}
138
139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid.
141 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately.
143 *
144 * Since pids rollover, it is not sufficient to just pick the bigger
145 * value. We have to consider where we started counting from.
146 *
147 * 'base' is the value of pid_ns->last_pid that we observed when
148 * we started looking for a pid.
149 *
150 * 'pid' is the pid that we eventually found.
151 */
152static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
153{
154 int prev;
155 int last_write = base;
156 do {
157 prev = last_write;
158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
159 } while ((prev != last_write) && (pid_before(base, last_write, pid)));
160}
161
125static int alloc_pidmap(struct pid_namespace *pid_ns) 162static int alloc_pidmap(struct pid_namespace *pid_ns)
126{ 163{
127 int i, offset, max_scan, pid, last = pid_ns->last_pid; 164 int i, offset, max_scan, pid, last = pid_ns->last_pid;
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
132 pid = RESERVED_PIDS; 169 pid = RESERVED_PIDS;
133 offset = pid & BITS_PER_PAGE_MASK; 170 offset = pid & BITS_PER_PAGE_MASK;
134 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
135 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 172 /*
173 * If last_pid points into the middle of the map->page we
174 * want to scan this bitmap block twice, the second time
175 * we start with offset == 0 (or RESERVED_PIDS).
176 */
177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
136 for (i = 0; i <= max_scan; ++i) { 178 for (i = 0; i <= max_scan; ++i) {
137 if (unlikely(!map->page)) { 179 if (unlikely(!map->page)) {
138 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
154 do { 196 do {
155 if (!test_and_set_bit(offset, map->page)) { 197 if (!test_and_set_bit(offset, map->page)) {
156 atomic_dec(&map->nr_free); 198 atomic_dec(&map->nr_free);
157 pid_ns->last_pid = pid; 199 set_last_pid(pid_ns, last, pid);
158 return pid; 200 return pid;
159 } 201 }
160 offset = find_next_offset(map, offset); 202 offset = find_next_offset(map, offset);
161 pid = mk_pid(pid_ns, map, offset); 203 pid = mk_pid(pid_ns, map, offset);
162 /* 204 } while (offset < BITS_PER_PAGE && pid < pid_max);
163 * find_next_offset() found a bit, the pid from it
164 * is in-bounds, and if we fell back to the last
165 * bitmap block and the final block was the same
166 * as the starting point, pid is before last_pid.
167 */
168 } while (offset < BITS_PER_PAGE && pid < pid_max &&
169 (i != max_scan || pid < last ||
170 !((last+1) & BITS_PER_PAGE_MASK)));
171 } 205 }
172 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
173 ++map; 207 ++map;
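
The pid.c hunk replaces the plain last_pid assignment with set_last_pid(), a lockless cmpxchg loop that only lets the "later" pid win, where "later" is decided relative to the base the allocator started from so that rollover past pid_max is handled. The following userspace sketch reproduces that ordering logic; GCC's __sync_val_compare_and_swap is used as a stand-in for the kernel's cmpxchg(), and the main() scenario is purely illustrative.

    #include <stdio.h>

    /* Wrap-aware ordering from the diff: 'a' is seen before 'b' when walking
     * pids upward from 'base', even across the rollover. */
    static int pid_before(int base, int a, int b)
    {
    	return (unsigned)(a - base) < (unsigned)(b - base);
    }

    /* Lockless "keep the later value" update, mirroring set_last_pid().
     * Retry only while someone else wrote a value that is still earlier
     * than the pid we found. */
    static void set_last_pid(int *last_pid, int base, int pid)
    {
    	int prev, last_write = base;

    	do {
    		prev = last_write;
    		last_write = __sync_val_compare_and_swap(last_pid, prev, pid);
    	} while (prev != last_write && pid_before(base, last_write, pid));
    }

    int main(void)
    {
    	int last_pid = 300;

    	/* A racing allocator that observed base 300 and found pid 305 ... */
    	set_last_pid(&last_pid, 300, 305);
    	/* ... must not be undone by a slower racer that only found 302. */
    	set_last_pid(&last_pid, 300, 302);
    	printf("last_pid = %d\n", last_pid);   /* stays 305 */
    	return 0;
    }

With the winner guaranteed to be the later value, the old in-loop guard against re-handing out a just-allocated pid becomes unnecessary, which is why the termination condition in alloc_pidmap() shrinks to the simple offset/pid_max check seen above.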
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..b7e4c362361b 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
49 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
50 */ 50 */
51struct pm_qos_request_list { 51enum pm_qos_type {
52 struct list_head list; 52 PM_QOS_MAX, /* return the largest value */
53 union { 53 PM_QOS_MIN /* return the smallest value */
54 s32 value;
55 s32 usec;
56 s32 kbps;
57 };
58 int pm_qos_class;
59}; 54};
60 55
61static s32 max_compare(s32 v1, s32 v2);
62static s32 min_compare(s32 v1, s32 v2);
63
64struct pm_qos_object { 56struct pm_qos_object {
65 struct pm_qos_request_list requests; 57 struct plist_head requests;
66 struct blocking_notifier_head *notifiers; 58 struct blocking_notifier_head *notifiers;
67 struct miscdevice pm_qos_power_miscdev; 59 struct miscdevice pm_qos_power_miscdev;
68 char *name; 60 char *name;
69 s32 default_value; 61 s32 default_value;
70 atomic_t target_value; 62 enum pm_qos_type type;
71 s32 (*comparitor)(s32, s32);
72}; 63};
73 64
65static DEFINE_SPINLOCK(pm_qos_lock);
66
74static struct pm_qos_object null_pm_qos; 67static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 69static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, 70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
78 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 72 .name = "cpu_dma_latency",
80 .default_value = 2000 * USEC_PER_SEC, 73 .default_value = 2000 * USEC_PER_SEC,
81 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 74 .type = PM_QOS_MIN,
82 .comparitor = min_compare
83}; 75};
84 76
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 78static struct pm_qos_object network_lat_pm_qos = {
87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, 79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
88 .notifiers = &network_lat_notifier, 80 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 81 .name = "network_latency",
90 .default_value = 2000 * USEC_PER_SEC, 82 .default_value = 2000 * USEC_PER_SEC,
91 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 83 .type = PM_QOS_MIN
92 .comparitor = min_compare
93}; 84};
94 85
95 86
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 88static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, 89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
99 .notifiers = &network_throughput_notifier, 90 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 91 .name = "network_throughput",
101 .default_value = 0, 92 .default_value = 0,
102 .target_value = ATOMIC_INIT(0), 93 .type = PM_QOS_MAX,
103 .comparitor = max_compare
104}; 94};
105 95
106 96
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
111 &network_throughput_pm_qos 101 &network_throughput_pm_qos
112}; 102};
113 103
114static DEFINE_SPINLOCK(pm_qos_lock);
115
116static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
117 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
118static int pm_qos_power_open(struct inode *inode, struct file *filp); 106static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
124 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
125}; 113};
126 114
127/* static helper functions */ 115/* unlocked internal variant */
128static s32 max_compare(s32 v1, s32 v2) 116static inline int pm_qos_get_value(struct pm_qos_object *o)
129{ 117{
130 return max(v1, v2); 118 if (plist_head_empty(&o->requests))
131} 119 return o->default_value;
132 120
133static s32 min_compare(s32 v1, s32 v2) 121 switch (o->type) {
134{ 122 case PM_QOS_MIN:
135 return min(v1, v2); 123 return plist_last(&o->requests)->prio;
136} 124
125 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio;
137 127
128 default:
129 /* runtime check for not using enum */
130 BUG();
131 }
132}
138 133
139static void update_target(int pm_qos_class) 134static void update_target(struct pm_qos_object *o, struct plist_node *node,
135 int del, int value)
140{ 136{
141 s32 extreme_value;
142 struct pm_qos_request_list *node;
143 unsigned long flags; 137 unsigned long flags;
144 int call_notifier = 0; 138 int prev_value, curr_value;
145 139
146 spin_lock_irqsave(&pm_qos_lock, flags); 140 spin_lock_irqsave(&pm_qos_lock, flags);
147 extreme_value = pm_qos_array[pm_qos_class]->default_value; 141 prev_value = pm_qos_get_value(o);
148 list_for_each_entry(node, 142 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
149 &pm_qos_array[pm_qos_class]->requests.list, list) { 143 if (value != PM_QOS_DEFAULT_VALUE) {
150 extreme_value = pm_qos_array[pm_qos_class]->comparitor( 144 /*
151 extreme_value, node->value); 145 * to change the list, we atomically remove, reinit
152 } 146 * with new value and add, then see if the extremal
153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != 147 * changed
154 extreme_value) { 148 */
155 call_notifier = 1; 149 plist_del(node, &o->requests);
156 atomic_set(&pm_qos_array[pm_qos_class]->target_value, 150 plist_node_init(node, value);
157 extreme_value); 151 plist_add(node, &o->requests);
158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, 152 } else if (del) {
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value)); 153 plist_del(node, &o->requests);
154 } else {
155 plist_add(node, &o->requests);
160 } 156 }
157 curr_value = pm_qos_get_value(o);
161 spin_unlock_irqrestore(&pm_qos_lock, flags); 158 spin_unlock_irqrestore(&pm_qos_lock, flags);
162 159
163 if (call_notifier) 160 if (prev_value != curr_value)
164 blocking_notifier_call_chain( 161 blocking_notifier_call_chain(o->notifiers,
165 pm_qos_array[pm_qos_class]->notifiers, 162 (unsigned long)curr_value,
166 (unsigned long) extreme_value, NULL); 163 NULL);
167} 164}
168 165
169static int register_pm_qos_misc(struct pm_qos_object *qos) 166static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,42 +193,53 @@ static int find_pm_qos_object_by_minor(int minor)
196 */ 193 */
197int pm_qos_request(int pm_qos_class) 194int pm_qos_request(int pm_qos_class)
198{ 195{
199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 196 unsigned long flags;
197 int value;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return value;
200} 204}
201EXPORT_SYMBOL_GPL(pm_qos_request); 205EXPORT_SYMBOL_GPL(pm_qos_request);
202 206
207int pm_qos_request_active(struct pm_qos_request_list *req)
208{
209 return req->pm_qos_class != 0;
210}
211EXPORT_SYMBOL_GPL(pm_qos_request_active);
212
203/** 213/**
204 * pm_qos_add_request - inserts new qos request into the list 214 * pm_qos_add_request - inserts new qos request into the list
205 * @pm_qos_class: identifies which list of qos request to us 215 * @dep: pointer to a preallocated handle
216 * @pm_qos_class: identifies which list of qos request to use
206 * @value: defines the qos request 217 * @value: defines the qos request
207 * 218 *
208 * This function inserts a new entry in the pm_qos_class list of requested qos 219 * This function inserts a new entry in the pm_qos_class list of requested qos
209 * performance characteristics. It recomputes the aggregate QoS expectations 220 * performance characteristics. It recomputes the aggregate QoS expectations
210 * for the pm_qos_class of parameters, and returns the pm_qos_request list 221 * for the pm_qos_class of parameters and initializes the pm_qos_request_list
211 * element as a handle for use in updating and removal. Call needs to save 222 * handle. Caller needs to save this handle for later use in updates and
212 * this handle for later use. 223 * removal.
213 */ 224 */
214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) 225
226void pm_qos_add_request(struct pm_qos_request_list *dep,
227 int pm_qos_class, s32 value)
215{ 228{
216 struct pm_qos_request_list *dep; 229 struct pm_qos_object *o = pm_qos_array[pm_qos_class];
217 unsigned long flags; 230 int new_value;
218 231
219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); 232 if (pm_qos_request_active(dep)) {
220 if (dep) { 233 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
221 if (value == PM_QOS_DEFAULT_VALUE) 234 return;
222 dep->value = pm_qos_array[pm_qos_class]->default_value;
223 else
224 dep->value = value;
225 dep->pm_qos_class = pm_qos_class;
226
227 spin_lock_irqsave(&pm_qos_lock, flags);
228 list_add(&dep->list,
229 &pm_qos_array[pm_qos_class]->requests.list);
230 spin_unlock_irqrestore(&pm_qos_lock, flags);
231 update_target(pm_qos_class);
232 } 235 }
233 236 if (value == PM_QOS_DEFAULT_VALUE)
234 return dep; 237 new_value = o->default_value;
238 else
239 new_value = value;
240 plist_node_init(&dep->list, new_value);
241 dep->pm_qos_class = pm_qos_class;
242 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
235} 243}
236EXPORT_SYMBOL_GPL(pm_qos_add_request); 244EXPORT_SYMBOL_GPL(pm_qos_add_request);
237 245
@@ -246,27 +254,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
246 * Attempts are made to make this code callable on hot code paths. 254 * Attempts are made to make this code callable on hot code paths.
247 */ 255 */
248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 256void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value) 257 s32 new_value)
250{ 258{
251 unsigned long flags;
252 int pending_update = 0;
253 s32 temp; 259 s32 temp;
260 struct pm_qos_object *o;
261
262 if (!pm_qos_req) /*guard against callers passing in null */
263 return;
254 264
255 if (pm_qos_req) { /*guard against callers passing in null */ 265 if (!pm_qos_request_active(pm_qos_req)) {
256 spin_lock_irqsave(&pm_qos_lock, flags); 266 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
257 if (new_value == PM_QOS_DEFAULT_VALUE) 267 return;
258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
259 else
260 temp = new_value;
261
262 if (temp != pm_qos_req->value) {
263 pending_update = 1;
264 pm_qos_req->value = temp;
265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
269 } 268 }
269
270 o = pm_qos_array[pm_qos_req->pm_qos_class];
271
272 if (new_value == PM_QOS_DEFAULT_VALUE)
273 temp = o->default_value;
274 else
275 temp = new_value;
276
277 if (temp != pm_qos_req->list.prio)
278 update_target(o, &pm_qos_req->list, 0, temp);
270} 279}
271EXPORT_SYMBOL_GPL(pm_qos_update_request); 280EXPORT_SYMBOL_GPL(pm_qos_update_request);
272 281
@@ -280,19 +289,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
280 */ 289 */
281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 290void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
282{ 291{
283 unsigned long flags; 292 struct pm_qos_object *o;
284 int qos_class;
285 293
286 if (pm_qos_req == NULL) 294 if (pm_qos_req == NULL)
287 return; 295 return;
288 /* silent return to keep pcm code cleaner */ 296 /* silent return to keep pcm code cleaner */
289 297
290 qos_class = pm_qos_req->pm_qos_class; 298 if (!pm_qos_request_active(pm_qos_req)) {
291 spin_lock_irqsave(&pm_qos_lock, flags); 299 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
292 list_del(&pm_qos_req->list); 300 return;
293 kfree(pm_qos_req); 301 }
294 spin_unlock_irqrestore(&pm_qos_lock, flags); 302
295 update_target(qos_class); 303 o = pm_qos_array[pm_qos_req->pm_qos_class];
304 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
305 memset(pm_qos_req, 0, sizeof(*pm_qos_req));
296} 306}
297EXPORT_SYMBOL_GPL(pm_qos_remove_request); 307EXPORT_SYMBOL_GPL(pm_qos_remove_request);
298 308
@@ -340,8 +350,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
340 350
341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 351 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
342 if (pm_qos_class >= 0) { 352 if (pm_qos_class >= 0) {
343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class, 353 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
344 PM_QOS_DEFAULT_VALUE); 354 if (!req)
355 return -ENOMEM;
356
357 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
358 filp->private_data = req;
345 359
346 if (filp->private_data) 360 if (filp->private_data)
347 return 0; 361 return 0;
@@ -353,8 +367,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
353{ 367{
354 struct pm_qos_request_list *req; 368 struct pm_qos_request_list *req;
355 369
356 req = (struct pm_qos_request_list *)filp->private_data; 370 req = filp->private_data;
357 pm_qos_remove_request(req); 371 pm_qos_remove_request(req);
372 kfree(req);
358 373
359 return 0; 374 return 0;
360} 375}
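
The open()/release() path reworked above is what userspace reaches through the /dev/cpu_dma_latency, /dev/network_latency and /dev/network_throughput misc devices. The sketch below illustrates the documented usage and is not part of the patch: opening the device adds a request at the default value, writing a raw 32-bit value updates it, and closing the descriptor drops it, which after this change ends in the kfree() added to pm_qos_power_release().

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int32_t max_latency_us = 10;            /* ask for <= 10 us exit latency */
        int fd = open("/dev/cpu_dma_latency", O_WRONLY);

        if (fd < 0) {
                perror("open /dev/cpu_dma_latency");
                return 1;
        }
        if (write(fd, &max_latency_us, sizeof(max_latency_us)) < 0) {
                perror("write");
                close(fd);
                return 1;
        }
        puts("holding a 10 us cpu_dma_latency request for 5 seconds");
        sleep(5);
        close(fd);                              /* the request is removed here */
        return 0;
}
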
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..6842eeba5879 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -16,13 +16,13 @@
16 * siglock protection since other code may update expiration cache as 16 * siglock protection since other code may update expiration cache as
17 * well. 17 * well.
18 */ 18 */
19void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
20{ 20{
21 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
22 22
23 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&task->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&task->sighand->siglock);
26} 26}
27 27
28static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
232 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{ 234{
235 struct sighand_struct *sighand; 235 struct signal_struct *sig = tsk->signal;
236 struct signal_struct *sig;
237 struct task_struct *t; 236 struct task_struct *t;
238 237
239 *times = INIT_CPUTIME; 238 times->utime = sig->utime;
239 times->stime = sig->stime;
240 times->sum_exec_runtime = sig->sum_sched_runtime;
240 241
241 rcu_read_lock(); 242 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand); 243 /* make sure we can trust tsk->thread_group list */
243 if (!sighand) 244 if (!likely(pid_alive(tsk)))
244 goto out; 245 goto out;
245 246
246 sig = tsk->signal;
247
248 t = tsk; 247 t = tsk;
249 do { 248 do {
250 times->utime = cputime_add(times->utime, t->utime); 249 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime); 250 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime; 251 times->sum_exec_runtime += t->se.sum_exec_runtime;
253 252 } while_each_thread(tsk, t);
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out: 253out:
261 rcu_read_unlock(); 254 rcu_read_unlock();
262} 255}
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1279{ 1272{
1280 struct signal_struct *sig; 1273 struct signal_struct *sig;
1281 1274
1282 /* tsk == current, ensure it is safe to use ->signal/sighand */
1283 if (unlikely(tsk->exit_state))
1284 return 0;
1285
1286 if (!task_cputime_zero(&tsk->cputime_expires)) { 1275 if (!task_cputime_zero(&tsk->cputime_expires)) {
1287 struct task_cputime task_sample = { 1276 struct task_cputime task_sample = {
1288 .utime = tsk->utime, 1277 .utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1298 if (sig->cputimer.running) { 1287 if (sig->cputimer.running) {
1299 struct task_cputime group_sample; 1288 struct task_cputime group_sample;
1300 1289
1301 thread_group_cputimer(tsk, &group_sample); 1290 spin_lock(&sig->cputimer.lock);
1291 group_sample = sig->cputimer.cputime;
1292 spin_unlock(&sig->cputimer.lock);
1293
1302 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1294 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1303 return 1; 1295 return 1;
1304 } 1296 }
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1315{ 1307{
1316 LIST_HEAD(firing); 1308 LIST_HEAD(firing);
1317 struct k_itimer *timer, *next; 1309 struct k_itimer *timer, *next;
1310 unsigned long flags;
1318 1311
1319 BUG_ON(!irqs_disabled()); 1312 BUG_ON(!irqs_disabled());
1320 1313
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1325 if (!fastpath_timer_check(tsk)) 1318 if (!fastpath_timer_check(tsk))
1326 return; 1319 return;
1327 1320
1328 spin_lock(&tsk->sighand->siglock); 1321 if (!lock_task_sighand(tsk, &flags))
1322 return;
1329 /* 1323 /*
1330 * Here we take off tsk->signal->cpu_timers[N] and 1324 * Here we take off tsk->signal->cpu_timers[N] and
1331 * tsk->cpu_timers[N] all the timers that are firing, and 1325 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1347 * that gets the timer lock before we do will give it up and 1341 * that gets the timer lock before we do will give it up and
1348 * spin until we've taken care of that timer below. 1342 * spin until we've taken care of that timer below.
1349 */ 1343 */
1350 spin_unlock(&tsk->sighand->siglock); 1344 unlock_task_sighand(tsk, &flags);
1351 1345
1352 /* 1346 /*
1353 * Now that all the timers on our list have the firing flag, 1347 * Now that all the timers on our list have the firing flag,
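
thread_group_cputime() above is the kernel side of CLOCK_PROCESS_CPUTIME_ID: the group total starts from sig->utime/stime/sum_sched_runtime (which already contains every thread that has exited) and then adds the live threads walked with while_each_thread(). The standalone program below is only an illustration (link with -pthread and, on older glibc, -lrt); it burns about a second of CPU in each of two workers and then compares the caller's per-thread clock with the process-wide clock, which still reflects the exited workers.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static void *burn(void *arg)
{
        volatile unsigned long x = 0;
        struct timespec ts;

        (void)arg;
        do {
                for (int i = 0; i < 1000000; i++)
                        x += i;
                clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
        } while (ts.tv_sec < 1);                /* ~1 s of CPU per worker */
        return NULL;
}

static double secs(clockid_t id)
{
        struct timespec ts;

        clock_gettime(id, &ts);
        return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, burn, NULL);
        pthread_create(&b, NULL, burn, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        printf("this thread:   %.2f s\n", secs(CLOCK_THREAD_CPUTIME_ID));
        printf("whole process: %.2f s\n", secs(CLOCK_PROCESS_CPUTIME_ID));
        return 0;
}
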
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad723420acc3..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 562
563 if (copy_to_user(created_timer_id,
564 &new_timer_id, sizeof (new_timer_id))) {
565 error = -EFAULT;
566 goto out;
567 }
568 if (timer_event_spec) { 563 if (timer_event_spec) {
569 if (copy_from_user(&event, timer_event_spec, sizeof (event))) { 564 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
570 error = -EFAULT; 565 error = -EFAULT;
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
590 new_timer->sigq->info.si_tid = new_timer->it_id; 585 new_timer->sigq->info.si_tid = new_timer->it_id;
591 new_timer->sigq->info.si_code = SI_TIMER; 586 new_timer->sigq->info.si_code = SI_TIMER;
592 587
588 if (copy_to_user(created_timer_id,
589 &new_timer_id, sizeof (new_timer_id))) {
590 error = -EFAULT;
591 goto out;
592 }
593
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error) 595 if (error)
595 goto out; 596 goto out;
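
Moving the copy_to_user() only changes when the new id becomes visible to userspace; the caller's side of timer_create() is unchanged. For reference, a minimal user of that id (illustrative only; link with -lrt on older glibc): the id written back by the syscall is what timer_settime() arms and timer_delete() tears down.

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static void on_tick(int sig, siginfo_t *si, void *ctx)
{
        (void)sig; (void)si; (void)ctx;
        write(STDOUT_FILENO, "tick\n", 5);      /* async-signal-safe */
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = on_tick, .sa_flags = SA_SIGINFO };
        struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGRTMIN };
        struct itimerspec its = {
                .it_value    = { .tv_sec = 1 },
                .it_interval = { .tv_sec = 1 },
        };
        timer_t tid;

        sigemptyset(&sa.sa_mask);
        sigaction(SIGRTMIN, &sa, NULL);

        if (timer_create(CLOCK_MONOTONIC, &sev, &tid)) {  /* id lands in tid */
                perror("timer_create");
                return 1;
        }
        timer_settime(tid, 0, &its, NULL);
        for (int i = 0; i < 5; i++)
                pause();                        /* each tick interrupts pause() */
        timer_delete(tid);
        return 0;
}
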
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 97024fd40cd5..83bbc7c02df9 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..c77963938bca 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
277 goto Enable_irqs; 277 goto Enable_irqs;
278 } 278 }
279 279
280 if (hibernation_test(TEST_CORE)) 280 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
281 goto Power_up; 281 goto Power_up;
282 282
283 in_suspend = 1; 283 in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
288 error); 288 error);
289 /* Restore control flow magically appears here */ 289 /* Restore control flow magically appears here */
290 restore_processor_state(); 290 restore_processor_state();
291 if (!in_suspend) 291 if (!in_suspend) {
292 events_check_enabled = false;
292 platform_leave(platform_mode); 293 platform_leave(platform_mode);
294 }
293 295
294 Power_up: 296 Power_up:
295 sysdev_resume(); 297 sysdev_resume();
@@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
328 330
329 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
330 if (error) 332 if (error)
331 return error; 333 goto Close;
332 334
333 /* Preallocate image memory before shutting down devices. */ 335 /* Preallocate image memory before shutting down devices. */
334 error = hibernate_preallocate_memory(); 336 error = hibernate_preallocate_memory();
@@ -336,6 +338,7 @@ int hibernation_snapshot(int platform_mode)
336 goto Close; 338 goto Close;
337 339
338 suspend_console(); 340 suspend_console();
341 hibernation_freeze_swap();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
340 error = dpm_suspend_start(PMSG_FREEZE); 343 error = dpm_suspend_start(PMSG_FREEZE);
341 if (error) 344 if (error)
@@ -511,18 +514,24 @@ int hibernation_platform_enter(void)
511 514
512 local_irq_disable(); 515 local_irq_disable();
513 sysdev_suspend(PMSG_HIBERNATE); 516 sysdev_suspend(PMSG_HIBERNATE);
517 if (!pm_check_wakeup_events()) {
518 error = -EAGAIN;
519 goto Power_up;
520 }
521
514 hibernation_ops->enter(); 522 hibernation_ops->enter();
515 /* We should never get here */ 523 /* We should never get here */
516 while (1); 524 while (1);
517 525
518 /* 526 Power_up:
519 * We don't need to reenable the nonboot CPUs or resume consoles, since 527 sysdev_resume();
520 * the system is going to be halted anyway. 528 local_irq_enable();
521 */ 529 enable_nonboot_cpus();
530
522 Platform_finish: 531 Platform_finish:
523 hibernation_ops->finish(); 532 hibernation_ops->finish();
524 533
525 dpm_suspend_noirq(PMSG_RESTORE); 534 dpm_resume_noirq(PMSG_RESTORE);
526 535
527 Resume_devices: 536 Resume_devices:
528 entering_platform_hibernation = false; 537 entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
204 204
205power_attr(state); 205power_attr(state);
206 206
207#ifdef CONFIG_PM_SLEEP
208/*
209 * The 'wakeup_count' attribute, along with the functions defined in
210 * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
211 * handled in a non-racy way.
212 *
213 * If a wakeup event occurs when the system is in a sleep state, it simply is
214 * woken up. In turn, if an event that would wake the system up from a sleep
215 * state occurs when it is undergoing a transition to that sleep state, the
216 * transition should be aborted. Moreover, if such an event occurs when the
217 * system is in the working state, an attempt to start a transition to the
218 * given sleep state should fail during certain period after the detection of
219 * the event. Using the 'state' attribute alone is not sufficient to satisfy
220 * these requirements, because a wakeup event may occur exactly when 'state'
221 * is being written to and may be delivered to user space right before it is
222 * frozen, so the event will remain only partially processed until the system is
223 * woken up by another event. In particular, it won't cause the transition to
224 * a sleep state to be aborted.
225 *
226 * This difficulty may be overcome if user space uses 'wakeup_count' before
227 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to
 230 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to.
234 */
235
236static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 char *buf)
239{
240 unsigned long val;
241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
243}
244
245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 const char *buf, size_t n)
248{
249 unsigned long val;
250
251 if (sscanf(buf, "%lu", &val) == 1) {
252 if (pm_save_wakeup_count(val))
253 return n;
254 }
255 return -EINVAL;
256}
257
258power_attr(wakeup_count);
259#endif /* CONFIG_PM_SLEEP */
260
207#ifdef CONFIG_PM_TRACE 261#ifdef CONFIG_PM_TRACE
208int pm_trace_enabled; 262int pm_trace_enabled;
209 263
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
236#endif 290#endif
237#ifdef CONFIG_PM_SLEEP 291#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr, 292 &pm_async_attr.attr,
293 &wakeup_count_attr.attr,
239#ifdef CONFIG_PM_DEBUG 294#ifdef CONFIG_PM_DEBUG
240 &pm_test_attr.attr, 295 &pm_test_attr.attr,
241#endif 296#endif
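
The comment block added above spells out the handshake a userspace power manager is expected to follow. Below is a minimal sketch of that side (the sysfs paths are the real attributes; error handling is trimmed, and running it as root really suspends the machine):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char count[32];
        ssize_t n;
        int fd = open("/sys/power/wakeup_count", O_RDWR);

        if (fd < 0) {
                perror("open wakeup_count");
                return 1;
        }
        n = read(fd, count, sizeof(count));     /* snapshot the event count */
        if (n <= 0)
                return 1;

        /* ... the power manager finishes its own pre-suspend work here ... */

        if (write(fd, count, n) < 0) {
                /* a wakeup event raced with us: do not suspend now */
                perror("wakeup_count changed");
                return 1;
        }
        close(fd);

        fd = open("/sys/power/state", O_WRONLY);
        if (fd < 0 || write(fd, "mem", 3) < 0) {
                perror("suspend");
                return 1;
        }
        close(fd);
        return 0;
}
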
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index e8b337006276..d52359374e85 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy)
24 24
25static DECLARE_WORK(poweroff_work, do_poweroff); 25static DECLARE_WORK(poweroff_work, do_poweroff);
26 26
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..028a99598f49 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h>
18 19
19/* 20/*
20 * Timeout for stopping processes 21 * Timeout for stopping processes
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only)
35 struct task_struct *g, *p; 36 struct task_struct *g, *p;
36 unsigned long end_time; 37 unsigned long end_time;
37 unsigned int todo; 38 unsigned int todo;
39 bool wq_busy = false;
38 struct timeval start, end; 40 struct timeval start, end;
39 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
40 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only)
42 do_gettimeofday(&start); 44 do_gettimeofday(&start);
43 45
44 end_time = jiffies + TIMEOUT; 46 end_time = jiffies + TIMEOUT;
47
48 if (!sig_only)
49 freeze_workqueues_begin();
50
45 while (true) { 51 while (true) {
46 todo = 0; 52 todo = 0;
47 read_lock(&tasklist_lock); 53 read_lock(&tasklist_lock);
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only)
63 todo++; 69 todo++;
64 } while_each_thread(g, p); 70 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 71 read_unlock(&tasklist_lock);
72
73 if (!sig_only) {
74 wq_busy = freeze_workqueues_busy();
75 todo += wq_busy;
76 }
77
66 if (!todo || time_after(jiffies, end_time)) 78 if (!todo || time_after(jiffies, end_time))
67 break; 79 break;
68 80
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only)
86 */ 98 */
87 printk("\n"); 99 printk("\n");
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 101 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 102 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy);
104
105 thaw_workqueues();
106
91 read_lock(&tasklist_lock); 107 read_lock(&tasklist_lock);
92 do_each_thread(g, p) { 108 do_each_thread(g, p) {
93 task_lock(p); 109 task_lock(p);
@@ -157,6 +173,7 @@ void thaw_processes(void)
157 oom_killer_enable(); 173 oom_killer_enable();
158 174
159 printk("Restarting tasks ... "); 175 printk("Restarting tasks ... ");
176 thaw_workqueues();
160 thaw_tasks(true); 177 thaw_tasks(true);
161 thaw_tasks(false); 178 thaw_tasks(false);
162 schedule(); 179 schedule();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..5e7edfb05e66 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * This file provides system snapshot/restore functionality for swsusp. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -1086,6 +1086,7 @@ void swsusp_free(void)
1086 buffer = NULL; 1086 buffer = NULL;
1087 alloc_normal = 0; 1087 alloc_normal = 0;
1088 alloc_highmem = 0; 1088 alloc_highmem = 0;
1089 hibernation_thaw_swap();
1089} 1090}
1090 1091
1091/* Helper functions used for the shrinking of memory. */ 1092/* Helper functions used for the shrinking of memory. */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
136 if (suspend_ops->prepare) { 136 if (suspend_ops->prepare) {
137 error = suspend_ops->prepare(); 137 error = suspend_ops->prepare();
138 if (error) 138 if (error)
139 return error; 139 goto Platform_finish;
140 } 140 }
141 141
142 error = dpm_suspend_noirq(PMSG_SUSPEND); 142 error = dpm_suspend_noirq(PMSG_SUSPEND);
143 if (error) { 143 if (error) {
144 printk(KERN_ERR "PM: Some devices failed to power down\n"); 144 printk(KERN_ERR "PM: Some devices failed to power down\n");
145 goto Platfrom_finish; 145 goto Platform_finish;
146 } 146 }
147 147
148 if (suspend_ops->prepare_late) { 148 if (suspend_ops->prepare_late) {
149 error = suspend_ops->prepare_late(); 149 error = suspend_ops->prepare_late();
150 if (error) 150 if (error)
151 goto Power_up_devices; 151 goto Platform_wake;
152 } 152 }
153 153
154 if (suspend_test(TEST_PLATFORM)) 154 if (suspend_test(TEST_PLATFORM))
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
163 163
164 error = sysdev_suspend(PMSG_SUSPEND); 164 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 165 if (!error) {
166 if (!suspend_test(TEST_CORE)) 166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
167 error = suspend_ops->enter(state); 167 error = suspend_ops->enter(state);
168 events_check_enabled = false;
169 }
168 sysdev_resume(); 170 sysdev_resume();
169 } 171 }
170 172
@@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
178 if (suspend_ops->wake) 180 if (suspend_ops->wake)
179 suspend_ops->wake(); 181 suspend_ops->wake();
180 182
181 Power_up_devices:
182 dpm_resume_noirq(PMSG_RESUME); 183 dpm_resume_noirq(PMSG_RESUME);
183 184
184 Platfrom_finish: 185 Platform_finish:
185 if (suspend_ops->finish) 186 if (suspend_ops->finish)
186 suspend_ops->finish(); 187 suspend_ops->finish();
187 188
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..5d0059eed3e4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
4 * This file provides functions for reading the suspend image from 4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition. 5 * and writing it to a swap partition.
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * 9 *
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
@@ -32,7 +32,7 @@
32/* 32/*
33 * The swap map is a data structure used for keeping track of each page 33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page 34 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries. 35 * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the 36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member. 37 * help of the .next_swap member.
38 * 38 *
@@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap)
136{ 136{
137 unsigned long offset; 137 unsigned long offset;
138 138
139 offset = swp_offset(get_swap_page_of_type(swap)); 139 offset = swp_offset(get_swap_for_hibernation(swap));
140 if (offset) { 140 if (offset) {
141 if (swsusp_extents_insert(offset)) 141 if (swsusp_extents_insert(offset))
142 swap_free(swp_entry(swap, offset)); 142 swap_free_for_hibernation(swp_entry(swap, offset));
143 else 143 else
144 return swapdev_block(swap, offset); 144 return swapdev_block(swap, offset);
145 } 145 }
@@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
148 148
149/** 149/**
150 * free_all_swap_pages - free swap pages allocated for saving image data. 150 * free_all_swap_pages - free swap pages allocated for saving image data.
151 * It also frees the extents used to register which swap entres had been 151 * It also frees the extents used to register which swap entries had been
152 * allocated. 152 * allocated.
153 */ 153 */
154 154
@@ -163,7 +163,7 @@ void free_all_swap_pages(int swap)
163 ext = container_of(node, struct swsusp_extent, node); 163 ext = container_of(node, struct swsusp_extent, node);
164 rb_erase(node, &swsusp_extents); 164 rb_erase(node, &swsusp_extents);
165 for (offset = ext->start; offset <= ext->end; offset++) 165 for (offset = ext->start; offset <= ext->end; offset++)
166 swap_free(swp_entry(swap, offset)); 166 swap_free_for_hibernation(swp_entry(swap, offset));
167 167
168 kfree(ext); 168 kfree(ext);
169 } 169 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..8fe465ac008a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,8 @@
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42 44
@@ -985,6 +987,32 @@ void resume_console(void)
985} 987}
986 988
987/** 989/**
990 * console_cpu_notify - print deferred console messages after CPU hotplug
991 * @self: notifier struct
992 * @action: CPU hotplug event
993 * @hcpu: unused
994 *
995 * If printk() is called from a CPU that is not online yet, the messages
996 * will be spooled but will not show up on the console. This function is
997 * called when a new CPU comes online (or fails to come up), and ensures
998 * that any such output gets printed.
999 */
1000static int __cpuinit console_cpu_notify(struct notifier_block *self,
1001 unsigned long action, void *hcpu)
1002{
1003 switch (action) {
1004 case CPU_ONLINE:
1005 case CPU_DEAD:
1006 case CPU_DYING:
1007 case CPU_DOWN_FAILED:
1008 case CPU_UP_CANCELED:
1009 acquire_console_sem();
1010 release_console_sem();
1011 }
1012 return NOTIFY_OK;
1013}
1014
1015/**
988 * acquire_console_sem - lock the console system for exclusive use. 1016 * acquire_console_sem - lock the console system for exclusive use.
989 * 1017 *
990 * Acquires a semaphore which guarantees that the caller has 1018 * Acquires a semaphore which guarantees that the caller has
@@ -1371,7 +1399,7 @@ int unregister_console(struct console *console)
1371} 1399}
1372EXPORT_SYMBOL(unregister_console); 1400EXPORT_SYMBOL(unregister_console);
1373 1401
1374static int __init disable_boot_consoles(void) 1402static int __init printk_late_init(void)
1375{ 1403{
1376 struct console *con; 1404 struct console *con;
1377 1405
@@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void)
1382 unregister_console(con); 1410 unregister_console(con);
1383 } 1411 }
1384 } 1412 }
1413 hotcpu_notifier(console_cpu_notify, 0);
1385 return 0; 1414 return 0;
1386} 1415}
1387late_initcall(disable_boot_consoles); 1416late_initcall(printk_late_init);
1388 1417
1389#if defined CONFIG_PRINTK 1418#if defined CONFIG_PRINTK
1390 1419
@@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1520 chars = logged_chars; 1549 chars = logged_chars;
1521 spin_unlock_irqrestore(&logbuf_lock, flags); 1550 spin_unlock_irqrestore(&logbuf_lock, flags);
1522 1551
1523 if (logged_chars > end) { 1552 if (chars > end) {
1524 s1 = log_buf + log_buf_len - logged_chars + end; 1553 s1 = log_buf + log_buf_len - chars + end;
1525 l1 = logged_chars - end; 1554 l1 = chars - end;
1526 1555
1527 s2 = log_buf; 1556 s2 = log_buf;
1528 l2 = end; 1557 l2 = end;
@@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1530 s1 = ""; 1559 s1 = "";
1531 l1 = 0; 1560 l1 = 0;
1532 1561
1533 s2 = log_buf + end - logged_chars; 1562 s2 = log_buf + end - chars;
1534 l2 = logged_chars; 1563 l2 = chars;
1535 } 1564 }
1536 1565
1537 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1566 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
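
The fix above makes the dumper use the chars/end pair captured together under logbuf_lock instead of re-reading the live logged_chars, so the two segments always describe one consistent snapshot. The segment arithmetic itself can be checked in isolation; the program below is a standalone sketch of that split, not kernel code.

#include <stdio.h>
#include <string.h>

/*
 * Given a circular buffer of size len, 'end' (index one past the newest
 * byte, modulo len) and 'chars' (how many bytes are valid), the newest
 * 'chars' bytes are either one segment or two when they wrap around.
 */
static void split(const char *buf, size_t len, size_t end, size_t chars,
                  const char **s1, size_t *l1, const char **s2, size_t *l2)
{
        if (chars > end) {                      /* wrapped around the buffer end */
                *s1 = buf + len - chars + end;
                *l1 = chars - end;
                *s2 = buf;
                *l2 = end;
        } else {                                /* still contiguous */
                *s1 = "";
                *l1 = 0;
                *s2 = buf + end - chars;
                *l2 = chars;
        }
}

int main(void)
{
        char buf[8] = "FGHABCDE";               /* 'A'..'H' logged in order, wrapped */
        const char *s1, *s2;
        size_t l1, l2;

        split(buf, sizeof(buf), 3, 8, &s1, &l1, &s2, &l2);
        printf("%.*s%.*s\n", (int)l1, s1, (int)l2, s2);   /* ABCDEFGH */

        memcpy(buf, "ABCD....", 8);             /* only 4 bytes logged, no wrap */
        split(buf, sizeof(buf), 4, 4, &s1, &l1, &s2, &l2);
        printf("%.*s%.*s\n", (int)l1, s1, (int)l2, s2);   /* ABCD */
        return 0;
}
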
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 74a3d693c196..f34d798ef4a2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -324,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
324} 324}
325 325
326/* 326/*
327 * Detach all tasks we were using ptrace on. 327 * Detach all tasks we were using ptrace on. Called with tasklist held
328 * for writing, and returns with it held too. But note it can release
329 * and reacquire the lock.
328 */ 330 */
329void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
330{ 332{
331 struct task_struct *p, *n; 333 struct task_struct *p, *n;
332 LIST_HEAD(ptrace_dead); 334 LIST_HEAD(ptrace_dead);
333 335
334 write_lock_irq(&tasklist_lock); 336 if (likely(list_empty(&tracer->ptraced)))
337 return;
338
335 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 339 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
336 if (__ptrace_detach(tracer, p)) 340 if (__ptrace_detach(tracer, p))
337 list_add(&p->ptrace_entry, &ptrace_dead); 341 list_add(&p->ptrace_entry, &ptrace_dead);
338 } 342 }
339 write_unlock_irq(&tasklist_lock);
340 343
344 write_unlock_irq(&tasklist_lock);
341 BUG_ON(!list_empty(&tracer->ptraced)); 345 BUG_ON(!list_empty(&tracer->ptraced));
342 346
343 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { 347 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
344 list_del_init(&p->ptrace_entry); 348 list_del_init(&p->ptrace_entry);
345 release_task(p); 349 release_task(p);
346 } 350 }
351
352 write_lock_irq(&tasklist_lock);
347} 353}
348 354
349int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 355int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
7 7
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{ 11{
16 if (start >= end) 12 if (start >= end)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void)
114} 114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); 115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */ 116#endif /* #ifdef CONFIG_PROVE_RCU */
117
118#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
119static inline void debug_init_rcu_head(struct rcu_head *head)
120{
121 debug_object_init(head, &rcuhead_debug_descr);
122}
123
124static inline void debug_rcu_head_free(struct rcu_head *head)
125{
126 debug_object_free(head, &rcuhead_debug_descr);
127}
128
129/*
130 * fixup_init is called when:
131 * - an active object is initialized
132 */
133static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
134{
135 struct rcu_head *head = addr;
136
137 switch (state) {
138 case ODEBUG_STATE_ACTIVE:
139 /*
140 * Ensure that queued callbacks are all executed.
141 * If we detect that we are nested in a RCU read-side critical
142 * section, we should simply fail, otherwise we would deadlock.
143 */
144 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
145 irqs_disabled()) {
146 WARN_ON(1);
147 return 0;
148 }
149 rcu_barrier();
150 rcu_barrier_sched();
151 rcu_barrier_bh();
152 debug_object_init(head, &rcuhead_debug_descr);
153 return 1;
154 default:
155 return 0;
156 }
157}
158
159/*
160 * fixup_activate is called when:
161 * - an active object is activated
162 * - an unknown object is activated (might be a statically initialized object)
163 * Activation is performed internally by call_rcu().
164 */
165static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
166{
167 struct rcu_head *head = addr;
168
169 switch (state) {
170
171 case ODEBUG_STATE_NOTAVAILABLE:
172 /*
173 * This is not really a fixup. We just make sure that it is
174 * tracked in the object tracker.
175 */
176 debug_object_init(head, &rcuhead_debug_descr);
177 debug_object_activate(head, &rcuhead_debug_descr);
178 return 0;
179
180 case ODEBUG_STATE_ACTIVE:
181 /*
182 * Ensure that queued callbacks are all executed.
183 * If we detect that we are nested in a RCU read-side critical
184 * section, we should simply fail, otherwise we would deadlock.
185 */
186 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
187 irqs_disabled()) {
188 WARN_ON(1);
189 return 0;
190 }
191 rcu_barrier();
192 rcu_barrier_sched();
193 rcu_barrier_bh();
194 debug_object_activate(head, &rcuhead_debug_descr);
195 return 1;
196 default:
197 return 0;
198 }
199}
200
201/*
202 * fixup_free is called when:
203 * - an active object is freed
204 */
205static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
206{
207 struct rcu_head *head = addr;
208
209 switch (state) {
210 case ODEBUG_STATE_ACTIVE:
211 /*
212 * Ensure that queued callbacks are all executed.
213 * If we detect that we are nested in a RCU read-side critical
214 * section, we should simply fail, otherwise we would deadlock.
215 */
216#ifndef CONFIG_PREEMPT
217 WARN_ON(1);
218 return 0;
219#else
220 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
221 irqs_disabled()) {
222 WARN_ON(1);
223 return 0;
224 }
225 rcu_barrier();
226 rcu_barrier_sched();
227 rcu_barrier_bh();
228 debug_object_free(head, &rcuhead_debug_descr);
229 return 1;
230#endif
231 default:
232 return 0;
233 }
234}
235
236/**
237 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
238 * @head: pointer to rcu_head structure to be initialized
239 *
240 * This function informs debugobjects of a new rcu_head structure that
241 * has been allocated as an auto variable on the stack. This function
242 * is not required for rcu_head structures that are statically defined or
243 * that are dynamically allocated on the heap. This function has no
244 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
245 */
246void init_rcu_head_on_stack(struct rcu_head *head)
247{
248 debug_object_init_on_stack(head, &rcuhead_debug_descr);
249}
250EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
251
252/**
253 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
254 * @head: pointer to rcu_head structure to be initialized
255 *
256 * This function informs debugobjects that an on-stack rcu_head structure
257 * is about to go out of scope. As with init_rcu_head_on_stack(), this
258 * function is not required for rcu_head structures that are statically
259 * defined or that are dynamically allocated on the heap. Also as with
260 * init_rcu_head_on_stack(), this function has no effect for
261 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
262 */
263void destroy_rcu_head_on_stack(struct rcu_head *head)
264{
265 debug_object_free(head, &rcuhead_debug_descr);
266}
267EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
268
269struct debug_obj_descr rcuhead_debug_descr = {
270 .name = "rcu_head",
271 .fixup_init = rcuhead_fixup_init,
272 .fixup_activate = rcuhead_fixup_activate,
273 .fixup_free = rcuhead_fixup_free,
274};
275EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
276#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 while (list) { 169 while (list) {
170 next = list->next; 170 next = list->next;
171 prefetch(next); 171 prefetch(next);
172 debug_rcu_head_unqueue(list);
172 list->func(list); 173 list->func(list);
173 list = next; 174 list = next;
174 } 175 }
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
211{ 212{
212 unsigned long flags; 213 unsigned long flags;
213 214
215 debug_rcu_head_queue(head);
214 head->func = func; 216 head->func = func;
215 head->next = NULL; 217 head->next = NULL;
216 218
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
239rcu_random(struct rcu_random_state *rrsp) 239rcu_random(struct rcu_random_state *rrsp)
240{ 240{
241 if (--rrsp->rrs_count < 0) { 241 if (--rrsp->rrs_count < 0) {
242 rrsp->rrs_state += 242 rrsp->rrs_state += (unsigned long)local_clock();
243 (unsigned long)cpu_clock(raw_smp_processor_id());
244 rrsp->rrs_count = RCU_RANDOM_REFRESH; 243 rrsp->rrs_count = RCU_RANDOM_REFRESH;
245 } 244 }
246 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 245 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1112 while (list) { 1112 while (list) {
1113 next = list->next; 1113 next = list->next;
1114 prefetch(next); 1114 prefetch(next);
1115 debug_rcu_head_unqueue(list);
1115 list->func(list); 1116 list->func(list);
1116 list = next; 1117 list = next;
1117 if (++count >= rdp->blimit) 1118 if (++count >= rdp->blimit)
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1388 unsigned long flags; 1389 unsigned long flags;
1389 struct rcu_data *rdp; 1390 struct rcu_data *rdp;
1390 1391
1392 debug_rcu_head_queue(head);
1391 head->func = func; 1393 head->func = func;
1392 head->next = NULL; 1394 head->next = NULL;
1393 1395
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..09b574e7f4df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
456 unsigned long nr_running; 457 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 458 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 459 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
460 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 461#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 462 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 463 unsigned char nohz_balance_kick;
462#endif 464#endif
463 unsigned int skip_clock_update; 465 unsigned int skip_clock_update;
464 466
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
1193 1195
1194#ifdef CONFIG_NO_HZ 1196#ifdef CONFIG_NO_HZ
1195/* 1197/*
1198 * In the semi idle case, use the nearest busy cpu for migrating timers
1199 * from an idle cpu. This is good for power-savings.
1200 *
1201 * We don't do similar optimization for completely idle system, as
1202 * selecting an idle cpu will add more delays to the timers than intended
1203 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1204 */
1205int get_nohz_timer_target(void)
1206{
1207 int cpu = smp_processor_id();
1208 int i;
1209 struct sched_domain *sd;
1210
1211 for_each_domain(cpu, sd) {
1212 for_each_cpu(i, sched_domain_span(sd))
1213 if (!idle_cpu(i))
1214 return i;
1215 }
1216 return cpu;
1217}
1218/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1219 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1220 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1221 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1255 smp_send_reschedule(cpu);
1233} 1256}
1234 1257
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1258#endif /* CONFIG_NO_HZ */
1246 1259
1247static u64 sched_avg_period(void) 1260static u64 sched_avg_period(void)
@@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1665 if (root_task_group_empty())
1653 return; 1666 return;
1654 1667
1655 now = cpu_clock(raw_smp_processor_id()); 1668 now = local_clock();
1656 elapsed = now - sd->last_update; 1669 elapsed = now - sd->last_update;
1657 1670
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1671 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1818static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1819static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1820static int get_update_sysctl_factor(void);
1821static void update_cpu_load(struct rq *this_rq);
1808 1822
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1824{
@@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2281}
2268#endif 2282#endif
2269 2283
2270/*** 2284static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2285 bool is_sync, bool is_migrate, bool is_local,
2286 unsigned long en_flags)
2287{
2288 schedstat_inc(p, se.statistics.nr_wakeups);
2289 if (is_sync)
2290 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2291 if (is_migrate)
2292 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2293 if (is_local)
2294 schedstat_inc(p, se.statistics.nr_wakeups_local);
2295 else
2296 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2297
2298 activate_task(rq, p, en_flags);
2299}
2300
2301static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2302 int wake_flags, bool success)
2303{
2304 trace_sched_wakeup(p, success);
2305 check_preempt_curr(rq, p, wake_flags);
2306
2307 p->state = TASK_RUNNING;
2308#ifdef CONFIG_SMP
2309 if (p->sched_class->task_woken)
2310 p->sched_class->task_woken(rq, p);
2311
2312 if (unlikely(rq->idle_stamp)) {
2313 u64 delta = rq->clock - rq->idle_stamp;
2314 u64 max = 2*sysctl_sched_migration_cost;
2315
2316 if (delta > max)
2317 rq->avg_idle = max;
2318 else
2319 update_avg(&rq->avg_idle, delta);
2320 rq->idle_stamp = 0;
2321 }
2322#endif
2323 /* if a worker is waking up, notify workqueue */
2324 if ((p->flags & PF_WQ_WORKER) && success)
2325 wq_worker_waking_up(p, cpu_of(rq));
2326}
2327
2328/**
2271 * try_to_wake_up - wake up a thread 2329 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2330 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2331 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2332 * @wake_flags: wake modifier flags (WF_*)
2275 * 2333 *
2276 * Put it on the run-queue if it's not already there. The "current" 2334 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2335 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2337 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2338 * runnable without the overhead of this.
2281 * 2339 *
2282 * returns failure only if the task is already active. 2340 * Returns %true if @p was woken up, %false if it was already running
2341 * or @state didn't match @p's state.
2283 */ 2342 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2343static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2344 int wake_flags)
@@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2418
2360out_activate: 2419out_activate:
2361#endif /* CONFIG_SMP */ 2420#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2421 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2422 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2423 success = 1;
2373
2374out_running: 2424out_running:
2375 trace_sched_wakeup(p, success); 2425 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2426out:
2395 task_rq_unlock(rq, &flags); 2427 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2428 put_cpu();
@@ -2399,6 +2431,37 @@ out:
2399} 2431}
2400 2432
2401/** 2433/**
2434 * try_to_wake_up_local - try to wake up a local task with rq lock held
2435 * @p: the thread to be awakened
2436 *
2437 * Put @p on the run-queue if it's not already there. The caller must
2438 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2439 * the current task. this_rq() stays locked over invocation.
2440 */
2441static void try_to_wake_up_local(struct task_struct *p)
2442{
2443 struct rq *rq = task_rq(p);
2444 bool success = false;
2445
2446 BUG_ON(rq != this_rq());
2447 BUG_ON(p == current);
2448 lockdep_assert_held(&rq->lock);
2449
2450 if (!(p->state & TASK_NORMAL))
2451 return;
2452
2453 if (!p->se.on_rq) {
2454 if (likely(!task_running(rq, p))) {
2455 schedstat_inc(rq, ttwu_count);
2456 schedstat_inc(rq, ttwu_local);
2457 }
2458 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2459 success = true;
2460 }
2461 ttwu_post_activation(p, rq, 0, success);
2462}
2463
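As a usage aside, the wakeup primitive being reworked here backs the familiar sleep/wake pattern sketched below. This is a hedged, minimal sketch for orientation only: the waiter/waker pair is hypothetical, while set_current_state(), schedule(), wake_up_process() and the kthread helpers are the real interfaces involved.

#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *waiter;	/* hypothetical sleeping thread */

static int waiter_fn(void *unused)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;
		schedule();	/* sleeps until try_to_wake_up() makes us TASK_RUNNING */
		/* ... handle the event we were woken for ... */
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static void waker(void)
{
	/* wake_up_process() is try_to_wake_up(p, TASK_NORMAL, 0) */
	wake_up_process(waiter);
}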
2464/**
2402 * wake_up_process - Wake up a specific process 2465 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2466 * @p: The process to be woken up.
2404 * 2467 *
@@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3075}
3013 3076
3014/* 3077/*
3078 * The exact cpuload at various idx values, calculated at every tick would be
3079 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3080 *
3081 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3082 * on the nth tick when cpu may be busy, then we have:
3083 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3084 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3085 *
3086 * decay_load_missed() below does efficient calculation of
3087 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3088 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3089 *
3090 * The calculation is approximated on a 128 point scale.
3091 * degrade_zero_ticks is the number of ticks after which load at any
3092 * particular idx is approximated to be zero.
3093 * degrade_factor is a precomputed table, a row for each load idx.
3094 * Each column corresponds to degradation factor for a power of two ticks,
3095 * based on 128 point scale.
3096 * Example:
3097 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3098 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3099 *
3100 * With these power-of-2 load factors, we can degrade the load n times
3101 * by looking at 1 bits in n and doing as many mult/shift instead of
3102 * n mult/shifts needed by the exact degradation.
3103 */
3104#define DEGRADE_SHIFT 7
3105static const unsigned char
3106 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3107static const unsigned char
3108 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3109 {0, 0, 0, 0, 0, 0, 0, 0},
3110 {64, 32, 8, 0, 0, 0, 0, 0},
3111 {96, 72, 40, 12, 1, 0, 0},
3112 {112, 98, 75, 43, 15, 1, 0},
3113 {120, 112, 98, 76, 45, 16, 2} };
3114
3115/*
3116 * Update cpu_load for any missed ticks due to tickless idle. The backlog
3117 * accumulates while the CPU is idle, so we just decay the old load without
3118 * adding any new load.
3119 */
3120static unsigned long
3121decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3122{
3123 int j = 0;
3124
3125 if (!missed_updates)
3126 return load;
3127
3128 if (missed_updates >= degrade_zero_ticks[idx])
3129 return 0;
3130
3131 if (idx == 1)
3132 return load >> missed_updates;
3133
3134 while (missed_updates) {
3135 if (missed_updates % 2)
3136 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3137
3138 missed_updates >>= 1;
3139 j++;
3140 }
3141 return load;
3142}
3143
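To make the table arithmetic above concrete, here is a hedged, stand-alone user-space rendering of the same binary-decomposition trick. The factor values are copied from the degrade_factor table in this hunk; the function names and the main() driver are invented for the example.

#include <stdio.h>

#define DEGRADE_SHIFT	7
#define IDX_MAX		5

static const unsigned char zero_ticks[IDX_MAX] = { 0, 8, 32, 64, 128 };
static const unsigned char factor[IDX_MAX][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 },
};

/* Apply ((2^idx - 1) / 2^idx)^missed to load, one mult/shift per set bit. */
static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
	int j = 0;

	if (!missed)
		return load;
	if (missed >= zero_ticks[idx])
		return 0;
	if (idx == 1)			/* factor 1/2: a plain shift is enough */
		return load >> missed;

	while (missed) {
		if (missed & 1)
			load = (load * factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	/* idx 2, 8 missed ticks: 1024 * 12/128 = 96, close to (3/4)^8 * 1024. */
	printf("%lu\n", decay(1024, 8, 2));
	return 0;
}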
3144/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3145 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3146 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3147 * every tick. We fix it up based on jiffies.
3017 */ 3148 */
3018static void update_cpu_load(struct rq *this_rq) 3149static void update_cpu_load(struct rq *this_rq)
3019{ 3150{
3020 unsigned long this_load = this_rq->load.weight; 3151 unsigned long this_load = this_rq->load.weight;
3152 unsigned long curr_jiffies = jiffies;
3153 unsigned long pending_updates;
3021 int i, scale; 3154 int i, scale;
3022 3155
3023 this_rq->nr_load_updates++; 3156 this_rq->nr_load_updates++;
3024 3157
3158 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3159 if (curr_jiffies == this_rq->last_load_update_tick)
3160 return;
3161
3162 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3163 this_rq->last_load_update_tick = curr_jiffies;
3164
3025 /* Update our load: */ 3165 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3166 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3167 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3168 unsigned long old_load, new_load;
3028 3169
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3170 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3171
3031 old_load = this_rq->cpu_load[i]; 3172 old_load = this_rq->cpu_load[i];
3173 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3174 new_load = this_load;
3033 /* 3175 /*
3034 * Round up the averaging division if load is increasing. This 3176 * Round up the averaging division if load is increasing. This
@@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3178 * example.
3037 */ 3179 */
3038 if (new_load > old_load) 3180 if (new_load > old_load)
3039 new_load += scale-1; 3181 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3182
3183 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3184 }
3185}
3186
3187static void update_cpu_load_active(struct rq *this_rq)
3188{
3189 update_cpu_load(this_rq);
3042 3190
3043 calc_load_account_active(this_rq); 3191 calc_load_account_active(this_rq);
3044} 3192}
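For readers who want the smoothing rule above in isolation, this is a hedged stand-alone sketch of the per-index update: new samples are pulled in with weight 1/2^i and rounded up when load rises. Variable names and the demo loop are invented; only the formula mirrors the hunk.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static void update_load(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
			unsigned long this_load)
{
	unsigned long scale;
	int i;

	cpu_load[0] = this_load;	/* idx 0 tracks the instantaneous load */
	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		if (new_load > old_load)
			new_load += scale - 1;	/* round up so rising load shows through */
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long load[CPU_LOAD_IDX_MAX] = { 0 };
	int tick;

	for (tick = 0; tick < 4; tick++) {
		update_load(load, 1024);
		printf("tick %d: %lu %lu %lu %lu %lu\n", tick,
		       load[0], load[1], load[2], load[3], load[4]);
	}
	return 0;
}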
@@ -3426,7 +3574,7 @@ void scheduler_tick(void)
3426 3574
3427 raw_spin_lock(&rq->lock); 3575 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3576 update_rq_clock(rq);
3429 update_cpu_load(rq); 3577 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3578 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3579 raw_spin_unlock(&rq->lock);
3432 3580
@@ -3598,7 +3746,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3746 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3747 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3748 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3749
3603 release_kernel_lock(prev); 3750 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3751need_resched_nonpreemptible:
@@ -3611,11 +3758,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3758 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3759 clear_tsk_need_resched(prev);
3613 3760
3761 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3762 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3763 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3764 prev->state = TASK_RUNNING;
3617 else 3765 } else {
3766 /*
3767 * If a worker is going to sleep, notify and
3768 * ask workqueue whether it wants to wake up a
3769 * task to maintain concurrency. If so, wake
3770 * up the task.
3771 */
3772 if (prev->flags & PF_WQ_WORKER) {
3773 struct task_struct *to_wakeup;
3774
3775 to_wakeup = wq_worker_sleeping(prev, cpu);
3776 if (to_wakeup)
3777 try_to_wake_up_local(to_wakeup);
3778 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3779 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3780 }
3619 switch_count = &prev->nvcsw; 3781 switch_count = &prev->nvcsw;
3620 } 3782 }
3621 3783
@@ -3637,8 +3799,10 @@ need_resched_nonpreemptible:
3637 3799
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3800 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3801 /*
3640 * the context switch might have flipped the stack from under 3802 * The context switch has flipped the stack from under us
3641 * us, hence refresh the local variables. 3803 * and restored the local variables which were saved when
3804 * this task called schedule() in the past. prev == current
3805 * is still correct, but it can be moved to another cpu/rq.
3642 */ 3806 */
3643 cpu = smp_processor_id(); 3807 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3808 rq = cpu_rq(cpu);
@@ -3647,11 +3811,8 @@ need_resched_nonpreemptible:
3647 3811
3648 post_schedule(rq); 3812 post_schedule(rq);
3649 3813
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3814 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3815 goto need_resched_nonpreemptible;
3654 }
3655 3816
3656 preempt_enable_no_resched(); 3817 preempt_enable_no_resched();
3657 if (need_resched()) 3818 if (need_resched())
@@ -3704,8 +3865,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3704 /* 3865 /*
3705 * Owner changed, break to re-assess state. 3866 * Owner changed, break to re-assess state.
3706 */ 3867 */
3707 if (lock->owner != owner) 3868 if (lock->owner != owner) {
3869 /*
3870 * If the lock has switched to a different owner,
3871 * we likely have heavy contention. Return 0 to quit
3872 * optimistic spinning and not contend further:
3873 */
3874 if (lock->owner)
3875 return 0;
3708 break; 3876 break;
3877 }
3709 3878
3710 /* 3879 /*
3711 * Is that owner really running on that cpu? 3880 * Is that owner really running on that cpu?
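The hunk above tightens the optimistic-spin heuristic: once the mutex has been handed to a new owner while we spun, contention is assumed and the spinner stops. A hedged, simplified model of that decision follows; it is not the kernel code, and owner_running() stands in for the real on-CPU check.

#include <stdbool.h>

struct toy_mutex { void *owner; };

enum spin_decision { SPIN, TRY_ACQUIRE, BLOCK };

static enum spin_decision decide(struct toy_mutex *lock, void *owner,
				 bool (*owner_running)(void *owner))
{
	if (lock->owner != owner)
		/* released: race to acquire; re-owned: heavy contention, sleep */
		return lock->owner ? BLOCK : TRY_ACQUIRE;

	/* keep spinning only while the owner is actually on a CPU */
	return owner_running(owner) ? SPIN : BLOCK;
}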
@@ -3726,7 +3895,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 3895 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 3896 * occur there and call schedule directly.
3728 */ 3897 */
3729asmlinkage void __sched preempt_schedule(void) 3898asmlinkage void __sched notrace preempt_schedule(void)
3730{ 3899{
3731 struct thread_info *ti = current_thread_info(); 3900 struct thread_info *ti = current_thread_info();
3732 3901
@@ -3738,9 +3907,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 3907 return;
3739 3908
3740 do { 3909 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 3910 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 3911 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 3912 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 3913
3745 /* 3914 /*
3746 * Check again in case we missed a preemption opportunity 3915 * Check again in case we missed a preemption opportunity
@@ -4441,12 +4610,8 @@ recheck:
4441 */ 4610 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4611 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4612 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4613 unsigned long rlim_rtprio =
4445 4614 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4615
4451 /* can't set/change the rt policy */ 4616 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4617 if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5981,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 5981 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 5982static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 5983 .notifier_call = migration_call,
5819 .priority = 10 5984 .priority = CPU_PRI_MIGRATION,
5820}; 5985};
5821 5986
5987static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5988 unsigned long action, void *hcpu)
5989{
5990 switch (action & ~CPU_TASKS_FROZEN) {
5991 case CPU_ONLINE:
5992 case CPU_DOWN_FAILED:
5993 set_cpu_active((long)hcpu, true);
5994 return NOTIFY_OK;
5995 default:
5996 return NOTIFY_DONE;
5997 }
5998}
5999
6000static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6001 unsigned long action, void *hcpu)
6002{
6003 switch (action & ~CPU_TASKS_FROZEN) {
6004 case CPU_DOWN_PREPARE:
6005 set_cpu_active((long)hcpu, false);
6006 return NOTIFY_OK;
6007 default:
6008 return NOTIFY_DONE;
6009 }
6010}
6011
5822static int __init migration_init(void) 6012static int __init migration_init(void)
5823{ 6013{
5824 void *cpu = (void *)(long)smp_processor_id(); 6014 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6015 int err;
5826 6016
5827 /* Start one for the boot CPU: */ 6017 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6018 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6019 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6020 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6021 register_cpu_notifier(&migration_notifier);
5832 6022
6023 /* Register cpu active notifiers */
6024 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6025 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6026
5833 return 0; 6027 return 0;
5834} 6028}
5835early_initcall(migration_init); 6029early_initcall(migration_init);
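The sched_cpu_active()/sched_cpu_inactive() callbacks added above follow the CPU-hotplug notifier pattern of this kernel series. A hedged minimal example of such a callback is sketched below; the callback name and its body are invented, while cpu_notifier(), the CPU_* actions and the CPU_TASKS_FROZEN masking are the interfaces actually used in the hunk.

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int __cpuinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {	/* treat frozen/unfrozen alike */
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		/* (long)hcpu is (back) in service */
		return NOTIFY_OK;
	case CPU_DOWN_PREPARE:
		/* (long)hcpu is about to be taken down */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init example_register(void)
{
	/* the patch uses CPU_PRI_* constants here to order callbacks */
	cpu_notifier(example_cpu_callback, 0);
	return 0;
}
early_initcall(example_register);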
@@ -6064,23 +6258,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6258 free_rootdomain(old_rd);
6065} 6259}
6066 6260
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6261static int init_rootdomain(struct root_domain *rd)
6068{ 6262{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6263 memset(rd, 0, sizeof(*rd));
6072 6264
6073 if (bootmem) 6265 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6266 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6267 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6268 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6269 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6270 goto free_online;
6082 6271
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6272 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6273 goto free_rto_mask;
6085 return 0; 6274 return 0;
6086 6275
@@ -6096,7 +6285,7 @@ out:
6096 6285
6097static void init_defrootdomain(void) 6286static void init_defrootdomain(void)
6098{ 6287{
6099 init_rootdomain(&def_root_domain, true); 6288 init_rootdomain(&def_root_domain);
6100 6289
6101 atomic_set(&def_root_domain.refcount, 1); 6290 atomic_set(&def_root_domain.refcount, 1);
6102} 6291}
@@ -6109,7 +6298,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6298 if (!rd)
6110 return NULL; 6299 return NULL;
6111 6300
6112 if (init_rootdomain(rd, false) != 0) { 6301 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6302 kfree(rd);
6114 return NULL; 6303 return NULL;
6115 } 6304 }
@@ -7288,29 +7477,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7477}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7478#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7479
7291#ifndef CONFIG_CPUSETS
7292/* 7480/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7481 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7482 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7483 * around partition_sched_domains().
7295 */ 7484 */
7296static int update_sched_domains(struct notifier_block *nfb, 7485static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7486 void *hcpu)
7298{ 7487{
7299 switch (action) { 7488 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7489 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7490 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7491 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7492 return NOTIFY_OK;
7493 default:
7494 return NOTIFY_DONE;
7495 }
7496}
7308 7497
7498static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7499 void *hcpu)
7500{
7501 switch (action & ~CPU_TASKS_FROZEN) {
7502 case CPU_DOWN_PREPARE:
7503 cpuset_update_active_cpus();
7504 return NOTIFY_OK;
7309 default: 7505 default:
7310 return NOTIFY_DONE; 7506 return NOTIFY_DONE;
7311 } 7507 }
7312} 7508}
7313#endif
7314 7509
7315static int update_runtime(struct notifier_block *nfb, 7510static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7511 unsigned long action, void *hcpu)
@@ -7356,10 +7551,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7551 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7552 put_online_cpus();
7358 7553
7359#ifndef CONFIG_CPUSETS 7554 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7555 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7556
7364 /* RT runtime code needs to handle some hotplug events */ 7557 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7558 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7797,9 @@ void __init sched_init(void)
7604 7797
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7798 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 7799 rq->cpu_load[j] = 0;
7800
7801 rq->last_load_update_tick = jiffies;
7802
7607#ifdef CONFIG_SMP 7803#ifdef CONFIG_SMP
7608 rq->sd = NULL; 7804 rq->sd = NULL;
7609 rq->rd = NULL; 7805 rq->rd = NULL;
@@ -7617,6 +7813,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 7813 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 7814 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 7815 rq_attach_root(rq, &def_root_domain);
7816#ifdef CONFIG_NO_HZ
7817 rq->nohz_balance_kick = 0;
7818 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7819#endif
7620#endif 7820#endif
7621 init_rq_hrtick(rq); 7821 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 7822 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7861,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7861 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 7862#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 7863#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7864 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7865 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7866 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7867 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7868 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 7869#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 7870 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 7871 if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42 * - GTOD (clock monotonic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and keep it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
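As a usage note for the interfaces documented above, the fragment below sketches the intended pattern: take both timestamps on the same CPU and compare only those. It assumes kernel context and an invented caller; local_clock() and pr_info() are the real interfaces.

#include <linux/kernel.h>
#include <linux/sched.h>

static void time_a_section(void)
{
	u64 t0, t1;

	/* for strictly same-CPU readings, callers typically disable preemption */
	t0 = local_clock();
	/* ... the work being measured ... */
	t1 = local_clock();

	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
}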
@@ -170,6 +206,11 @@ again:
170 return val; 206 return val;
171} 207}
172 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
173u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
174{ 215{
175 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
237} 278}
238EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
239 280
240unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
241{ 292{
242 unsigned long long clock; 293 u64 clock;
243 unsigned long flags; 294 unsigned long flags;
244 295
245 local_irq_save(flags); 296 local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
249 return clock; 300 return clock;
250} 301}
251 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
252#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
253 323
254void sched_clock_init(void) 324void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
264 return sched_clock(); 334 return sched_clock();
265} 335}
266 336
267 337u64 cpu_clock(int cpu)
268unsigned long long cpu_clock(int cpu)
269{ 338{
270 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
271} 340}
272 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
273#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
274 348
275EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
332 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
335 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
336 P(sysctl_sched_features); 336 P(sysctl_sched_features);
337#undef PN 337#undef PN
338#undef P 338#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..ab661ebc4895 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2287 unsigned long power = SCHED_LOAD_SCALE; 2287 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups; 2288 struct sched_group *sdg = sd->groups;
2289 2289
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2290 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER)) 2291 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu); 2292 power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2303 power >>= SCHED_LOAD_SHIFT; 2296 power >>= SCHED_LOAD_SHIFT;
2304 } 2297 }
2305 2298
2299 sdg->cpu_power_orig = power;
2300
2301 if (sched_feat(ARCH_POWER))
2302 power *= arch_scale_freq_power(sd, cpu);
2303 else
2304 power *= default_scale_freq_power(sd, cpu);
2305
2306 power >>= SCHED_LOAD_SHIFT;
2307
2306 power *= scale_rt_power(cpu); 2308 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT; 2309 power >>= SCHED_LOAD_SHIFT;
2308 2310
@@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2335 sdg->cpu_power = power; 2337 sdg->cpu_power = power;
2336} 2338}
2337 2339
2340/*
2341 * Try and fix up capacity for tiny siblings, this is needed when
2342 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2343 * which on its own isn't powerful enough.
2344 *
2345 * See update_sd_pick_busiest() and check_asym_packing().
2346 */
2347static inline int
2348fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2349{
2350 /*
2351 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2352 */
2353 if (sd->level != SD_LV_SIBLING)
2354 return 0;
2355
2356 /*
2357 * If ~90% of the cpu_power is still there, we're good.
2358 */
2359 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2360 return 1;
2361
2362 return 0;
2363}
2364
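The ~90% test above is an integer-only ratio check; a hedged stand-alone illustration of the arithmetic, with invented names, follows.

#include <stdbool.h>
#include <stdio.h>

/* power * 32 > power_orig * 29 holds when power is above roughly 90.6%. */
static bool capacity_mostly_intact(unsigned long power, unsigned long power_orig)
{
	return power * 32 > power_orig * 29;
}

int main(void)
{
	printf("%d\n", capacity_mostly_intact(930, 1024));	/* ~91%: prints 1 */
	printf("%d\n", capacity_mostly_intact(900, 1024));	/* ~88%: prints 0 */
	return 0;
}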
2338/** 2365/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2366 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated. 2367 * @sd: The sched_domain whose statistics are to be updated.
@@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2400 * domains. In the newly idle case, we will allow all the cpu's 2427 * domains. In the newly idle case, we will allow all the cpu's
2401 * to do the newly idle load balance. 2428 * to do the newly idle load balance.
2402 */ 2429 */
2403 if (idle != CPU_NEWLY_IDLE && local_group && 2430 if (idle != CPU_NEWLY_IDLE && local_group) {
2404 balance_cpu != this_cpu) { 2431 if (balance_cpu != this_cpu) {
2405 *balance = 0; 2432 *balance = 0;
2406 return; 2433 return;
2434 }
2435 update_group_power(sd, this_cpu);
2407 } 2436 }
2408 2437
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */ 2438 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2439 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413 2440
@@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2428 2455
2429 sgs->group_capacity = 2456 sgs->group_capacity =
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2457 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2458 if (!sgs->group_capacity)
2459 sgs->group_capacity = fix_small_capacity(sd, group);
2460}
2461
2462/**
2463 * update_sd_pick_busiest - return 1 on busiest group
2464 * @sd: sched_domain whose statistics are to be checked
2465 * @sds: sched_domain statistics
2466 * @sg: sched_group candidate to be checked for being the busiest
2467 * @sgs: sched_group statistics
2468 * @this_cpu: the current cpu
2469 *
2470 * Determine if @sg is a busier group than the previously selected
2471 * busiest group.
2472 */
2473static bool update_sd_pick_busiest(struct sched_domain *sd,
2474 struct sd_lb_stats *sds,
2475 struct sched_group *sg,
2476 struct sg_lb_stats *sgs,
2477 int this_cpu)
2478{
2479 if (sgs->avg_load <= sds->max_load)
2480 return false;
2481
2482 if (sgs->sum_nr_running > sgs->group_capacity)
2483 return true;
2484
2485 if (sgs->group_imb)
2486 return true;
2487
2488 /*
2489 * ASYM_PACKING needs to move all the work to the lowest
2490 * numbered CPUs in the group, therefore mark all groups
2491 * higher than ourself as busy.
2492 * higher than ourselves as busy.
2493 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2494 this_cpu < group_first_cpu(sg)) {
2495 if (!sds->busiest)
2496 return true;
2497
2498 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2499 return true;
2500 }
2501
2502 return false;
2431} 2503}
2432 2504
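Stripped of the ASYM_PACKING special case, update_sd_pick_busiest() above reduces to a simple predicate. The restatement below is a hedged sketch with invented struct and function names, kept only to make the three criteria (more loaded, over capacity, or internally imbalanced) explicit.

#include <stdbool.h>

struct group_stats {
	unsigned long avg_load;
	unsigned int sum_nr_running;
	unsigned int group_capacity;
	bool group_imb;
};

/* A group displaces the current max only if it is both more loaded and troubled. */
static bool busier_than(const struct group_stats *sgs, unsigned long max_load)
{
	if (sgs->avg_load <= max_load)
		return false;

	return sgs->sum_nr_running > sgs->group_capacity || sgs->group_imb;
}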
2433/** 2505/**
@@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2435 * @sd: sched_domain whose statistics are to be updated. 2507 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed. 2508 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu 2509 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group. 2510 * @sd_idle: Idle status of the sched_domain containing sg.
2439 * @cpus: Set of cpus considered for load balancing. 2511 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance. 2512 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain. 2513 * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2446 struct sd_lb_stats *sds) 2518 struct sd_lb_stats *sds)
2447{ 2519{
2448 struct sched_domain *child = sd->child; 2520 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups; 2521 struct sched_group *sg = sd->groups;
2450 struct sg_lb_stats sgs; 2522 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0; 2523 int load_idx, prefer_sibling = 0;
2452 2524
@@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2459 do { 2531 do {
2460 int local_group; 2532 int local_group;
2461 2533
2462 local_group = cpumask_test_cpu(this_cpu, 2534 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs)); 2535 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2536 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs); 2537 local_group, cpus, balance, &sgs);
2467 2538
2468 if (local_group && !(*balance)) 2539 if (local_group && !(*balance))
2469 return; 2540 return;
2470 2541
2471 sds->total_load += sgs.group_load; 2542 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power; 2543 sds->total_pwr += sg->cpu_power;
2473 2544
2474 /* 2545 /*
2475 * In case the child domain prefers tasks go to siblings 2546 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try 2547 * first, lower the sg capacity to one so that we'll try
2477 * and move all the excess tasks away. 2548 * and move all the excess tasks away.
2478 */ 2549 */
2479 if (prefer_sibling) 2550 if (prefer_sibling)
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2481 2552
2482 if (local_group) { 2553 if (local_group) {
2483 sds->this_load = sgs.avg_load; 2554 sds->this_load = sgs.avg_load;
2484 sds->this = group; 2555 sds->this = sg;
2485 sds->this_nr_running = sgs.sum_nr_running; 2556 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load; 2557 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load && 2558 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2488 (sgs.sum_nr_running > sgs.group_capacity ||
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load; 2559 sds->max_load = sgs.avg_load;
2491 sds->busiest = group; 2560 sds->busiest = sg;
2492 sds->busiest_nr_running = sgs.sum_nr_running; 2561 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity; 2562 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load; 2563 sds->busiest_load_per_task = sgs.sum_weighted_load;
2495 sds->group_imb = sgs.group_imb; 2564 sds->group_imb = sgs.group_imb;
2496 } 2565 }
2497 2566
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2567 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2499 group = group->next; 2568 sg = sg->next;
2500 } while (group != sd->groups); 2569 } while (sg != sd->groups);
2570}
2571
2572int __weak arch_sd_sibling_asym_packing(void)
2573{
2574 return 0*SD_ASYM_PACKING;
2575}
2576
2577/**
2578 * check_asym_packing - Check to see if the group is packed into the
2579 * sched domain.
2580 *
2581 * This is primarily intended to be used at the sibling level. Some
2582 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2583 * case of POWER7, it can move to lower SMT modes only when higher
2584 * threads are idle. When in lower SMT modes, the threads will
2585 * perform better since they share fewer core resources. Hence when we
2586 * have idle threads, we want them to be the higher ones.
2587 *
2588 * This packing function is run on idle threads. It checks to see if
2589 * the busiest CPU in this domain (core in the P7 case) has a higher
2590 * CPU number than the packing function is being run on. Here we are
2591 * assuming a lower CPU number will be equivalent to a lower SMT thread
2592 * number.
2593 *
2594 * Returns 1 when packing is required and a task should be moved to
2595 * this CPU. The amount of the imbalance is returned in *imbalance.
2596 *
2597 * @sd: The sched_domain whose packing is to be checked.
2598 * @sds: Statistics of the sched_domain which is to be packed
2599 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2600 * @imbalance: returns amount of imbalance due to packing.
2601 */
2602static int check_asym_packing(struct sched_domain *sd,
2603 struct sd_lb_stats *sds,
2604 int this_cpu, unsigned long *imbalance)
2605{
2606 int busiest_cpu;
2607
2608 if (!(sd->flags & SD_ASYM_PACKING))
2609 return 0;
2610
2611 if (!sds->busiest)
2612 return 0;
2613
2614 busiest_cpu = group_first_cpu(sds->busiest);
2615 if (this_cpu > busiest_cpu)
2616 return 0;
2617
2618 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2619 SCHED_LOAD_SCALE);
2620 return 1;
2501} 2621}
2502 2622
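Taken together, the packing rule implemented by update_sd_pick_busiest() and check_asym_packing() above can be summarized in a few lines. The sketch below is a hedged restatement, not the kernel code; group_first_cpu() is modeled as a plain parameter.

#include <stdbool.h>

/*
 * SD_ASYM_PACKING: keep the lowest-numbered CPUs busy.  A remote group is
 * worth pulling from only if it has work and its first CPU sits above us.
 */
static bool asym_pull_from(int this_cpu, int group_first_cpu,
			   unsigned int group_nr_running)
{
	return group_nr_running && this_cpu < group_first_cpu;
}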
2503/** 2623/**
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2692 if (!(*balance)) 2812 if (!(*balance))
2693 goto ret; 2813 goto ret;
2694 2814
2815 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2816 check_asym_packing(sd, &sds, this_cpu, imbalance))
2817 return sds.busiest;
2818
2695 if (!sds.busiest || sds.busiest_nr_running == 0) 2819 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced; 2820 goto out_balanced;
2697 2821
@@ -2726,8 +2850,9 @@ ret:
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2850 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */ 2851 */
2728static struct rq * 2852static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2853find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2730 unsigned long imbalance, const struct cpumask *cpus) 2854 enum cpu_idle_type idle, unsigned long imbalance,
2855 const struct cpumask *cpus)
2731{ 2856{
2732 struct rq *busiest = NULL, *rq; 2857 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0; 2858 unsigned long max_load = 0;
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2863 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl; 2864 unsigned long wl;
2740 2865
2866 if (!capacity)
2867 capacity = fix_small_capacity(sd, group);
2868
2741 if (!cpumask_test_cpu(i, cpus)) 2869 if (!cpumask_test_cpu(i, cpus))
2742 continue; 2870 continue;
2743 2871
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2777/* Working cpumask for load_balance and load_balance_newidle. */ 2905/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2906static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779 2907
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2908static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2909 int busiest_cpu, int this_cpu)
2781{ 2910{
2782 if (idle == CPU_NEWLY_IDLE) { 2911 if (idle == CPU_NEWLY_IDLE) {
2912
2913 /*
2914 * ASYM_PACKING needs to force migrate tasks from busy but
2915 * higher numbered CPUs in order to pack all tasks in the
2916 * lowest numbered CPUs.
2917 */
2918 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2919 return 1;
2920
2783 /* 2921 /*
2784 * The only task running in a non-idle cpu can be moved to this 2922 * The only task running in a non-idle cpu can be moved to this
2785 * cpu in an attempt to completely freeup the other CPU 2923 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2992,7 @@ redo:
2854 goto out_balanced; 2992 goto out_balanced;
2855 } 2993 }
2856 2994
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus); 2995 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2858 if (!busiest) { 2996 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]); 2997 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced; 2998 goto out_balanced;
@@ -2898,7 +3036,8 @@ redo:
2898 schedstat_inc(sd, lb_failed[idle]); 3036 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++; 3037 sd->nr_balance_failed++;
2900 3038
2901 if (need_active_balance(sd, sd_idle, idle)) { 3039 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3040 this_cpu)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags); 3041 raw_spin_lock_irqsave(&busiest->lock, flags);
2903 3042
2904 /* don't kick the active_load_balance_cpu_stop, 3043 /* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3232,40 @@ out_unlock:
3093} 3232}
3094 3233
3095#ifdef CONFIG_NO_HZ 3234#ifdef CONFIG_NO_HZ
3235
3236static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3237
3238static void trigger_sched_softirq(void *data)
3239{
3240 raise_softirq_irqoff(SCHED_SOFTIRQ);
3241}
3242
3243static inline void init_sched_softirq_csd(struct call_single_data *csd)
3244{
3245 csd->func = trigger_sched_softirq;
3246 csd->info = NULL;
3247 csd->flags = 0;
3248 csd->priv = 0;
3249}
3250
3251/*
3252 * idle load balancing details
3253 * - One of the idle CPUs nominates itself as idle load_balancer, while
3254 * entering idle.
3255 * - This idle load balancer CPU will also go into tickless mode when
3256 * it is idle, just like all other idle CPUs
3257 * - When one of the busy CPUs notices that idle rebalancing may be
3258 * needed, it will kick the idle load balancer, which then does idle
3259 * load balancing for all the idle CPUs.
3260 */
3096static struct { 3261static struct {
3097 atomic_t load_balancer; 3262 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask; 3263 atomic_t first_pick_cpu;
3099 cpumask_var_t ilb_grp_nohz_mask; 3264 atomic_t second_pick_cpu;
3100} nohz ____cacheline_aligned = { 3265 cpumask_var_t idle_cpus_mask;
3101 .load_balancer = ATOMIC_INIT(-1), 3266 cpumask_var_t grp_idle_mask;
3102}; 3267 unsigned long next_balance; /* in jiffy units */
3268} nohz ____cacheline_aligned;
3103 3269
3104int get_nohz_load_balancer(void) 3270int get_nohz_load_balancer(void)
3105{ 3271{
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3153 */ 3319 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group) 3320static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{ 3321{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3322 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3157 sched_group_cpus(ilb_group)); 3323 sched_group_cpus(ilb_group));
3158 3324
3159 /* 3325 /*
3160 * A sched_group is semi-idle when it has at least one busy cpu 3326 * A sched_group is semi-idle when it has at least one busy cpu
3161 * and at least one idle cpu. 3327 * and at least one idle cpu.
3162 */ 3328 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3329 if (cpumask_empty(nohz.grp_idle_mask))
3164 return 0; 3330 return 0;
3165 3331
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3332 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3167 return 0; 3333 return 0;
3168 3334
3169 return 1; 3335 return 1;
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu)
3196 * Optimize for the case when we have no idle CPUs or only one 3362 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3363 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */ 3364 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2) 3365 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3200 goto out_done; 3366 goto out_done;
3201 3367
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3368 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu)
3204 3370
3205 do { 3371 do {
3206 if (is_semi_idle_group(ilb_group)) 3372 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask); 3373 return cpumask_first(nohz.grp_idle_mask);
3208 3374
3209 ilb_group = ilb_group->next; 3375 ilb_group = ilb_group->next;
3210 3376
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu)
3212 } 3378 }
3213 3379
3214out_done: 3380out_done:
3215 return cpumask_first(nohz.cpu_mask); 3381 return nr_cpu_ids;
3216} 3382}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3383#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu) 3384static inline int find_new_ilb(int call_cpu)
3219{ 3385{
3220 return cpumask_first(nohz.cpu_mask); 3386 return nr_cpu_ids;
3221} 3387}
3222#endif 3388#endif
3223 3389
3224/* 3390/*
3391 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
3392 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
3393 * CPU (if there is one).
3394 */
3395static void nohz_balancer_kick(int cpu)
3396{
3397 int ilb_cpu;
3398
3399 nohz.next_balance++;
3400
3401 ilb_cpu = get_nohz_load_balancer();
3402
3403 if (ilb_cpu >= nr_cpu_ids) {
3404 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3405 if (ilb_cpu >= nr_cpu_ids)
3406 return;
3407 }
3408
3409 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3410 struct call_single_data *cp;
3411
3412 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3413 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3414 __smp_call_function_single(ilb_cpu, cp, 0);
3415 }
3416 return;
3417}
3418
3419/*
3225 * This routine will try to nominate the ilb (idle load balancing) 3420 * This routine will try to nominate the ilb (idle load balancing)
3226 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3421 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system 3422 * load balancing on behalf of all those cpus.
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep till the next wakeup event
3230 * arrives...
3231 * 3423 *
3232 * For the ilb owner, tick is not stopped. And this tick will be used 3424 * When the ilb owner becomes busy, we will not have a new ilb owner until some
3233 * for idle load balancing. ilb owner will still be part of 3425 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3234 * nohz.cpu_mask.. 3426 * idle load balancing by kicking one of the idle CPUs.
3235 * 3427 *
3236 * While stopping the tick, this cpu will become the ilb owner if there 3428 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
3237 * nohz.cpu_mask.. 3429 * ilb owner CPU in the future (when there is a need for idle load balancing on
3238 * or if all cpus in the system stop their ticks at which point 3430 * behalf of all idle CPUs).
3239 * there is no need for ilb owner.
3240 *
3241 * When the ilb owner becomes busy, it nominates another owner, during the
3242 * next busy scheduler_tick()
3243 */ 3431 */
3244int select_nohz_load_balancer(int stop_tick) 3432void select_nohz_load_balancer(int stop_tick)
3245{ 3433{
3246 int cpu = smp_processor_id(); 3434 int cpu = smp_processor_id();
3247 3435
3248 if (stop_tick) { 3436 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) { 3437 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu) 3438 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0; 3439 return;
3254 3440
3255 /* 3441 /*
3256 * If we are going offline and still the leader, 3442 * If we are going offline and still the leader,
3257 * give up! 3443 * give up!
3258 */ 3444 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3445 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3446 nr_cpu_ids) != cpu)
3260 BUG(); 3447 BUG();
3261 3448
3262 return 0; 3449 return;
3263 } 3450 }
3264 3451
3265 cpumask_set_cpu(cpu, nohz.cpu_mask); 3452 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3266 3453
3267 /* time for ilb owner also to sleep */ 3454 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3455 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3269 if (atomic_read(&nohz.load_balancer) == cpu) 3456 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3270 atomic_set(&nohz.load_balancer, -1); 3457 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3271 return 0;
3272 }
3273 3458
3274 if (atomic_read(&nohz.load_balancer) == -1) { 3459 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb; 3460 int new_ilb;
3280 3461
3281 if (!(sched_smt_power_savings || 3462 /* make me the ilb owner */
3282 sched_mc_power_savings)) 3463 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3283 return 1; 3464 cpu) != nr_cpu_ids)
3465 return;
3466
3284 /* 3467 /*
3285 * Check to see if there is a more power-efficient 3468 * Check to see if there is a more power-efficient
3286 * ilb. 3469 * ilb.
3287 */ 3470 */
3288 new_ilb = find_new_ilb(cpu); 3471 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3472 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1); 3473 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3291 resched_cpu(new_ilb); 3474 resched_cpu(new_ilb);
3292 return 0; 3475 return;
3293 } 3476 }
3294 return 1; 3477 return;
3295 } 3478 }
3296 } else { 3479 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3480 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3298 return 0; 3481 return;
3299 3482
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3483 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3301 3484
3302 if (atomic_read(&nohz.load_balancer) == cpu) 3485 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3486 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3487 nr_cpu_ids) != cpu)
3304 BUG(); 3488 BUG();
3305 } 3489 }
3306 return 0; 3490 return;
3307} 3491}
3308#endif 3492#endif
3309 3493
@@ -3385,11 +3569,102 @@ out:
3385 rq->next_balance = next_balance; 3569 rq->next_balance = next_balance;
3386} 3570}
3387 3571
3572#ifdef CONFIG_NO_HZ
3388/* 3573/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick. 3574 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3390 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3575 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */ 3576 */
3577static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3578{
3579 struct rq *this_rq = cpu_rq(this_cpu);
3580 struct rq *rq;
3581 int balance_cpu;
3582
3583 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3584 return;
3585
3586 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3587 if (balance_cpu == this_cpu)
3588 continue;
3589
3590 /*
3591 * If this cpu gets work to do, stop the load balancing
3592 * work being done for other cpus. Next load
3593 * balancing owner will pick it up.
3594 */
3595 if (need_resched()) {
3596 this_rq->nohz_balance_kick = 0;
3597 break;
3598 }
3599
3600 raw_spin_lock_irq(&this_rq->lock);
3601 update_rq_clock(this_rq);
3602 update_cpu_load(this_rq);
3603 raw_spin_unlock_irq(&this_rq->lock);
3604
3605 rebalance_domains(balance_cpu, CPU_IDLE);
3606
3607 rq = cpu_rq(balance_cpu);
3608 if (time_after(this_rq->next_balance, rq->next_balance))
3609 this_rq->next_balance = rq->next_balance;
3610 }
3611 nohz.next_balance = this_rq->next_balance;
3612 this_rq->nohz_balance_kick = 0;
3613}
3614
3615/*
3616 * Current heuristic for kicking the idle load balancer
3617 * - first_pick_cpu is one of the busy CPUs. It will kick
3618 * idle load balancer when it has more than one process active. This
3619 * eliminates the need for idle load balancing altogether when we have
3620 * only one running process in the system (common case).
3621 * - If there are more than one busy CPU, idle load balancer may have
3622 * to run for active_load_balance to happen (i.e., two busy CPUs are
3623 * SMT or core siblings and can run better if they move to different
3624 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
3625 * which will kick idle load balancer as soon as it has any load.
3626 */
3627static inline int nohz_kick_needed(struct rq *rq, int cpu)
3628{
3629 unsigned long now = jiffies;
3630 int ret;
3631 int first_pick_cpu, second_pick_cpu;
3632
3633 if (time_before(now, nohz.next_balance))
3634 return 0;
3635
3636 if (!rq->nr_running)
3637 return 0;
3638
3639 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3640 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3641
3642 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3643 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3644 return 0;
3645
3646 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3647 if (ret == nr_cpu_ids || ret == cpu) {
3648 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3649 if (rq->nr_running > 1)
3650 return 1;
3651 } else {
3652 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3653 if (ret == nr_cpu_ids || ret == cpu) {
3654 if (rq->nr_running)
3655 return 1;
3656 }
3657 }
3658 return 0;
3659}
3660#else
3661static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3662#endif
3663
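The first_pick/second_pick handshake in nohz_kick_needed() above is lock-free via cmpxchg; the model below keeps the same decision structure with plain variables. It is a hedged, single-threaded illustration only, and every name in it is invented.

#include <stdbool.h>

#define FAKE_NR_CPUS 8			/* stands in for nr_cpu_ids */

static int first_pick = FAKE_NR_CPUS;	/* FAKE_NR_CPUS means "slot free" */
static int second_pick = FAKE_NR_CPUS;

static bool kick_needed(int cpu, unsigned int nr_running)
{
	if (!nr_running)
		return false;

	if (first_pick == FAKE_NR_CPUS || first_pick == cpu) {
		first_pick = cpu;
		if (second_pick == cpu)
			second_pick = FAKE_NR_CPUS;
		return nr_running > 1;	/* the only busy CPU kicks on overload */
	}

	if (second_pick == FAKE_NR_CPUS || second_pick == cpu) {
		second_pick = cpu;
		return true;		/* a second busy CPU kicks on any load */
	}

	return false;			/* both slots held by other busy CPUs */
}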
3664/*
3665 * run_rebalance_domains is triggered when needed from the scheduler tick.
3666 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
3667 */
3393static void run_rebalance_domains(struct softirq_action *h) 3668static void run_rebalance_domains(struct softirq_action *h)
3394{ 3669{
3395 int this_cpu = smp_processor_id(); 3670 int this_cpu = smp_processor_id();
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3399 3674
3400 rebalance_domains(this_cpu, idle); 3675 rebalance_domains(this_cpu, idle);
3401 3676
3402#ifdef CONFIG_NO_HZ
3403 /* 3677 /*
3404 * If this cpu is the owner for idle load balancing, then do the 3678 * If this cpu has a pending nohz_balance_kick, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are 3679 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped. 3680 * stopped.
3407 */ 3681 */
3408 if (this_rq->idle_at_tick && 3682 nohz_idle_balance(this_cpu, idle);
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433} 3683}
3434 3684
3435static inline int on_null_domain(int cpu) 3685static inline int on_null_domain(int cpu)
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu)
3439 3689
3440/* 3690/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3691 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */ 3692 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu) 3693static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{ 3694{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452 * scheduler tick, then check if we need to nominate new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483 * someone else, then no need raise the SCHED_SOFTIRQ
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */ 3695 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) && 3696 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu))) 3697 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ); 3698 raise_softirq(SCHED_SOFTIRQ);
3699#ifdef CONFIG_NO_HZ
3700 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3701 nohz_balancer_kick(cpu);
3702#endif
3493} 3703}
3494 3704
3495static void rq_online_fair(struct rq *rq) 3705static void rq_online_fair(struct rq *rq)
@@ -3542,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p)
3542 3752
3543 raw_spin_lock_irqsave(&rq->lock, flags); 3753 raw_spin_lock_irqsave(&rq->lock, flags);
3544 3754
3755 update_rq_clock(rq);
3756
3545 if (unlikely(task_cpu(p) != this_cpu)) 3757 if (unlikely(task_cpu(p) != this_cpu))
3546 __set_task_cpu(p, this_cpu); 3758 __set_task_cpu(p, this_cpu);
3547 3759
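A note on the hunk above: nohz_kick_needed() claims one of two shared picker slots (nohz.first_pick_cpu, nohz.second_pick_cpu) with atomic_cmpxchg(), treating nr_cpu_ids as the "unowned" value, and a CPU that already owns a slot counts as a successful claim. A small, self-contained userspace sketch of that claiming pattern, with invented names (claim_first_pick, UNOWNED) and C11 atomics standing in for the kernel's atomic_t:

/* Userspace sketch of the slot-claiming pattern used by nohz_kick_needed()
 * above.  A shared "picker" slot holds UNOWNED (nr_cpu_ids in the kernel
 * code) until one CPU claims it with a compare-and-swap.  Names invented. */
#include <stdatomic.h>
#include <stdio.h>

#define UNOWNED (-1)

static atomic_int first_pick = UNOWNED;

/* Returns 1 if cpu now owns (or already owned) the slot, 0 otherwise. */
static int claim_first_pick(int cpu)
{
    int expected = UNOWNED;

    if (atomic_compare_exchange_strong(&first_pick, &expected, cpu))
        return 1;               /* slot was free and we took it */
    return expected == cpu;     /* slot was already ours */
}

int main(void)
{
    printf("cpu 0 claims: %d\n", claim_first_pick(0));  /* 1 */
    printf("cpu 1 claims: %d\n", claim_first_pick(1));  /* 0 */
    printf("cpu 0 again:  %d\n", claim_first_pick(0));  /* 1 */
    return 0;
}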
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..d10c80ebb67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1663{ 1663{
1664 unsigned long soft, hard; 1664 unsigned long soft, hard;
1665 1665
1666 if (!p->signal)
1667 return;
1668
1669 /* max may change after cur was read, this will be fixed next tick */ 1666 /* max may change after cur was read, this will be fixed next tick */
1670 soft = task_rlimit(p, RLIMIT_RTTIME); 1667 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1668 hard = task_rlimit_max(p, RLIMIT_RTTIME);
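For context on the hunk above: the RT watchdog now reads the soft and hard limits straight from task_rlimit()/task_rlimit_max() with no !p->signal guard. The same pair of limits is what userspace sees as RLIMIT_RTTIME; a minimal sketch of inspecting them (illustrative only, not code from this tree):

/* Print the current RLIMIT_RTTIME soft/hard limits for this process.
 * rlim_cur is the soft limit and rlim_max the hard limit, both in
 * microseconds of RT CPU time; RLIM_INFINITY means "no limit". */
#define _GNU_SOURCE             /* RLIMIT_RTTIME on some libcs */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    if (getrlimit(RLIMIT_RTTIME, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    printf("RLIMIT_RTTIME: soft=%llu hard=%llu\n",
           (unsigned long long)rl.rlim_cur,
           (unsigned long long)rl.rlim_max);
    return 0;
}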
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
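The three account_group_*() helpers above lose their exit-state and NULL-signal guards and now reach tsk->signal->cputimer directly, keeping only the cputimer->running fast path before taking the lock. A small userspace sketch of that resulting shape, with invented names and a pthread mutex standing in for the kernel lock (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

struct group_cputimer {
    int running;                /* is anyone sampling these totals? */
    unsigned long long utime;   /* accumulated "user time" */
    pthread_mutex_t lock;
};

struct task {
    struct group_cputimer *cputimer;    /* always valid for a live task */
};

static void account_group_user_time(struct task *tsk, unsigned long long delta)
{
    struct group_cputimer *ct = tsk->cputimer;

    if (!ct->running)
        return;                 /* fast path: nobody sampling, skip the lock */

    pthread_mutex_lock(&ct->lock);
    ct->utime += delta;
    pthread_mutex_unlock(&ct->lock);
}

int main(void)
{
    struct group_cputimer ct = { 1, 0, PTHREAD_MUTEX_INITIALIZER };
    struct task t = { &ct };

    account_group_user_time(&t, 42);
    printf("utime=%llu\n", ct.utime);   /* 42 */
    return 0;
}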
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a1779c..bded65187780 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info)
637 637
638/* 638/*
639 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
640 * - the caller must hold at least the RCU read lock 640 * - the caller must hold the RCU read lock
641 */ 641 */
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
@@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1127 1127
1128/* 1128/*
1129 * send signal info to all the members of a group 1129 * send signal info to all the members of a group
1130 * - the caller must hold the RCU read lock at least
1131 */ 1130 */
1132int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1131int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1133{ 1132{
1134 int ret = check_kill_permission(sig, info, p); 1133 int ret;
1134
1135 rcu_read_lock();
1136 ret = check_kill_permission(sig, info, p);
1137 rcu_read_unlock();
1135 1138
1136 if (!ret && sig) 1139 if (!ret && sig)
1137 ret = do_send_sig_info(sig, info, p, true); 1140 ret = do_send_sig_info(sig, info, p, true);
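The hunk above stops requiring callers of group_send_sig_info() to hold the RCU read lock and instead takes rcu_read_lock()/rcu_read_unlock() around check_kill_permission() inside the function. A rough userspace analogue of that change (take the read-side lock in the callee rather than documenting a caller requirement), with a pthread rwlock standing in for RCU and all names invented:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t cred_lock = PTHREAD_RWLOCK_INITIALIZER;
static int target_uid = 1000;

/* Stand-in permission check; 0 means allowed, -1 plays the role of -EPERM. */
static int check_kill_permission(int sender_uid)
{
    return (sender_uid == 0 || sender_uid == target_uid) ? 0 : -1;
}

static int group_send_sig_info(int sig, int sender_uid)
{
    int ret;

    pthread_rwlock_rdlock(&cred_lock);  /* read side taken here, not by caller */
    ret = check_kill_permission(sender_uid);
    pthread_rwlock_unlock(&cred_lock);

    if (!ret && sig)
        printf("would deliver signal %d\n", sig);
    return ret;
}

int main(void)
{
    return group_send_sig_info(15, 1000) ? 1 : 0;
}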
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
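The deleted file above is essentially a seq_file iterator (start/next/stop/show) exported through a debugfs file, as its comments describe. A minimal kernel-module sketch of that same pattern, not code from this tree and with all names invented; it creates /sys/kernel/debug/seqfile_demo/rows listing three dummy rows:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *demo_dir;

static void *demo_start(struct seq_file *m, loff_t *pos)
{
    return *pos < 3 ? pos : NULL;       /* three rows, keyed by *pos */
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
    ++*pos;
    return *pos < 3 ? pos : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
    /* nothing to unlock in this sketch */
}

static int demo_show(struct seq_file *m, void *v)
{
    seq_printf(m, "row %lld\n", *(loff_t *)v);
    return 0;
}

static const struct seq_operations demo_seq_ops = {
    .start = demo_start,
    .next  = demo_next,
    .stop  = demo_stop,
    .show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
    return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_fops = {
    .owner   = THIS_MODULE,
    .open    = demo_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = seq_release,
};

static int __init demo_init(void)
{
    demo_dir = debugfs_create_dir("seqfile_demo", NULL);
    if (demo_dir)
        debugfs_create_file("rows", 0400, demo_dir, NULL, &demo_fops);
    return 0;
}

static void __exit demo_exit(void)
{
    debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Reading the file with cat drives seq_read(), which in turn walks start/next/show until next returns NULL; stop runs at the end of each read pass.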
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19#include <linux/debugfs.h>
20#include "slow-work.h"
21
22static void slow_work_cull_timeout(unsigned long);
23static void slow_work_oom_timeout(unsigned long);
24
25#ifdef CONFIG_SYSCTL
26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
27 void __user *, size_t *, loff_t *);
28
29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
30 void __user *, size_t *, loff_t *);
31#endif
32
33/*
34 * The pool of threads has at least min threads in it as long as someone is
35 * using the facility, and may have as many as max.
36 *
37 * A portion of the pool may be processing very slow operations.
38 */
39static unsigned slow_work_min_threads = 2;
40static unsigned slow_work_max_threads = 4;
41static unsigned vslow_work_proportion = 50; /* % of threads that may process
42 * very slow work */
43
44#ifdef CONFIG_SYSCTL
45static const int slow_work_min_min_threads = 2;
46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
47static const int slow_work_min_vslow = 1;
48static const int slow_work_max_vslow = 99;
49
50ctl_table slow_work_sysctls[] = {
51 {
52 .procname = "min-threads",
53 .data = &slow_work_min_threads,
54 .maxlen = sizeof(unsigned),
55 .mode = 0644,
56 .proc_handler = slow_work_min_threads_sysctl,
57 .extra1 = (void *) &slow_work_min_min_threads,
58 .extra2 = &slow_work_max_threads,
59 },
60 {
61 .procname = "max-threads",
62 .data = &slow_work_max_threads,
63 .maxlen = sizeof(unsigned),
64 .mode = 0644,
65 .proc_handler = slow_work_max_threads_sysctl,
66 .extra1 = &slow_work_min_threads,
67 .extra2 = (void *) &slow_work_max_max_threads,
68 },
69 {
70 .procname = "vslow-percentage",
71 .data = &vslow_work_proportion,
72 .maxlen = sizeof(unsigned),
73 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax,
75 .extra1 = (void *) &slow_work_min_vslow,
76 .extra2 = (void *) &slow_work_max_vslow,
77 },
78 {}
79};
80#endif
81
82/*
83 * The active state of the thread pool
84 */
85static atomic_t slow_work_thread_count;
86static atomic_t vslow_work_executing_count;
87
88static bool slow_work_may_not_start_new_thread;
89static bool slow_work_cull; /* cull a thread due to lack of activity */
90static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
91static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
92static struct slow_work slow_work_new_thread; /* new thread starter */
93
94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
145 * The queues of work items and the lock governing access to them. These are
146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
147 * as the number of threads bears no relation to the number of CPUs.
148 *
149 * There are two queues of work items: one for slow work items, and one for
150 * very slow work items.
151 */
152LIST_HEAD(slow_work_queue);
153LIST_HEAD(vslow_work_queue);
154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow a work item that is hogging a thread by
159 * sleeping (where the sleep could be deferred) to yield its thread and
160 * enqueue itself.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
164
165/*
166 * The thread controls. A variable used to signal to the threads that they
167 * should exit when the queue is empty, a waitqueue used by the threads to wait
168 * for signals, and a completion set by the last thread to exit.
169 */
170static bool slow_work_threads_should_exit;
171static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
172static DECLARE_COMPLETION(slow_work_last_thread_exited);
173
174/*
174 * The number of users of the thread pool and its lock. Whilst this is zero we
175 * have no threads hanging around; when it drops back to zero, we wait for all
176 * active or queued work items to complete and kill all the threads we do have.
178 */
179static int slow_work_user_count;
180static DEFINE_MUTEX(slow_work_user_lock);
181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
196/*
197 * Calculate the maximum number of active threads in the pool that are
198 * permitted to process very slow work items.
199 *
200 * The answer is rounded up to at least 1, but may not equal or exceed the
201 * maximum number of threads in the pool. This means we always have at
202 * least one thread that can process slow work items, and we always have at
203 * least one thread that won't get tied up doing so.
204 */
205static unsigned slow_work_calc_vsmax(void)
206{
207 unsigned vsmax;
208
209 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
210 vsmax /= 100;
211 vsmax = max(vsmax, 1U);
212 return min(vsmax, slow_work_max_threads - 1);
213}
214
215/*
216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
217 * it, false if there was nothing to do.
218 */
219static noinline bool slow_work_execute(int id)
220{
221 struct slow_work *work = NULL;
222 unsigned vsmax;
223 bool very_slow;
224
225 vsmax = slow_work_calc_vsmax();
226
227 /* see if we can schedule a new thread to be started if we're not
228 * keeping up with the work */
229 if (!waitqueue_active(&slow_work_thread_wq) &&
230 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
231 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
232 !slow_work_may_not_start_new_thread)
233 slow_work_enqueue(&slow_work_new_thread);
234
235 /* find something to execute */
236 spin_lock_irq(&slow_work_queue_lock);
237 if (!list_empty(&vslow_work_queue) &&
238 atomic_read(&vslow_work_executing_count) < vsmax) {
239 work = list_entry(vslow_work_queue.next,
240 struct slow_work, link);
241 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
242 BUG();
243 list_del_init(&work->link);
244 atomic_inc(&vslow_work_executing_count);
245 very_slow = true;
246 } else if (!list_empty(&slow_work_queue)) {
247 work = list_entry(slow_work_queue.next,
248 struct slow_work, link);
249 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
250 BUG();
251 list_del_init(&work->link);
252 very_slow = false;
253 } else {
254 very_slow = false; /* avoid the compiler warning */
255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
263 spin_unlock_irq(&slow_work_queue_lock);
264
265 if (!work)
266 return false;
267
268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
269 BUG();
270
271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
274
275 if (very_slow)
276 atomic_dec(&vslow_work_executing_count);
277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
284 /* if someone tried to enqueue the item whilst we were executing it,
285 * then it'll be left unenqueued to avoid multiple threads trying to
286 * execute it simultaneously
287 *
288 * there is, however, a race between us testing the pending flag and
289 * getting the spinlock, and between the enqueuer setting the pending
290 * flag and getting the spinlock, so we use a deferral bit to tell us
291 * if the enqueuer got there first
292 */
293 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
294 spin_lock_irq(&slow_work_queue_lock);
295
296 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
297 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
298 goto auto_requeue;
299
300 spin_unlock_irq(&slow_work_queue_lock);
301 }
302
303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
307 return true;
308
309auto_requeue:
310 /* we must complete the enqueue operation
311 * - we transfer our ref on the item back to the appropriate queue
312 * - don't wake another thread up as we're awake already
313 */
314 slow_work_mark_time(work);
315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
316 list_add_tail(&work->link, &vslow_work_queue);
317 else
318 list_add_tail(&work->link, &slow_work_queue);
319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
321 return true;
322}
323
324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * @work: The work item under execution that wants to sleep
327 * @_timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
369 * slow_work_enqueue - Schedule a slow work item for processing
370 * @work: The work item to queue
371 *
372 * Schedule a slow work item for processing. If the item is already undergoing
373 * execution, this guarantees not to re-enter the execution routine until the
374 * first execution finishes.
375 *
376 * The item is pinned by this function as it retains a reference to it, managed
377 * through the item operations. The item is unpinned once it has been
378 * executed.
379 *
380 * An item may hog the thread that is running it for a relatively large amount
381 * of time, sufficient, for example, to perform several lookup, mkdir, create
382 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
383 *
384 * Conversely, if a number of items are awaiting processing, it may take some
385 * time before any given item is given attention. The number of threads in the
386 * pool may be increased to deal with demand, but only up to a limit.
387 *
388 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
389 * the very slow queue, from which only a portion of the threads will be
390 * allowed to pick items to execute. This ensures that very slow items won't
391 * overly block ones that are just ordinarily slow.
392 *
393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if an attempt is
394 * made to queue cancelled work)
395 */
396int slow_work_enqueue(struct slow_work *work)
397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
405
406 BUG_ON(slow_work_user_count <= 0);
407 BUG_ON(!work);
408 BUG_ON(!work->ops);
409
410 /* when honouring an enqueue request, we only promise that we will run
411 * the work function in the future; we do not promise to run it once
412 * per enqueue request
413 *
414 * we use the PENDING bit to merge together repeat requests without
415 * having to disable IRQs and take the spinlock, whilst still
416 * maintaining our promise
417 */
418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
427 spin_lock_irqsave(&slow_work_queue_lock, flags);
428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
432 /* we promise that we will not attempt to execute the work
433 * function in more than one thread simultaneously
434 *
435 * this, however, leaves us with a problem if we're asked to
436 * enqueue the work whilst someone is executing the work
437 * function as simply queueing the work immediately means that
438 * another thread may try executing it whilst it is already
439 * under execution
440 *
441 * to deal with this, we set the ENQ_DEFERRED bit instead of
442 * enqueueing, and the thread currently executing the work
443 * function will enqueue the work item when the work function
444 * returns and it has cleared the EXECUTING bit
445 */
446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
448 } else {
449 ret = slow_work_get_ref(work);
450 if (ret < 0)
451 goto failed;
452 slow_work_mark_time(work);
453 list_add_tail(&work->link, queue);
454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
460 }
461
462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
463 }
464 return 0;
465
466cancelled:
467 ret = -ECANCELED;
468failed:
469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
470 return ret;
471}
472EXPORT_SYMBOL(slow_work_enqueue);
473
474static int slow_work_wait(void *word)
475{
476 schedule();
477 return 0;
478}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guarenteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has finished executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = slow_work_get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
663
664/*
665 * Schedule a cull of the thread pool at some time in the near future
666 */
667static void slow_work_schedule_cull(void)
668{
669 mod_timer(&slow_work_cull_timer,
670 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
671}
672
673/*
674 * Worker thread culling algorithm
675 */
676static bool slow_work_cull_thread(void)
677{
678 unsigned long flags;
679 bool do_cull = false;
680
681 spin_lock_irqsave(&slow_work_queue_lock, flags);
682
683 if (slow_work_cull) {
684 slow_work_cull = false;
685
686 if (list_empty(&slow_work_queue) &&
687 list_empty(&vslow_work_queue) &&
688 atomic_read(&slow_work_thread_count) >
689 slow_work_min_threads) {
690 slow_work_schedule_cull();
691 do_cull = true;
692 }
693 }
694
695 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
696 return do_cull;
697}
698
699/*
700 * Determine if there is slow work available for dispatch
701 */
702static inline bool slow_work_available(int vsmax)
703{
704 return !list_empty(&slow_work_queue) ||
705 (!list_empty(&vslow_work_queue) &&
706 atomic_read(&vslow_work_executing_count) < vsmax);
707}
708
709/*
710 * Worker thread dispatcher
711 */
712static int slow_work_thread(void *_data)
713{
714 int vsmax, id;
715
716 DEFINE_WAIT(wait);
717
718 set_freezable();
719 set_user_nice(current, -5);
720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
731 for (;;) {
732 vsmax = vslow_work_proportion;
733 vsmax *= atomic_read(&slow_work_thread_count);
734 vsmax /= 100;
735
736 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
737 TASK_INTERRUPTIBLE);
738 if (!freezing(current) &&
739 !slow_work_threads_should_exit &&
740 !slow_work_available(vsmax) &&
741 !slow_work_cull)
742 schedule();
743 finish_wait(&slow_work_thread_wq, &wait);
744
745 try_to_freeze();
746
747 vsmax = vslow_work_proportion;
748 vsmax *= atomic_read(&slow_work_thread_count);
749 vsmax /= 100;
750
751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
752 cond_resched();
753 if (list_empty(&slow_work_queue) &&
754 list_empty(&vslow_work_queue) &&
755 atomic_read(&slow_work_thread_count) >
756 slow_work_min_threads)
757 slow_work_schedule_cull();
758 continue;
759 }
760
761 if (slow_work_threads_should_exit)
762 break;
763
764 if (slow_work_cull && slow_work_cull_thread())
765 break;
766 }
767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
773 if (atomic_dec_and_test(&slow_work_thread_count))
774 complete_and_exit(&slow_work_last_thread_exited, 0);
775 return 0;
776}
777
778/*
779 * Handle thread cull timer expiration
780 */
781static void slow_work_cull_timeout(unsigned long data)
782{
783 slow_work_cull = true;
784 wake_up(&slow_work_thread_wq);
785}
786
787/*
788 * Start a new slow work thread
789 */
790static void slow_work_new_thread_execute(struct slow_work *work)
791{
792 struct task_struct *p;
793
794 if (slow_work_threads_should_exit)
795 return;
796
797 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
798 return;
799
800 if (!mutex_trylock(&slow_work_user_lock))
801 return;
802
803 slow_work_may_not_start_new_thread = true;
804 atomic_inc(&slow_work_thread_count);
805 p = kthread_run(slow_work_thread, NULL, "kslowd");
806 if (IS_ERR(p)) {
807 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
808 if (atomic_dec_and_test(&slow_work_thread_count))
809 BUG(); /* we're running on a slow work thread... */
810 mod_timer(&slow_work_oom_timer,
811 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
812 } else {
813 /* ratelimit the starting of new threads */
814 mod_timer(&slow_work_oom_timer, jiffies + 1);
815 }
816
817 mutex_unlock(&slow_work_user_lock);
818}
819
820static const struct slow_work_ops slow_work_new_thread_ops = {
821 .owner = THIS_MODULE,
822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
826};
827
828/*
829 * post-OOM new thread start suppression expiration
830 */
831static void slow_work_oom_timeout(unsigned long data)
832{
833 slow_work_may_not_start_new_thread = false;
834}
835
836#ifdef CONFIG_SYSCTL
837/*
838 * Handle adjustment of the minimum number of threads
839 */
840static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
841 void __user *buffer,
842 size_t *lenp, loff_t *ppos)
843{
844 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
845 int n;
846
847 if (ret == 0) {
848 mutex_lock(&slow_work_user_lock);
849 if (slow_work_user_count > 0) {
850 /* see if we need to start or stop threads */
851 n = atomic_read(&slow_work_thread_count) -
852 slow_work_min_threads;
853
854 if (n < 0 && !slow_work_may_not_start_new_thread)
855 slow_work_enqueue(&slow_work_new_thread);
856 else if (n > 0)
857 slow_work_schedule_cull();
858 }
859 mutex_unlock(&slow_work_user_lock);
860 }
861
862 return ret;
863}
864
865/*
866 * Handle adjustment of the maximum number of threads
867 */
868static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
869 void __user *buffer,
870 size_t *lenp, loff_t *ppos)
871{
872 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
873 int n;
874
875 if (ret == 0) {
876 mutex_lock(&slow_work_user_lock);
877 if (slow_work_user_count > 0) {
878 /* see if we need to stop threads */
879 n = slow_work_max_threads -
880 atomic_read(&slow_work_thread_count);
881
882 if (n < 0)
883 slow_work_schedule_cull();
884 }
885 mutex_unlock(&slow_work_user_lock);
886 }
887
888 return ret;
889}
890#endif /* CONFIG_SYSCTL */
891
892/**
893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
895 *
896 * Register a user of the facility, starting up the initial threads if there
897 * aren't any other users at this point. This will return 0 if successful, or
898 * an error if not.
899 */
900int slow_work_register_user(struct module *module)
901{
902 struct task_struct *p;
903 int loop;
904
905 mutex_lock(&slow_work_user_lock);
906
907 if (slow_work_user_count == 0) {
908 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
909 init_completion(&slow_work_last_thread_exited);
910
911 slow_work_threads_should_exit = false;
912 slow_work_init(&slow_work_new_thread,
913 &slow_work_new_thread_ops);
914 slow_work_may_not_start_new_thread = false;
915 slow_work_cull = false;
916
917 /* start the minimum number of threads */
918 for (loop = 0; loop < slow_work_min_threads; loop++) {
919 atomic_inc(&slow_work_thread_count);
920 p = kthread_run(slow_work_thread, NULL, "kslowd");
921 if (IS_ERR(p))
922 goto error;
923 }
924 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
925 }
926
927 slow_work_user_count++;
928 mutex_unlock(&slow_work_user_lock);
929 return 0;
930
931error:
932 if (atomic_dec_and_test(&slow_work_thread_count))
933 complete(&slow_work_last_thread_exited);
934 if (loop > 0) {
935 printk(KERN_ERR "Slow work thread pool:"
936 " Aborting startup on ENOMEM\n");
937 slow_work_threads_should_exit = true;
938 wake_up_all(&slow_work_thread_wq);
939 wait_for_completion(&slow_work_last_thread_exited);
940 printk(KERN_ERR "Slow work thread pool: Aborted\n");
941 }
942 mutex_unlock(&slow_work_user_lock);
943 return PTR_ERR(p);
944}
945EXPORT_SYMBOL(slow_work_register_user);
946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
951static void slow_work_wait_for_items(struct module *module)
952{
953#ifdef CONFIG_MODULES
954 DECLARE_WAITQUEUE(myself, current);
955 struct slow_work *work;
956 int loop;
957
958 mutex_lock(&slow_work_unreg_sync_lock);
959 add_wait_queue(&slow_work_unreg_wq, &myself);
960
961 for (;;) {
962 spin_lock_irq(&slow_work_queue_lock);
963
964 /* first of all, we wait for the last queued item in each list
965 * to be processed */
966 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
967 if (work->owner == module) {
968 set_current_state(TASK_UNINTERRUPTIBLE);
969 slow_work_unreg_work_item = work;
970 goto do_wait;
971 }
972 }
973 list_for_each_entry_reverse(work, &slow_work_queue, link) {
974 if (work->owner == module) {
975 set_current_state(TASK_UNINTERRUPTIBLE);
976 slow_work_unreg_work_item = work;
977 goto do_wait;
978 }
979 }
980
981 /* then we wait for the items being processed to finish */
982 slow_work_unreg_module = module;
983 smp_mb();
984 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
985 if (slow_work_thread_processing[loop] == module)
986 goto do_wait;
987 }
988 spin_unlock_irq(&slow_work_queue_lock);
989 break; /* okay, we're done */
990
991 do_wait:
992 spin_unlock_irq(&slow_work_queue_lock);
993 schedule();
994 slow_work_unreg_work_item = NULL;
995 slow_work_unreg_module = NULL;
996 }
997
998 remove_wait_queue(&slow_work_unreg_wq, &myself);
999 mutex_unlock(&slow_work_unreg_sync_lock);
1000#endif /* CONFIG_MODULES */
1001}
1002
1003/**
1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
1006 *
1007 * Unregister a user of the facility, killing all the threads if this was the
1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
1012 */
1013void slow_work_unregister_user(struct module *module)
1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
1022 mutex_lock(&slow_work_user_lock);
1023
1024 BUG_ON(slow_work_user_count <= 0);
1025
1026 slow_work_user_count--;
1027 if (slow_work_user_count == 0) {
1028 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
1029 slow_work_threads_should_exit = true;
1030 del_timer_sync(&slow_work_cull_timer);
1031 del_timer_sync(&slow_work_oom_timer);
1032 wake_up_all(&slow_work_thread_wq);
1033 wait_for_completion(&slow_work_last_thread_exited);
1034 printk(KERN_NOTICE "Slow work thread pool:"
1035 " Shut down complete\n");
1036 }
1037
1038 mutex_unlock(&slow_work_user_lock);
1039}
1040EXPORT_SYMBOL(slow_work_unregister_user);
1041
1042/*
1043 * Initialise the slow work facility
1044 */
1045static int __init init_slow_work(void)
1046{
1047 unsigned nr_cpus = num_possible_cpus();
1048
1049 if (slow_work_max_threads < nr_cpus)
1050 slow_work_max_threads = nr_cpus;
1051#ifdef CONFIG_SYSCTL
1052 if (slow_work_max_max_threads < nr_cpus * 2)
1053 slow_work_max_max_threads = nr_cpus * 2;
1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
1065 return 0;
1066}
1067
1068subsys_initcall(init_slow_work);
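kernel/slow-work.c is deleted outright in this merge; deferred jobs of this kind are normally expressed through the workqueue API instead. A minimal kernel-module sketch of that replacement pattern (illustrative only, not code from this tree):

#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_work_fn(struct work_struct *work)
{
    pr_info("deferred work ran\n");
}

static DECLARE_WORK(demo_work, demo_work_fn);

static int __init demo_init(void)
{
    schedule_work(&demo_work);      /* run demo_work_fn() from a kworker */
    return 0;
}

static void __exit demo_exit(void)
{
    cancel_work_sync(&demo_work);   /* wait if it is still queued or running */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

schedule_work() may be called from atomic context; the handler itself runs later in process context on a shared worker thread, which is what made this API a natural home for slow, sleeping jobs.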
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 *
 6 * this code detects soft lockups: incidents where the kernel
 7 * does not reschedule on a CPU for 10 seconds or more.
8 */
9#include <linux/mm.h>
10#include <linux/cpu.h>
11#include <linux/nmi.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/freezer.h>
15#include <linux/kthread.h>
16#include <linux/lockdep.h>
17#include <linux/notifier.h>
18#include <linux/module.h>
19#include <linux/sysctl.h>
20
21#include <asm/irq_regs.h>
22
23static DEFINE_SPINLOCK(print_lock);
24
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
29
30static int __read_mostly did_panic;
31int __read_mostly softlockup_thresh = 60;
32
33/*
34 * Should we panic (and reboot, if panic_timeout= is set) when a
35 * soft-lockup occurs:
36 */
37unsigned int __read_mostly softlockup_panic =
38 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
39
40static int __init softlockup_panic_setup(char *str)
41{
42 softlockup_panic = simple_strtoul(str, NULL, 0);
43
44 return 1;
45}
46__setup("softlockup_panic=", softlockup_panic_setup);
47
48static int
49softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
50{
51 did_panic = 1;
52
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block panic_block = {
57 .notifier_call = softlock_panic,
58};
59
60/*
61 * Returns seconds, approximately. We don't need nanosecond
62 * resolution, and we don't need to waste time with a big divide when
63 * 2^30ns == 1.074s.
64 */
65static unsigned long get_timestamp(int this_cpu)
66{
67 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
68}
69
70static void __touch_softlockup_watchdog(void)
71{
72 int this_cpu = raw_smp_processor_id();
73
74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
75}
76
77void touch_softlockup_watchdog(void)
78{
79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
80}
81EXPORT_SYMBOL(touch_softlockup_watchdog);
82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
89void touch_all_softlockup_watchdogs(void)
90{
91 int cpu;
92
93 /* Cause each CPU to re-update its timestamp rather than complain */
94 for_each_online_cpu(cpu)
95 per_cpu(softlockup_touch_ts, cpu) = 0;
96}
97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
98
99int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 touch_all_softlockup_watchdogs();
104 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
105}
106
107/*
108 * This callback runs from the timer interrupt, and checks
109 * whether the watchdog thread has hung or not:
110 */
111void softlockup_tick(void)
112{
113 int this_cpu = smp_processor_id();
114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
115 unsigned long print_ts;
116 struct pt_regs *regs = get_irq_regs();
117 unsigned long now;
118
119 /* Is detection switched off? */
120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
121 /* Be sure we don't false trigger if switched back on */
122 if (touch_ts)
123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
124 return;
125 }
126
127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
136 __touch_softlockup_watchdog();
137 return;
138 }
139
140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
141
142 /* report at most once a second */
143 if (print_ts == touch_ts || did_panic)
144 return;
145
146 /* do not print during early bootup: */
147 if (unlikely(system_state != SYSTEM_RUNNING)) {
148 __touch_softlockup_watchdog();
149 return;
150 }
151
152 now = get_timestamp(this_cpu);
153
154 /*
155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan.
157 */
158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160
161 /* Warn about unreasonable delays: */
162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return;
164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
166
167 spin_lock(&print_lock);
168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
169 this_cpu, now - touch_ts,
170 current->comm, task_pid_nr(current));
171 print_modules();
172 print_irqtrace_events(current);
173 if (regs)
174 show_regs(regs);
175 else
176 dump_stack();
177 spin_unlock(&print_lock);
178
179 if (softlockup_panic)
180 panic("softlockup: hung tasks");
181}
182
183/*
184 * The watchdog thread - runs every second and touches the timestamp.
185 */
186static int watchdog(void *__bind_cpu)
187{
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 sched_setscheduler(current, SCHED_FIFO, &param);
191
192 /* initialize timestamp */
193 __touch_softlockup_watchdog();
194
195 set_current_state(TASK_INTERRUPTIBLE);
196 /*
197 * Run briefly once per second to reset the softlockup timestamp.
198 * If this gets delayed for more than 60 seconds then the
199 * debug-printout triggers in softlockup_tick().
200 */
201 while (!kthread_should_stop()) {
202 __touch_softlockup_watchdog();
203 schedule();
204
205 if (kthread_should_stop())
206 break;
207
208 set_current_state(TASK_INTERRUPTIBLE);
209 }
210 __set_current_state(TASK_RUNNING);
211
212 return 0;
213}
214
215/*
216 * Create/destroy watchdog threads as CPUs come and go:
217 */
218static int __cpuinit
219cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
220{
221 int hotcpu = (unsigned long)hcpu;
222 struct task_struct *p;
223
224 switch (action) {
225 case CPU_UP_PREPARE:
226 case CPU_UP_PREPARE_FROZEN:
227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
229 if (IS_ERR(p)) {
230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
231 return NOTIFY_BAD;
232 }
233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
234 per_cpu(softlockup_watchdog, hotcpu) = p;
235 kthread_bind(p, hotcpu);
236 break;
237 case CPU_ONLINE:
238 case CPU_ONLINE_FROZEN:
239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
240 break;
241#ifdef CONFIG_HOTPLUG_CPU
242 case CPU_UP_CANCELED:
243 case CPU_UP_CANCELED_FROZEN:
244 if (!per_cpu(softlockup_watchdog, hotcpu))
245 break;
246 /* Unbind so it can run. Fall thru. */
247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
248 cpumask_any(cpu_online_mask));
249 case CPU_DEAD:
250 case CPU_DEAD_FROZEN:
251 p = per_cpu(softlockup_watchdog, hotcpu);
252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
253 kthread_stop(p);
254 break;
255#endif /* CONFIG_HOTPLUG_CPU */
256 }
257 return NOTIFY_OK;
258}
259
260static struct notifier_block __cpuinitdata cpu_nfb = {
261 .notifier_call = cpu_callback
262};
263
264static int __initdata nosoftlockup;
265
266static int __init nosoftlockup_setup(char *str)
267{
268 nosoftlockup = 1;
269 return 1;
270}
271__setup("nosoftlockup", nosoftlockup_setup);
272
273static int __init spawn_softlockup_task(void)
274{
275 void *cpu = (void *)(long)smp_processor_id();
276 int err;
277
278 if (nosoftlockup)
279 return 0;
280
281 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
282 if (err == NOTIFY_BAD) {
283 BUG();
284 return 1;
285 }
286 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
287 register_cpu_notifier(&cpu_nfb);
288
289 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
290
291 return 0;
292}
293early_initcall(spawn_softlockup_task);
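The hotplug callback above is the canonical recipe for per-CPU kernel threads: kthread_create() at CPU_UP_PREPARE, kthread_bind() to pin the thread, wake_up_process() at CPU_ONLINE, and kthread_stop() at CPU_DEAD. The following is a minimal, hedged sketch of that same API sequence in module form (single thread pinned to CPU 0, all names invented for illustration), not the watchdog code itself:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *demo_task;	/* hypothetical worker, pinned to CPU 0 */

static int demo_thread_fn(void *unused)
{
	/* Loop until kthread_stop(), sleeping roughly one second per pass. */
	while (!kthread_should_stop()) {
		/* ... periodic per-CPU work would go here ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static int __init demo_init(void)
{
	demo_task = kthread_create(demo_thread_fn, NULL, "demo_worker/%d", 0);
	if (IS_ERR(demo_task))
		return PTR_ERR(demo_task);
	kthread_bind(demo_task, 0);	/* like kthread_bind(p, hotcpu) above */
	wake_up_process(demo_task);	/* the CPU_ONLINE step */
	return 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(demo_task);	/* the CPU_DEAD step */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");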
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 70f8d90331e9..4372ccb25127 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,9 +35,9 @@ struct cpu_stop_done {
35/* the actual stopper, one per every possible cpu, enabled on online cpus */ 35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper { 36struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */
38 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */ 40 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..e9ad44489828 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1236,15 +1236,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1236 1236
1237SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1237SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1238{ 1238{
1239 if (resource >= RLIM_NLIMITS) 1239 struct rlimit value;
1240 return -EINVAL; 1240 int ret;
1241 else { 1241
1242 struct rlimit value; 1242 ret = do_prlimit(current, resource, NULL, &value);
1243 task_lock(current->group_leader); 1243 if (!ret)
1244 value = current->signal->rlim[resource]; 1244 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1245 task_unlock(current->group_leader); 1245
1246 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1246 return ret;
1247 }
1248} 1247}
1249 1248
1250#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1249#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1272,44 +1271,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1272 1271
1273#endif 1272#endif
1274 1273
1275SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1274static inline bool rlim64_is_infinity(__u64 rlim64)
1276{ 1275{
1277 struct rlimit new_rlim, *old_rlim; 1276#if BITS_PER_LONG < 64
1278 int retval; 1277 return rlim64 >= ULONG_MAX;
1278#else
1279 return rlim64 == RLIM64_INFINITY;
1280#endif
1281}
1282
1283static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1284{
1285 if (rlim->rlim_cur == RLIM_INFINITY)
1286 rlim64->rlim_cur = RLIM64_INFINITY;
1287 else
1288 rlim64->rlim_cur = rlim->rlim_cur;
1289 if (rlim->rlim_max == RLIM_INFINITY)
1290 rlim64->rlim_max = RLIM64_INFINITY;
1291 else
1292 rlim64->rlim_max = rlim->rlim_max;
1293}
1294
1295static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1296{
1297 if (rlim64_is_infinity(rlim64->rlim_cur))
1298 rlim->rlim_cur = RLIM_INFINITY;
1299 else
1300 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1301 if (rlim64_is_infinity(rlim64->rlim_max))
1302 rlim->rlim_max = RLIM_INFINITY;
1303 else
1304 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1305}
1306
1307/* make sure you are allowed to change @tsk limits before calling this */
1308int do_prlimit(struct task_struct *tsk, unsigned int resource,
1309 struct rlimit *new_rlim, struct rlimit *old_rlim)
1310{
1311 struct rlimit *rlim;
1312 int retval = 0;
1279 1313
1280 if (resource >= RLIM_NLIMITS) 1314 if (resource >= RLIM_NLIMITS)
1281 return -EINVAL; 1315 return -EINVAL;
1282 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1316 if (new_rlim) {
1283 return -EFAULT; 1317 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1284 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1318 return -EINVAL;
1285 return -EINVAL; 1319 if (resource == RLIMIT_NOFILE &&
1286 old_rlim = current->signal->rlim + resource; 1320 new_rlim->rlim_max > sysctl_nr_open)
1287 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1321 return -EPERM;
1288 !capable(CAP_SYS_RESOURCE))
1289 return -EPERM;
1290 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1291 return -EPERM;
1292
1293 retval = security_task_setrlimit(resource, &new_rlim);
1294 if (retval)
1295 return retval;
1296
1297 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1298 /*
1299 * The caller is asking for an immediate RLIMIT_CPU
1300 * expiry. But we use the zero value to mean "it was
1301 * never set". So let's cheat and make it one second
1302 * instead
1303 */
1304 new_rlim.rlim_cur = 1;
1305 } 1322 }
1306 1323
1307 task_lock(current->group_leader); 1324 /* protect tsk->signal and tsk->sighand from disappearing */
1308 *old_rlim = new_rlim; 1325 read_lock(&tasklist_lock);
1309 task_unlock(current->group_leader); 1326 if (!tsk->sighand) {
1310 1327 retval = -ESRCH;
1311 if (resource != RLIMIT_CPU)
1312 goto out; 1328 goto out;
1329 }
1330
1331 rlim = tsk->signal->rlim + resource;
1332 task_lock(tsk->group_leader);
1333 if (new_rlim) {
1334 if (new_rlim->rlim_max > rlim->rlim_max &&
1335 !capable(CAP_SYS_RESOURCE))
1336 retval = -EPERM;
1337 if (!retval)
1338 retval = security_task_setrlimit(tsk->group_leader,
1339 resource, new_rlim);
1340 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1341 /*
1342 * The caller is asking for an immediate RLIMIT_CPU
1343 * expiry. But we use the zero value to mean "it was
1344 * never set". So let's cheat and make it one second
1345 * instead
1346 */
1347 new_rlim->rlim_cur = 1;
1348 }
1349 }
1350 if (!retval) {
1351 if (old_rlim)
1352 *old_rlim = *rlim;
1353 if (new_rlim)
1354 *rlim = *new_rlim;
1355 }
1356 task_unlock(tsk->group_leader);
1313 1357
1314 /* 1358 /*
1315 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1359 * RLIMIT_CPU handling. Note that the kernel fails to return an error
@@ -1317,14 +1361,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1317 * very long-standing error, and fixing it now risks breakage of 1361 * very long-standing error, and fixing it now risks breakage of
1318 * applications, so we live with it 1362 * applications, so we live with it
1319 */ 1363 */
1320 if (new_rlim.rlim_cur == RLIM_INFINITY) 1364 if (!retval && new_rlim && resource == RLIMIT_CPU &&
1321 goto out; 1365 new_rlim->rlim_cur != RLIM_INFINITY)
1322 1366 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1323 update_rlimit_cpu(new_rlim.rlim_cur);
1324out: 1367out:
1368 read_unlock(&tasklist_lock);
1369 return retval;
1370}
1371
1372/* rcu lock must be held */
1373static int check_prlimit_permission(struct task_struct *task)
1374{
1375 const struct cred *cred = current_cred(), *tcred;
1376
1377 tcred = __task_cred(task);
1378 if ((cred->uid != tcred->euid ||
1379 cred->uid != tcred->suid ||
1380 cred->uid != tcred->uid ||
1381 cred->gid != tcred->egid ||
1382 cred->gid != tcred->sgid ||
1383 cred->gid != tcred->gid) &&
1384 !capable(CAP_SYS_RESOURCE)) {
1385 return -EPERM;
1386 }
1387
1325 return 0; 1388 return 0;
1326} 1389}
1327 1390
1391SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1392 const struct rlimit64 __user *, new_rlim,
1393 struct rlimit64 __user *, old_rlim)
1394{
1395 struct rlimit64 old64, new64;
1396 struct rlimit old, new;
1397 struct task_struct *tsk;
1398 int ret;
1399
1400 if (new_rlim) {
1401 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1402 return -EFAULT;
1403 rlim64_to_rlim(&new64, &new);
1404 }
1405
1406 rcu_read_lock();
1407 tsk = pid ? find_task_by_vpid(pid) : current;
1408 if (!tsk) {
1409 rcu_read_unlock();
1410 return -ESRCH;
1411 }
1412 ret = check_prlimit_permission(tsk);
1413 if (ret) {
1414 rcu_read_unlock();
1415 return ret;
1416 }
1417 get_task_struct(tsk);
1418 rcu_read_unlock();
1419
1420 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1421 old_rlim ? &old : NULL);
1422
1423 if (!ret && old_rlim) {
1424 rlim_to_rlim64(&old, &old64);
1425 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1426 ret = -EFAULT;
1427 }
1428
1429 put_task_struct(tsk);
1430 return ret;
1431}
1432
1433SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1434{
1435 struct rlimit new_rlim;
1436
1437 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1438 return -EFAULT;
1439 return do_prlimit(current, resource, &new_rlim, NULL);
1440}
1441
1328/* 1442/*
1329 * It would make sense to put struct rusage in the task_struct, 1443 * It would make sense to put struct rusage in the task_struct,
1330 * except that would make the task_struct be *really big*. After 1444 * except that would make the task_struct be *really big*. After
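With do_prlimit() factored out, the new prlimit64 syscall can read and update another process's limits in one atomic step. A hedged userspace sketch, assuming a glibc recent enough (2.13+) to expose the prlimit() wrapper for this syscall; it queries RLIMIT_NOFILE of a target pid, then raises the soft limit to the hard limit while reading back the old value:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/resource.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;	/* 0 means the calling process */
	struct rlimit old, want;

	/* Get-only: pass NULL for the new limit. */
	if (prlimit(pid, RLIMIT_NOFILE, NULL, &old) != 0) {
		perror("prlimit(get)");
		return 1;
	}
	printf("pid %d: soft=%llu hard=%llu\n", (int)pid,
	       (unsigned long long)old.rlim_cur,
	       (unsigned long long)old.rlim_max);

	/* Set-and-get in one call: raise soft to hard, old value returned atomically. */
	want.rlim_cur = old.rlim_max;
	want.rlim_max = old.rlim_max;
	if (prlimit(pid, RLIMIT_NOFILE, &want, &old) != 0) {
		perror("prlimit(set)");
		return 1;
	}
	return 0;
}

Operating on a pid other than 0 requires matching credentials or CAP_SYS_RESOURCE, mirroring check_prlimit_permission() above.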
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
181 181
182/* performance counters: */ 182/* performance counters: */
183cond_syscall(sys_perf_event_open); 183cond_syscall(sys_perf_event_open);
184
185/* fanotify! */
186cond_syscall(sys_fanotify_init);
187cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..ca38e8e3e907 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,16 +44,17 @@
44#include <linux/times.h> 44#include <linux/times.h>
45#include <linux/limits.h> 45#include <linux/limits.h>
46#include <linux/dcache.h> 46#include <linux/dcache.h>
47#include <linux/dnotify.h>
47#include <linux/syscalls.h> 48#include <linux/syscalls.h>
48#include <linux/vmstat.h> 49#include <linux/vmstat.h>
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/acpi.h> 51#include <linux/acpi.h>
51#include <linux/reboot.h> 52#include <linux/reboot.h>
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/slow-work.h>
54#include <linux/perf_event.h> 54#include <linux/perf_event.h>
55#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/processor.h> 60#include <asm/processor.h>
@@ -76,15 +77,16 @@
76#include <scsi/sg.h> 77#include <scsi/sg.h>
77#endif 78#endif
78 79
80#ifdef CONFIG_LOCKUP_DETECTOR
81#include <linux/nmi.h>
82#endif
83
79 84
80#if defined(CONFIG_SYSCTL) 85#if defined(CONFIG_SYSCTL)
81 86
82/* External variables not in a header file. */ 87/* External variables not in a header file. */
83extern int sysctl_overcommit_memory; 88extern int sysctl_overcommit_memory;
84extern int sysctl_overcommit_ratio; 89extern int sysctl_overcommit_ratio;
85extern int sysctl_panic_on_oom;
86extern int sysctl_oom_kill_allocating_task;
87extern int sysctl_oom_dump_tasks;
88extern int max_threads; 90extern int max_threads;
89extern int core_uses_pid; 91extern int core_uses_pid;
90extern int suid_dumpable; 92extern int suid_dumpable;
@@ -106,7 +108,7 @@ extern int blk_iopoll_enabled;
106#endif 108#endif
107 109
108/* Constants used for minimum and maximum */ 110/* Constants used for minimum and maximum */
109#ifdef CONFIG_DETECT_SOFTLOCKUP 111#ifdef CONFIG_LOCKUP_DETECTOR
110static int sixty = 60; 112static int sixty = 60;
111static int neg_one = -1; 113static int neg_one = -1;
112#endif 114#endif
@@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
130 132
131static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
132 134
135#ifdef CONFIG_INOTIFY_USER
136#include <linux/inotify.h>
137#endif
133#ifdef CONFIG_SPARC 138#ifdef CONFIG_SPARC
134#include <asm/system.h> 139#include <asm/system.h>
135#endif 140#endif
@@ -206,9 +211,6 @@ static struct ctl_table fs_table[];
206static struct ctl_table debug_table[]; 211static struct ctl_table debug_table[];
207static struct ctl_table dev_table[]; 212static struct ctl_table dev_table[];
208extern struct ctl_table random_table[]; 213extern struct ctl_table random_table[];
209#ifdef CONFIG_INOTIFY_USER
210extern struct ctl_table inotify_table[];
211#endif
212#ifdef CONFIG_EPOLL 214#ifdef CONFIG_EPOLL
213extern struct ctl_table epoll_table[]; 215extern struct ctl_table epoll_table[];
214#endif 216#endif
@@ -562,7 +564,7 @@ static struct ctl_table kern_table[] = {
562 .extra2 = &one, 564 .extra2 = &one,
563 }, 565 },
564#endif 566#endif
565#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 567#ifdef CONFIG_HOTPLUG
566 { 568 {
567 .procname = "hotplug", 569 .procname = "hotplug",
568 .data = &uevent_helper, 570 .data = &uevent_helper,
@@ -710,7 +712,34 @@ static struct ctl_table kern_table[] = {
710 .mode = 0444, 712 .mode = 0444,
711 .proc_handler = proc_dointvec, 713 .proc_handler = proc_dointvec,
712 }, 714 },
713#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 715#if defined(CONFIG_LOCKUP_DETECTOR)
716 {
717 .procname = "watchdog",
718 .data = &watchdog_enabled,
719 .maxlen = sizeof (int),
720 .mode = 0644,
721 .proc_handler = proc_dowatchdog_enabled,
722 },
723 {
724 .procname = "watchdog_thresh",
725 .data = &softlockup_thresh,
726 .maxlen = sizeof(int),
727 .mode = 0644,
728 .proc_handler = proc_dowatchdog_thresh,
729 .extra1 = &neg_one,
730 .extra2 = &sixty,
731 },
732 {
733 .procname = "softlockup_panic",
734 .data = &softlockup_panic,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = proc_dointvec_minmax,
738 .extra1 = &zero,
739 .extra2 = &one,
740 },
741#endif
742#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
714 { 743 {
715 .procname = "unknown_nmi_panic", 744 .procname = "unknown_nmi_panic",
716 .data = &unknown_nmi_panic, 745 .data = &unknown_nmi_panic,
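Since these entries live in kern_table, the new knobs should surface as /proc/sys/kernel/watchdog, /proc/sys/kernel/watchdog_thresh and /proc/sys/kernel/softlockup_panic (the path mapping is an assumption based on the usual sysctl layout, not something the hunk states). A small sketch that just reads them back:

#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");	/* assumed /proc/sys path, see note above */

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);	/* value already ends in '\n' */
	fclose(f);
}

int main(void)
{
	show("/proc/sys/kernel/watchdog");
	show("/proc/sys/kernel/watchdog_thresh");
	show("/proc/sys/kernel/softlockup_panic");
	return 0;
}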
@@ -813,26 +842,6 @@ static struct ctl_table kern_table[] = {
813 .proc_handler = proc_dointvec, 842 .proc_handler = proc_dointvec,
814 }, 843 },
815#endif 844#endif
816#ifdef CONFIG_DETECT_SOFTLOCKUP
817 {
818 .procname = "softlockup_panic",
819 .data = &softlockup_panic,
820 .maxlen = sizeof(int),
821 .mode = 0644,
822 .proc_handler = proc_dointvec_minmax,
823 .extra1 = &zero,
824 .extra2 = &one,
825 },
826 {
827 .procname = "softlockup_thresh",
828 .data = &softlockup_thresh,
829 .maxlen = sizeof(int),
830 .mode = 0644,
831 .proc_handler = proc_dosoftlockup_thresh,
832 .extra1 = &neg_one,
833 .extra2 = &sixty,
834 },
835#endif
836#ifdef CONFIG_DETECT_HUNG_TASK 845#ifdef CONFIG_DETECT_HUNG_TASK
837 { 846 {
838 .procname = "hung_task_panic", 847 .procname = "hung_task_panic",
@@ -906,13 +915,6 @@ static struct ctl_table kern_table[] = {
906 .proc_handler = proc_dointvec, 915 .proc_handler = proc_dointvec,
907 }, 916 },
908#endif 917#endif
909#ifdef CONFIG_SLOW_WORK
910 {
911 .procname = "slow-work",
912 .mode = 0555,
913 .child = slow_work_sysctls,
914 },
915#endif
916#ifdef CONFIG_PERF_EVENTS 918#ifdef CONFIG_PERF_EVENTS
917 { 919 {
918 .procname = "perf_event_paranoid", 920 .procname = "perf_event_paranoid",
diff --git a/kernel/time.c b/kernel/time.c
index 848b1c2ab09a..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
300} 300}
301EXPORT_SYMBOL(timespec_trunc); 301EXPORT_SYMBOL(timespec_trunc);
302 302
303#ifndef CONFIG_GENERIC_TIME
304/*
305 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
306 * and therefore only yields usec accuracy
307 */
308void getnstimeofday(struct timespec *tv)
309{
310 struct timeval x;
311
312 do_gettimeofday(&x);
313 tv->tv_sec = x.tv_sec;
314 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
315}
316EXPORT_SYMBOL_GPL(getnstimeofday);
317#endif
318
319/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 303/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
320 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 304 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
321 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 305 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
6 6
7config NO_HZ 7config NO_HZ
8 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
9 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 9 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
10 select TICK_ONESHOT 10 select TICK_ONESHOT
11 help 11 help
12 This option enables a tickless system: timer interrupts will 12 This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
15 15
16config HIGH_RES_TIMERS 16config HIGH_RES_TIMERS
17 bool "High Resolution Timer Support" 17 bool "High Resolution Timer Support"
18 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 18 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
19 select TICK_ONESHOT 19 select TICK_ONESHOT
20 help 20 help
21 This option enables high resolution timer support. If your 21 This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 return max_nsecs - (max_nsecs >> 5); 531 return max_nsecs - (max_nsecs >> 5);
532} 532}
533 533
534#ifdef CONFIG_GENERIC_TIME 534#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
535 535
536/** 536/**
537 * clocksource_select - Select the best clocksource available 537 * clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
577 } 577 }
578} 578}
579 579
580#else /* CONFIG_GENERIC_TIME */ 580#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
581 581
582static inline void clocksource_select(void) { } 582static inline void clocksource_select(void) { }
583 583
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
639#define MAX_UPDATE_LENGTH 5 /* Seconds */ 639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640 640
641/** 641/**
 642 * __clocksource_register_scale - Used to install new clocksources 642 * __clocksource_updatefreq_scale - Used to update clocksource with new freq
643 * @t: clocksource to be registered 643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz 644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale 645 * @freq: clocksource frequency (cycles per second) divided by scale
646 * 646 *
647 * Returns -EBUSY if registration fails, zero otherwise. 647 * This should only be called from the clocksource->enable() method.
648 * 648 *
649 * This *SHOULD NOT* be called directly! Please use the 649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions. 650 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
651 */ 651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{ 653{
654
655 /* 654 /*
656 * Ideally we want to use some of the limits used in 655 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed 656 * clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
662 NSEC_PER_SEC/scale, 661 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale); 662 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 663 cs->max_idle_ns = clocksource_max_deferment(cs);
664}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
666
667/**
668 * __clocksource_register_scale - Used to install new clocksources
669 * @t: clocksource to be registered
670 * @scale: Scale factor multiplied against freq to get clocksource hz
671 * @freq: clocksource frequency (cycles per second) divided by scale
672 *
673 * Returns -EBUSY if registration fails, zero otherwise.
674 *
675 * This *SHOULD NOT* be called directly! Please use the
676 * clocksource_register_hz() or clocksource_register_khz helper functions.
677 */
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{
680
 681 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq);
665 683
 684 /* Add clocksource to the clocksource list */
666 mutex_lock(&clocksource_mutex); 685 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs); 686 clocksource_enqueue(cs);
668 clocksource_select(); 687 clocksource_select();
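The split above separates one-time installation from later rate updates: drivers call clocksource_register_hz()/clocksource_register_khz() at init, and clocksource_updatefreq_hz()/clocksource_updatefreq_khz() from the ->enable() path once the true frequency is known. A hedged registration sketch for a hypothetical free-running 32-bit MMIO counter (the device, base pointer and rating are invented; the clocksource fields and helpers are the real API of this kernel):

#include <linux/clocksource.h>
#include <linux/io.h>

/* Hypothetical MMIO base of a free-running 32-bit counter. */
static void __iomem *demo_counter_base;

static cycle_t demo_counter_read(struct clocksource *cs)
{
	return (cycle_t)readl(demo_counter_base);
}

static struct clocksource demo_clocksource = {
	.name	= "demo-counter",
	.rating	= 200,			/* arbitrary mid-range rating */
	.read	= demo_counter_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int demo_clocksource_init(void __iomem *base, u32 rate_hz)
{
	demo_counter_base = base;
	/* mult/shift and max_idle_ns are computed internally from the rate. */
	return clocksource_register_hz(&demo_clocksource, rate_hz);
}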
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
188 /* 188 /*
189 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
190 * periodic mode. We read dev->next_event first and add to it 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event() 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really 192 * sets dev->next_event only when the event is really
193 * programmed to the device. 193 * programmed to the device.
194 */ 194 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..3e216e01bbd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
325 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
326 326
327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { 328 arch_needs_cpu(cpu)) {
329 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
330 delta_jiffies = 1; 330 delta_jiffies = 1;
331 } else { 331 } else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
405 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
409 /*
410 * sched tick not stopped!
411 */
412 cpumask_clear_cpu(cpu, nohz_cpu_mask);
413 goto out;
414 }
415 409
416 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
417 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void)
780{ 774{
781 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 775 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
782 ktime_t now = ktime_get(); 776 ktime_t now = ktime_get();
783 u64 offset;
784 777
785 /* 778 /*
786 * Emulate tick processing via per-CPU hrtimers: 779 * Emulate tick processing via per-CPU hrtimers:
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void)
790 783
791 /* Get the next period (per cpu) */ 784 /* Get the next period (per cpu) */
792 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 785 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
793 offset = ktime_to_ns(tick_period) >> 1;
794 do_div(offset, num_possible_cpus());
795 offset *= smp_processor_id();
796 hrtimer_add_expires_ns(&ts->sched_timer, offset);
797 786
798 for (;;) { 787 for (;;) {
799 hrtimer_forward(&ts->sched_timer, now, tick_period); 788 hrtimer_forward(&ts->sched_timer, now, tick_period);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..49010d822f72 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
153 * - wall_to_monotonic is no longer the boot time, getboottime must be 153 * - wall_to_monotonic is no longer the boot time, getboottime must be
154 * used instead. 154 * used instead.
155 */ 155 */
156struct timespec xtime __attribute__ ((aligned (16))); 156static struct timespec xtime __attribute__ ((aligned (16)));
157struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
158static struct timespec total_sleep_time; 158static struct timespec total_sleep_time;
159 159
160/* 160/*
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond)
170{ 170{
171 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
172 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 173 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
174 timekeeper.mult);
174} 175}
175 176
176#ifdef CONFIG_GENERIC_TIME
177
178/** 177/**
179 * timekeeping_forward_now - update clock to the current time 178 * timekeeping_forward_now - update clock to the current time
180 * 179 *
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
328 timekeeper.ntp_error = 0; 327 timekeeper.ntp_error = 0;
329 ntp_clear(); 328 ntp_clear();
330 329
331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 330 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
331 timekeeper.mult);
332 332
333 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
334 334
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
376 tick_clock_notify(); 376 tick_clock_notify();
377} 377}
378 378
379#else /* GENERIC_TIME */
380
381static inline void timekeeping_forward_now(void) { }
382
383/**
384 * ktime_get - get the monotonic time in ktime_t format
385 *
386 * returns the time in ktime_t format
387 */
388ktime_t ktime_get(void)
389{
390 struct timespec now;
391
392 ktime_get_ts(&now);
393
394 return timespec_to_ktime(now);
395}
396EXPORT_SYMBOL_GPL(ktime_get);
397
398/**
399 * ktime_get_ts - get the monotonic clock in timespec format
400 * @ts: pointer to timespec variable
401 *
402 * The function calculates the monotonic clock from the realtime
403 * clock and the wall_to_monotonic offset and stores the result
404 * in normalized timespec format in the variable pointed to by @ts.
405 */
406void ktime_get_ts(struct timespec *ts)
407{
408 struct timespec tomono;
409 unsigned long seq;
410
411 do {
412 seq = read_seqbegin(&xtime_lock);
413 getnstimeofday(ts);
414 tomono = wall_to_monotonic;
415
416 } while (read_seqretry(&xtime_lock, seq));
417
418 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
419 ts->tv_nsec + tomono.tv_nsec);
420}
421EXPORT_SYMBOL_GPL(ktime_get_ts);
422
423#endif /* !GENERIC_TIME */
424
425/** 379/**
426 * ktime_get_real - get the real (wall-) time in ktime_t format 380 * ktime_get_real - get the real (wall-) time in ktime_t format
427 * 381 *
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev)
579 533
580 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
581 ts = timespec_sub(ts, timekeeping_suspend_time); 535 ts = timespec_sub(ts, timekeeping_suspend_time);
582 xtime = timespec_add_safe(xtime, ts); 536 xtime = timespec_add(xtime, ts);
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 538 total_sleep_time = timespec_add(total_sleep_time, ts);
585 } 539 }
586 /* re-base the last cycle value */ 540 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset)
736static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 690static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
737{ 691{
738 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 692 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
693 u64 raw_nsecs;
739 694
740 /* If the offset is smaller then a shifted interval, do nothing */ 695 /* If the offset is smaller then a shifted interval, do nothing */
741 if (offset < timekeeper.cycle_interval<<shift) 696 if (offset < timekeeper.cycle_interval<<shift)
@@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
752 second_overflow(); 707 second_overflow();
753 } 708 }
754 709
755 /* Accumulate into raw time */ 710 /* Accumulate raw time */
756 raw_time.tv_nsec += timekeeper.raw_interval << shift;; 711 raw_nsecs = timekeeper.raw_interval << shift;
757 while (raw_time.tv_nsec >= NSEC_PER_SEC) { 712 raw_nsecs += raw_time.tv_nsec;
758 raw_time.tv_nsec -= NSEC_PER_SEC; 713 if (raw_nsecs >= NSEC_PER_SEC) {
759 raw_time.tv_sec++; 714 u64 raw_secs = raw_nsecs;
715 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
716 raw_time.tv_sec += raw_secs;
760 } 717 }
718 raw_time.tv_nsec = raw_nsecs;
761 719
762 /* Accumulate error between NTP and clock interval */ 720 /* Accumulate error between NTP and clock interval */
763 timekeeper.ntp_error += tick_length << shift; 721 timekeeper.ntp_error += tick_length << shift;
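The rework above swaps the old subtract-one-second-at-a-time loop for a single 64-bit divide, which matters because raw_interval << shift can cover many seconds in one accumulation pass. A tiny userspace illustration of the same arithmetic (ordinary / and % standing in for the kernel's do_div(), values made up):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	/* Pretend the shifted interval plus the old tv_nsec spans ~3.6 seconds. */
	uint64_t raw_nsecs = 3500000000ULL + 123456789ULL;
	uint64_t secs = raw_nsecs / NSEC_PER_SEC;	/* do_div() quotient: whole seconds */
	uint64_t nsec = raw_nsecs % NSEC_PER_SEC;	/* do_div() remainder: leftover ns */

	printf("carry %llu full second(s), keep %llu ns\n",
	       (unsigned long long)secs, (unsigned long long)nsec);
	return 0;
}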
@@ -784,10 +742,11 @@ void update_wall_time(void)
784 return; 742 return;
785 743
786 clock = timekeeper.clock; 744 clock = timekeeper.clock;
787#ifdef CONFIG_GENERIC_TIME 745
788 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 746#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
789#else
790 offset = timekeeper.cycle_interval; 747 offset = timekeeper.cycle_interval;
748#else
749 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
791#endif 750#endif
792 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 751 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
793 752
@@ -856,7 +815,8 @@ void update_wall_time(void)
856 } 815 }
857 816
858 /* check to see if there is a new clocksource to use */ 817 /* check to see if there is a new clocksource to use */
859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 818 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
819 timekeeper.mult);
860} 820}
861 821
862/** 822/**
@@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime);
887 */ 847 */
888void monotonic_to_bootbased(struct timespec *ts) 848void monotonic_to_bootbased(struct timespec *ts)
889{ 849{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 850 *ts = timespec_add(*ts, total_sleep_time);
891} 851}
892EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 852EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
893 853
@@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void)
902 return xtime; 862 return xtime;
903} 863}
904 864
865struct timespec __get_wall_to_monotonic(void)
866{
867 return wall_to_monotonic;
868}
869
905struct timespec current_kernel_time(void) 870struct timespec current_kernel_time(void)
906{ 871{
907 struct timespec now; 872 struct timespec now;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..97bf05baade7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/* 91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of 92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB for 93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * the new flag to indicate whether the timer is deferrable 94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
95 */ 100 */
96#define TBASE_DEFERRABLE_FLAG (0x1) 101#define TBASE_DEFERRABLE_FLAG (0x1)
97 102
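As a concrete use of the comment above: a low-priority polling timer can be marked deferrable so it never pulls a CPU out of idle on its own. A hedged module-style sketch using the long-standing init_timer_deferrable() helper (the on-stack setup variant exported later in this patch follows the same idea; all names here are invented):

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list poll_timer;

static void poll_timer_fn(unsigned long data)
{
	/* ... poll some non-urgent status here ... */
	mod_timer(&poll_timer, jiffies + 10 * HZ);	/* re-arm; stays deferrable */
}

static int __init deferrable_demo_init(void)
{
	init_timer_deferrable(&poll_timer);	/* serviced only when the CPU is awake anyway */
	poll_timer.function = poll_timer_fn;
	poll_timer.data = 0;
	mod_timer(&poll_timer, jiffies + 10 * HZ);
	return 0;
}

static void __exit deferrable_demo_exit(void)
{
	del_timer_sync(&poll_timer);
}

module_init(deferrable_demo_init);
module_exit(deferrable_demo_exit);
MODULE_LICENSE("GPL");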
@@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 326
322/** 327/**
323 * set_timer_slack - set the allowed slack for a timer 328 * set_timer_slack - set the allowed slack for a timer
329 * @timer: the timer to be modified
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding 330 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 * 331 *
326 * Set the amount of time, in jiffies, that a certain timer has 332 * Set the amount of time, in jiffies, that a certain timer has
@@ -577,6 +583,19 @@ static void __init_timer(struct timer_list *timer,
577 lockdep_init_map(&timer->lockdep_map, name, key, 0); 583 lockdep_init_map(&timer->lockdep_map, name, key, 0);
578} 584}
579 585
586void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
587 const char *name,
588 struct lock_class_key *key,
589 void (*function)(unsigned long),
590 unsigned long data)
591{
592 timer->function = function;
593 timer->data = data;
594 init_timer_on_stack_key(timer, name, key);
595 timer_set_deferrable(timer);
596}
597EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
598
580/** 599/**
581 * init_timer_key - initialize a timer 600 * init_timer_key - initialize a timer
582 * @timer: the timer to be initialized 601 * @timer: the timer to be initialized
@@ -679,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
679 cpu = smp_processor_id(); 698 cpu = smp_processor_id();
680 699
681#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 700#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
682 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 701 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
683 int preferred_cpu = get_nohz_load_balancer(); 702 cpu = get_nohz_timer_target();
684
685 if (preferred_cpu >= 0)
686 cpu = preferred_cpu;
687 }
688#endif 703#endif
689 new_base = per_cpu(tvec_bases, cpu); 704 new_base = per_cpu(tvec_bases, cpu);
690 705
@@ -1289,7 +1304,6 @@ void run_local_timers(void)
1289{ 1304{
1290 hrtimer_run_queues(); 1305 hrtimer_run_queues();
1291 raise_softirq(TIMER_SOFTIRQ); 1306 raise_softirq(TIMER_SOFTIRQ);
1292 softlockup_tick();
1293} 1307}
1294 1308
1295/* 1309/*
@@ -1750,3 +1764,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
1750} 1764}
1751 1765
1752EXPORT_SYMBOL(msleep_interruptible); 1766EXPORT_SYMBOL(msleep_interruptible);
1767
1768static int __sched do_usleep_range(unsigned long min, unsigned long max)
1769{
1770 ktime_t kmin;
1771 unsigned long delta;
1772
1773 kmin = ktime_set(0, min * NSEC_PER_USEC);
1774 delta = (max - min) * NSEC_PER_USEC;
1775 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1776}
1777
1778/**
 1779 * usleep_range - Drop-in replacement for udelay where wakeup is flexible
1780 * @min: Minimum time in usecs to sleep
1781 * @max: Maximum time in usecs to sleep
1782 */
1783void usleep_range(unsigned long min, unsigned long max)
1784{
1785 __set_current_state(TASK_UNINTERRUPTIBLE);
1786 do_usleep_range(min, max);
1787}
1788EXPORT_SYMBOL(usleep_range);
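usleep_range() gives the hrtimer code a window in which the wakeup may be coalesced with other timers, so it suits sleeps in the tens-to-hundreds of microseconds where the caller is allowed to sleep and udelay() would just burn CPU. A hedged sketch of the typical driver pattern, polling a hypothetical ready bit after kicking some hardware (register, bit and timeout are invented):

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/io.h>

/* Wait up to ~20 ms for an assumed READY bit (bit 0) in a status register. */
int demo_wait_ready(void __iomem *status_reg)
{
	int tries;

	for (tries = 0; tries < 100; tries++) {
		if (readl(status_reg) & 0x1)
			return 0;
		/* Sleep 100-200 us; any wakeup inside the window is acceptable. */
		usleep_range(100, 200);
	}
	return -ETIMEDOUT;
}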
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..538501c6ea50 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -153,7 +153,7 @@ config IRQSOFF_TRACER
153 bool "Interrupts-off Latency Tracer" 153 bool "Interrupts-off Latency Tracer"
154 default n 154 default n
155 depends on TRACE_IRQFLAGS_SUPPORT 155 depends on TRACE_IRQFLAGS_SUPPORT
156 depends on GENERIC_TIME 156 depends on !ARCH_USES_GETTIMEOFFSET
157 select TRACE_IRQFLAGS 157 select TRACE_IRQFLAGS
158 select GENERIC_TRACER 158 select GENERIC_TRACER
159 select TRACER_MAX_TRACE 159 select TRACER_MAX_TRACE
@@ -175,7 +175,7 @@ config IRQSOFF_TRACER
175config PREEMPT_TRACER 175config PREEMPT_TRACER
176 bool "Preemption-off Latency Tracer" 176 bool "Preemption-off Latency Tracer"
177 default n 177 default n
178 depends on GENERIC_TIME 178 depends on !ARCH_USES_GETTIMEOFFSET
179 depends on PREEMPT 179 depends on PREEMPT
180 select GENERIC_TRACER 180 select GENERIC_TRACER
181 select TRACER_MAX_TRACE 181 select TRACER_MAX_TRACE
@@ -194,15 +194,6 @@ config PREEMPT_TRACER
194 enabled. This option and the irqs-off timing option can be 194 enabled. This option and the irqs-off timing option can be
195 used together or separately.) 195 used together or separately.)
196 196
197config SYSPROF_TRACER
198 bool "Sysprof Tracer"
199 depends on X86
200 select GENERIC_TRACER
201 select CONTEXT_SWITCH_TRACER
202 help
203 This tracer provides the trace needed by the 'Sysprof' userspace
204 tool.
205
206config SCHED_TRACER 197config SCHED_TRACER
207 bool "Scheduling Latency Tracer" 198 bool "Scheduling Latency Tracer"
208 select GENERIC_TRACER 199 select GENERIC_TRACER
@@ -229,23 +220,6 @@ config FTRACE_SYSCALLS
229 help 220 help
230 Basic tracer to catch the syscall entry and exit events. 221 Basic tracer to catch the syscall entry and exit events.
231 222
232config BOOT_TRACER
233 bool "Trace boot initcalls"
234 select GENERIC_TRACER
235 select CONTEXT_SWITCH_TRACER
236 help
237 This tracer helps developers to optimize boot times: it records
238 the timings of the initcalls and traces key events and the identity
239 of tasks that can cause boot delays, such as context-switches.
240
241 Its aim is to be parsed by the scripts/bootgraph.pl tool to
242 produce pretty graphics about boot inefficiencies, giving a visual
243 representation of the delays during initcalls - but the raw
244 /debug/tracing/trace text output is readable too.
245
246 You must pass in initcall_debug and ftrace=initcall to the kernel
247 command line to enable this on bootup.
248
249config TRACE_BRANCH_PROFILING 223config TRACE_BRANCH_PROFILING
250 bool 224 bool
251 select GENERIC_TRACER 225 select GENERIC_TRACER
@@ -325,28 +299,6 @@ config BRANCH_TRACER
325 299
326 Say N if unsure. 300 Say N if unsure.
327 301
328config KSYM_TRACER
329 bool "Trace read and write access on kernel memory locations"
330 depends on HAVE_HW_BREAKPOINT
331 select TRACING
332 help
333 This tracer helps find read and write operations on any given kernel
334 symbol i.e. /proc/kallsyms.
335
336config PROFILE_KSYM_TRACER
337 bool "Profile all kernel memory accesses on 'watched' variables"
338 depends on KSYM_TRACER
339 help
340 This tracer profiles kernel accesses on variables watched through the
341 ksym tracer ftrace plugin. Depending upon the hardware, all read
342 and write operations on kernel variables can be monitored for
343 accesses.
344
345 The results will be displayed in:
346 /debugfs/tracing/profile_ksym
347
348 Say N if unsure.
349
350config STACK_TRACER 302config STACK_TRACER
351 bool "Trace max stack" 303 bool "Trace max stack"
352 depends on HAVE_FUNCTION_TRACER 304 depends on HAVE_FUNCTION_TRACER
@@ -371,37 +323,6 @@ config STACK_TRACER
371 323
372 Say N if unsure. 324 Say N if unsure.
373 325
374config KMEMTRACE
375 bool "Trace SLAB allocations"
376 select GENERIC_TRACER
377 help
378 kmemtrace provides tracing for slab allocator functions, such as
379 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
380 data is then fed to the userspace application in order to analyse
381 allocation hotspots, internal fragmentation and so on, making it
382 possible to see how well an allocator performs, as well as debug
383 and profile kernel code.
384
385 This requires an userspace application to use. See
386 Documentation/trace/kmemtrace.txt for more information.
387
388 Saying Y will make the kernel somewhat larger and slower. However,
389 if you disable kmemtrace at run-time or boot-time, the performance
390 impact is minimal (depending on the arch the kernel is built for).
391
392 If unsure, say N.
393
394config WORKQUEUE_TRACER
395 bool "Trace workqueues"
396 select GENERIC_TRACER
397 help
398 The workqueue tracer provides some statistical information
399 about each cpu workqueue thread such as the number of the
400 works inserted and executed since their creation. It can help
401 to evaluate the amount of work each of them has to perform.
402 For example it can help a developer to decide whether he should
403 choose a per-cpu workqueue instead of a singlethreaded one.
404
405config BLK_DEV_IO_TRACE 326config BLK_DEV_IO_TRACE
406 bool "Support for tracing block IO actions" 327 bool "Support for tracing block IO actions"
407 depends on SYSFS 328 depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
30obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
31obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
33obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
34obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 33obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
35obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 34obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
36obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 35obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
38obj-$(CONFIG_NOP_TRACER) += trace_nop.o 37obj-$(CONFIG_NOP_TRACER) += trace_nop.o
39obj-$(CONFIG_STACK_TRACER) += trace_stack.o 38obj-$(CONFIG_STACK_TRACER) += trace_stack.o
40obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
47ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif 52endif
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
59obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_EVENT_TRACING) += power-traces.o
56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif
60 59
61libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 638711c17504..959f8d6c8cc1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 170 BLK_TC_ACT(BLK_TC_WRITE) };
171 171
172#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
173#define BLK_TC_RAHEAD BLK_TC_AHEAD
174
172/* The ilog2() calls fall out because they're constant */ 175/* The ilog2() calls fall out because they're constant */
173#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ 176#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
174 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) 177 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
175 178
176/* 179/*
177 * The worker for the various blk_add_trace*() types. Fills out a 180 * The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
194 return; 197 return;
195 198
196 what |= ddir_act[rw & WRITE]; 199 what |= ddir_act[rw & WRITE];
197 what |= MASK_TC_BIT(rw, BARRIER); 200 what |= MASK_TC_BIT(rw, HARDBARRIER);
198 what |= MASK_TC_BIT(rw, SYNCIO); 201 what |= MASK_TC_BIT(rw, SYNC);
199 what |= MASK_TC_BIT(rw, AHEAD); 202 what |= MASK_TC_BIT(rw, RAHEAD);
200 what |= MASK_TC_BIT(rw, META); 203 what |= MASK_TC_BIT(rw, META);
201 what |= MASK_TC_BIT(rw, DISCARD); 204 what |= MASK_TC_BIT(rw, DISCARD);
202 205
@@ -549,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
549} 552}
550EXPORT_SYMBOL_GPL(blk_trace_setup); 553EXPORT_SYMBOL_GPL(blk_trace_setup);
551 554
555#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
556static int compat_blk_trace_setup(struct request_queue *q, char *name,
557 dev_t dev, struct block_device *bdev,
558 char __user *arg)
559{
560 struct blk_user_trace_setup buts;
561 struct compat_blk_user_trace_setup cbuts;
562 int ret;
563
564 if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
565 return -EFAULT;
566
567 buts = (struct blk_user_trace_setup) {
568 .act_mask = cbuts.act_mask,
569 .buf_size = cbuts.buf_size,
570 .buf_nr = cbuts.buf_nr,
571 .start_lba = cbuts.start_lba,
572 .end_lba = cbuts.end_lba,
573 .pid = cbuts.pid,
574 };
575 memcpy(&buts.name, &cbuts.name, 32);
576
577 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
578 if (ret)
579 return ret;
580
581 if (copy_to_user(arg, &buts.name, 32)) {
582 blk_trace_remove(q);
583 return -EFAULT;
584 }
585
586 return 0;
587}
588#endif
589
552int blk_trace_startstop(struct request_queue *q, int start) 590int blk_trace_startstop(struct request_queue *q, int start)
553{ 591{
554 int ret; 592 int ret;
@@ -601,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
601 if (!q) 639 if (!q)
602 return -ENXIO; 640 return -ENXIO;
603 641
642 lock_kernel();
604 mutex_lock(&bdev->bd_mutex); 643 mutex_lock(&bdev->bd_mutex);
605 644
606 switch (cmd) { 645 switch (cmd) {
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
608 bdevname(bdev, b); 647 bdevname(bdev, b);
609 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); 648 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
610 break; 649 break;
650#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
651 case BLKTRACESETUP32:
652 bdevname(bdev, b);
653 ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
654 break;
655#endif
611 case BLKTRACESTART: 656 case BLKTRACESTART:
612 start = 1; 657 start = 1;
613 case BLKTRACESTOP: 658 case BLKTRACESTOP:
@@ -622,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
622 } 667 }
623 668
624 mutex_unlock(&bdev->bd_mutex); 669 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
625 return ret; 671 return ret;
626} 672}
627 673
@@ -661,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
661 if (likely(!bt)) 707 if (likely(!bt))
662 return; 708 return;
663 709
664 if (blk_discard_rq(rq)) 710 if (rq->cmd_flags & REQ_DISCARD)
665 rw |= (1 << BIO_RW_DISCARD); 711 rw |= REQ_DISCARD;
712
713 if (rq->cmd_flags & REQ_SECURE)
714 rw |= REQ_SECURE;
666 715
667 if (blk_pc_request(rq)) { 716 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
668 what |= BLK_TC_ACT(BLK_TC_PC); 717 what |= BLK_TC_ACT(BLK_TC_PC);
669 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 718 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
670 what, rq->errors, rq->cmd_len, rq->cmd); 719 what, rq->errors, rq->cmd_len, rq->cmd);
@@ -925,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q,
925 if (likely(!bt)) 974 if (likely(!bt))
926 return; 975 return;
927 976
928 if (blk_pc_request(rq)) 977 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
929 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 978 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
930 BLK_TA_DRV_DATA, rq->errors, len, data); 979 BLK_TA_DRV_DATA, rq->errors, len, data);
931 else 980 else
@@ -1730,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
1730 int len = rq->cmd_len; 1779 int len = rq->cmd_len;
1731 unsigned char *cmd = rq->cmd; 1780 unsigned char *cmd = rq->cmd;
1732 1781
1733 if (!blk_pc_request(rq)) { 1782 if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
1734 buf[0] = '\0'; 1783 buf[0] = '\0';
1735 return; 1784 return;
1736 } 1785 }
@@ -1755,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1755 1804
1756 if (rw & WRITE) 1805 if (rw & WRITE)
1757 rwbs[i++] = 'W'; 1806 rwbs[i++] = 'W';
1758 else if (rw & 1 << BIO_RW_DISCARD) 1807 else if (rw & REQ_DISCARD)
1759 rwbs[i++] = 'D'; 1808 rwbs[i++] = 'D';
1760 else if (bytes) 1809 else if (bytes)
1761 rwbs[i++] = 'R'; 1810 rwbs[i++] = 'R';
1762 else 1811 else
1763 rwbs[i++] = 'N'; 1812 rwbs[i++] = 'N';
1764 1813
1765 if (rw & 1 << BIO_RW_AHEAD) 1814 if (rw & REQ_RAHEAD)
1766 rwbs[i++] = 'A'; 1815 rwbs[i++] = 'A';
1767 if (rw & 1 << BIO_RW_BARRIER) 1816 if (rw & REQ_HARDBARRIER)
1768 rwbs[i++] = 'B'; 1817 rwbs[i++] = 'B';
1769 if (rw & 1 << BIO_RW_SYNCIO) 1818 if (rw & REQ_SYNC)
1770 rwbs[i++] = 'S'; 1819 rwbs[i++] = 'S';
1771 if (rw & 1 << BIO_RW_META) 1820 if (rw & REQ_META)
1772 rwbs[i++] = 'M'; 1821 rwbs[i++] = 'M';
1822 if (rw & REQ_SECURE)
1823 rwbs[i++] = 'E';
1773 1824
1774 rwbs[i] = '\0'; 1825 rwbs[i] = '\0';
1775} 1826}
@@ -1779,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1779 int rw = rq->cmd_flags & 0x03; 1830 int rw = rq->cmd_flags & 0x03;
1780 int bytes; 1831 int bytes;
1781 1832
1782 if (blk_discard_rq(rq)) 1833 if (rq->cmd_flags & REQ_DISCARD)
1783 rw |= (1 << BIO_RW_DISCARD); 1834 rw |= REQ_DISCARD;
1835
1836 if (rq->cmd_flags & REQ_SECURE)
1837 rw |= REQ_SECURE;
1784 1838
1785 bytes = blk_rq_bytes(rq); 1839 bytes = blk_rq_bytes(rq);
1786 1840
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..0d88ce9b9fb8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1883 struct hlist_head *hhd; 1883 struct hlist_head *hhd;
1884 struct hlist_node *n; 1884 struct hlist_node *n;
1885 unsigned long key; 1885 unsigned long key;
1886 int resched;
1887 1886
1888 key = hash_long(ip, FTRACE_HASH_BITS); 1887 key = hash_long(ip, FTRACE_HASH_BITS);
1889 1888
@@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1897 * period. This syncs the hash iteration and freeing of items 1896 * period. This syncs the hash iteration and freeing of items
1898 * on the hash. rcu_read_lock is too dangerous here. 1897 * on the hash. rcu_read_lock is too dangerous here.
1899 */ 1898 */
1900 resched = ftrace_preempt_disable(); 1899 preempt_disable_notrace();
1901 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1900 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1902 if (entry->ip == ip) 1901 if (entry->ip == ip)
1903 entry->ops->func(ip, parent_ip, &entry->data); 1902 entry->ops->func(ip, parent_ip, &entry->data);
1904 } 1903 }
1905 ftrace_preempt_enable(resched); 1904 preempt_enable_notrace();
1906} 1905}
1907 1906
1908static struct ftrace_ops trace_probe_ops __read_mostly = 1907static struct ftrace_ops trace_probe_ops __read_mostly =
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <linux/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, minimalistic output than the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
46 struct trace_array *tr = kmemtrace_array;
47 struct kmemtrace_alloc_entry *entry;
48 struct ring_buffer_event *event;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
51 if (!event)
52 return;
53
54 entry = ring_buffer_event_data(event);
55 tracing_generic_entry_update(&entry->ent, 0, 0);
56
57 entry->ent.type = TRACE_KMEM_ALLOC;
58 entry->type_id = type_id;
59 entry->call_site = call_site;
60 entry->ptr = ptr;
61 entry->bytes_req = bytes_req;
62 entry->bytes_alloc = bytes_alloc;
63 entry->gfp_flags = gfp_flags;
64 entry->node = node;
65
66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
68
69 trace_wake_up();
70}
71
72static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
73 unsigned long call_site,
74 const void *ptr)
75{
76 struct ftrace_event_call *call = &event_kmem_free;
77 struct trace_array *tr = kmemtrace_array;
78 struct kmemtrace_free_entry *entry;
79 struct ring_buffer_event *event;
80
81 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
82 if (!event)
83 return;
84 entry = ring_buffer_event_data(event);
85 tracing_generic_entry_update(&entry->ent, 0, 0);
86
87 entry->ent.type = TRACE_KMEM_FREE;
88 entry->type_id = type_id;
89 entry->call_site = call_site;
90 entry->ptr = ptr;
91
92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
94
95 trace_wake_up();
96}
97
98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
100 const void *ptr,
101 size_t bytes_req,
102 size_t bytes_alloc,
103 gfp_t gfp_flags)
104{
105 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
106 bytes_req, bytes_alloc, gfp_flags, -1);
107}
108
109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
111 const void *ptr,
112 size_t bytes_req,
113 size_t bytes_alloc,
114 gfp_t gfp_flags)
115{
116 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
117 bytes_req, bytes_alloc, gfp_flags, -1);
118}
119
120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
122 const void *ptr,
123 size_t bytes_req,
124 size_t bytes_alloc,
125 gfp_t gfp_flags,
126 int node)
127{
128 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
129 bytes_req, bytes_alloc, gfp_flags, node);
130}
131
132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
134 const void *ptr,
135 size_t bytes_req,
136 size_t bytes_alloc,
137 gfp_t gfp_flags,
138 int node)
139{
140 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
141 bytes_req, bytes_alloc, gfp_flags, node);
142}
143
144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
146{
147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
148}
149
150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
152{
153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
154}
155
156static int kmemtrace_start_probes(void)
157{
158 int err;
159
160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
161 if (err)
162 return err;
163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
164 if (err)
165 return err;
166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
167 if (err)
168 return err;
169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
170 if (err)
171 return err;
172 err = register_trace_kfree(kmemtrace_kfree, NULL);
173 if (err)
174 return err;
175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
176
177 return err;
178}
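
/*
 * Illustration only: kmemtrace_start_probes() above bails out on the first
 * registration failure and leaves any probes that already succeeded in
 * place until kmemtrace_stop_probes() runs.  A minimal sketch of a
 * hypothetical variant that rolls back on error, assuming only the
 * register_/unregister_ tracepoint helpers already used in this file;
 * the function name is illustrative and not part of the original code.
 */
static int kmemtrace_start_probes_rollback(void)
{
	int err;

	err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
	if (err)
		return err;
	err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
	if (err)
		goto out_kmalloc;
	err = register_trace_kfree(kmemtrace_kfree, NULL);
	if (err)
		goto out_cache_alloc;
	/* ... the remaining probes would follow the same pattern ... */
	return 0;

out_cache_alloc:
	unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
out_kmalloc:
	unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
	return err;
}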
179
180static void kmemtrace_stop_probes(void)
181{
182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
186 unregister_trace_kfree(kmemtrace_kfree, NULL);
187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
188}
189
190static int kmem_trace_init(struct trace_array *tr)
191{
192 kmemtrace_array = tr;
193
194 tracing_reset_online_cpus(tr);
195
196 kmemtrace_start_probes();
197
198 return 0;
199}
200
201static void kmem_trace_reset(struct trace_array *tr)
202{
203 kmemtrace_stop_probes();
204}
205
206static void kmemtrace_headers(struct seq_file *s)
207{
208 /* Don't need headers for the original kmemtrace output */
209 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
210 return;
211
212 seq_printf(s, "#\n");
213 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
214 " POINTER NODE CALLER\n");
215 seq_printf(s, "# FREE | | | | "
216 " | | | |\n");
217 seq_printf(s, "# |\n\n");
218}
219
220/*
221 * The following functions give the original output from kmemtrace,
222 * plus the origin CPU, since reordering occurs in-kernel now.
223 */
224
225#define KMEMTRACE_USER_ALLOC 0
226#define KMEMTRACE_USER_FREE 1
227
228struct kmemtrace_user_event {
229 u8 event_id;
230 u8 type_id;
231 u16 event_size;
232 u32 cpu;
233 u64 timestamp;
234 unsigned long call_site;
235 unsigned long ptr;
236};
237
238struct kmemtrace_user_event_alloc {
239 size_t bytes_req;
240 size_t bytes_alloc;
241 unsigned gfp_flags;
242 int node;
243};
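
/*
 * Illustration only: a minimal, hypothetical user-space sketch of how the
 * packed records emitted by the ->binary callbacks below could be walked.
 * It assumes the stream is a flat sequence of struct kmemtrace_user_event
 * records, each followed by a struct kmemtrace_user_event_alloc payload
 * when event_id is KMEMTRACE_USER_ALLOC, with event_size giving the total
 * record length (as filled in by kmemtrace_print_alloc_user() and
 * kmemtrace_print_free_user() below).  The function name is illustrative.
 */
static const char *kmemtrace_walk_user_events(const char *p, const char *end)
{
	while (p + sizeof(struct kmemtrace_user_event) <= end) {
		const struct kmemtrace_user_event *ev = (const void *)p;

		if (ev->event_size < sizeof(*ev) || ev->event_size > end - p)
			break;			/* truncated or corrupt record */

		if (ev->event_id == KMEMTRACE_USER_ALLOC &&
		    ev->event_size >= sizeof(*ev) +
				      sizeof(struct kmemtrace_user_event_alloc)) {
			const struct kmemtrace_user_event_alloc *alloc =
						(const void *)(ev + 1);
			/* alloc->bytes_req, alloc->bytes_alloc, etc. */
			(void)alloc;
		}
		p += ev->event_size;		/* advance past the whole record */
	}
	return p;				/* first byte not consumed */
}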
244
245static enum print_line_t
246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
248{
249 struct trace_seq *s = &iter->seq;
250 struct kmemtrace_alloc_entry *entry;
251 int ret;
252
253 trace_assign_type(entry, iter->ent);
254
255 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
256 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
257 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
258 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
259 (unsigned long)entry->gfp_flags, entry->node);
260
261 if (!ret)
262 return TRACE_TYPE_PARTIAL_LINE;
263 return TRACE_TYPE_HANDLED;
264}
265
266static enum print_line_t
267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
269{
270 struct trace_seq *s = &iter->seq;
271 struct kmemtrace_free_entry *entry;
272 int ret;
273
274 trace_assign_type(entry, iter->ent);
275
276 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
277 entry->type_id, (void *)entry->call_site,
278 (unsigned long)entry->ptr);
279
280 if (!ret)
281 return TRACE_TYPE_PARTIAL_LINE;
282 return TRACE_TYPE_HANDLED;
283}
284
285static enum print_line_t
286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
288{
289 struct trace_seq *s = &iter->seq;
290 struct kmemtrace_alloc_entry *entry;
291 struct kmemtrace_user_event *ev;
292 struct kmemtrace_user_event_alloc *ev_alloc;
293
294 trace_assign_type(entry, iter->ent);
295
296 ev = trace_seq_reserve(s, sizeof(*ev));
297 if (!ev)
298 return TRACE_TYPE_PARTIAL_LINE;
299
300 ev->event_id = KMEMTRACE_USER_ALLOC;
301 ev->type_id = entry->type_id;
302 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
303 ev->cpu = iter->cpu;
304 ev->timestamp = iter->ts;
305 ev->call_site = entry->call_site;
306 ev->ptr = (unsigned long)entry->ptr;
307
308 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
309 if (!ev_alloc)
310 return TRACE_TYPE_PARTIAL_LINE;
311
312 ev_alloc->bytes_req = entry->bytes_req;
313 ev_alloc->bytes_alloc = entry->bytes_alloc;
314 ev_alloc->gfp_flags = entry->gfp_flags;
315 ev_alloc->node = entry->node;
316
317 return TRACE_TYPE_HANDLED;
318}
319
320static enum print_line_t
321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
323{
324 struct trace_seq *s = &iter->seq;
325 struct kmemtrace_free_entry *entry;
326 struct kmemtrace_user_event *ev;
327
328 trace_assign_type(entry, iter->ent);
329
330 ev = trace_seq_reserve(s, sizeof(*ev));
331 if (!ev)
332 return TRACE_TYPE_PARTIAL_LINE;
333
334 ev->event_id = KMEMTRACE_USER_FREE;
335 ev->type_id = entry->type_id;
336 ev->event_size = sizeof(*ev);
337 ev->cpu = iter->cpu;
338 ev->timestamp = iter->ts;
339 ev->call_site = entry->call_site;
340 ev->ptr = (unsigned long)entry->ptr;
341
342 return TRACE_TYPE_HANDLED;
343}
344
345/* The two following functions provide a more minimal output */
346static enum print_line_t
347kmemtrace_print_alloc_compress(struct trace_iterator *iter)
348{
349 struct kmemtrace_alloc_entry *entry;
350 struct trace_seq *s = &iter->seq;
351 int ret;
352
353 trace_assign_type(entry, iter->ent);
354
355 /* Alloc entry */
356 ret = trace_seq_printf(s, " + ");
357 if (!ret)
358 return TRACE_TYPE_PARTIAL_LINE;
359
360 /* Type */
361 switch (entry->type_id) {
362 case KMEMTRACE_TYPE_KMALLOC:
363 ret = trace_seq_printf(s, "K ");
364 break;
365 case KMEMTRACE_TYPE_CACHE:
366 ret = trace_seq_printf(s, "C ");
367 break;
368 case KMEMTRACE_TYPE_PAGES:
369 ret = trace_seq_printf(s, "P ");
370 break;
371 default:
372 ret = trace_seq_printf(s, "? ");
373 }
374
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 /* Requested */
379 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
380 if (!ret)
381 return TRACE_TYPE_PARTIAL_LINE;
382
383 /* Allocated */
384 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
385 if (!ret)
386 return TRACE_TYPE_PARTIAL_LINE;
387
388	/* Flags
389	 * TODO: would be better to print the names of the GFP flags
390	 */
391 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400	/* Node and call site */
401 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
402 (void *)entry->call_site);
403 if (!ret)
404 return TRACE_TYPE_PARTIAL_LINE;
405
406 return TRACE_TYPE_HANDLED;
407}
408
409static enum print_line_t
410kmemtrace_print_free_compress(struct trace_iterator *iter)
411{
412 struct kmemtrace_free_entry *entry;
413 struct trace_seq *s = &iter->seq;
414 int ret;
415
416 trace_assign_type(entry, iter->ent);
417
418 /* Free entry */
419 ret = trace_seq_printf(s, " - ");
420 if (!ret)
421 return TRACE_TYPE_PARTIAL_LINE;
422
423 /* Type */
424 switch (entry->type_id) {
425 case KMEMTRACE_TYPE_KMALLOC:
426 ret = trace_seq_printf(s, "K ");
427 break;
428 case KMEMTRACE_TYPE_CACHE:
429 ret = trace_seq_printf(s, "C ");
430 break;
431 case KMEMTRACE_TYPE_PAGES:
432 ret = trace_seq_printf(s, "P ");
433 break;
434 default:
435 ret = trace_seq_printf(s, "? ");
436 }
437
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441 /* Skip requested/allocated/flags */
442 ret = trace_seq_printf(s, " ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445
446 /* Pointer to allocated */
447 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
448 if (!ret)
449 return TRACE_TYPE_PARTIAL_LINE;
450
451	/* Skip node and print call site */
452 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
453 if (!ret)
454 return TRACE_TYPE_PARTIAL_LINE;
455
456 return TRACE_TYPE_HANDLED;
457}
458
459static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
460{
461 struct trace_entry *entry = iter->ent;
462
463 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
464 return TRACE_TYPE_UNHANDLED;
465
466 switch (entry->type) {
467 case TRACE_KMEM_ALLOC:
468 return kmemtrace_print_alloc_compress(iter);
469 case TRACE_KMEM_FREE:
470 return kmemtrace_print_free_compress(iter);
471 default:
472 return TRACE_TYPE_UNHANDLED;
473 }
474}
475
476static struct trace_event_functions kmem_trace_alloc_funcs = {
477 .trace = kmemtrace_print_alloc,
478 .binary = kmemtrace_print_alloc_user,
479};
480
481static struct trace_event kmem_trace_alloc = {
482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
487 .trace = kmemtrace_print_free,
488 .binary = kmemtrace_print_free_user,
489};
490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
496static struct tracer kmem_tracer __read_mostly = {
497 .name = "kmemtrace",
498 .init = kmem_trace_init,
499 .reset = kmem_trace_reset,
500 .print_line = kmemtrace_print_line,
501 .print_header = kmemtrace_headers,
502 .flags = &kmem_tracer_flags
503};
504
505void kmemtrace_init(void)
506{
507 /* earliest opportunity to start kmem tracing */
508}
509
510static int __init init_kmem_tracer(void)
511{
512 if (!register_ftrace_event(&kmem_trace_alloc)) {
513 pr_warning("Warning: could not register kmem events\n");
514 return 1;
515 }
516
517 if (!register_ftrace_event(&kmem_trace_free)) {
518 pr_warning("Warning: could not register kmem events\n");
519 return 1;
520 }
521
522 if (register_tracer(&kmem_tracer) != 0) {
523 pr_warning("Warning: could not register the kmem tracer\n");
524 return 1;
525 }
526
527 return 0;
528}
529device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..19cccc3c3028 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
443 */ 443 */
444struct ring_buffer_per_cpu { 444struct ring_buffer_per_cpu {
445 int cpu; 445 int cpu;
446 atomic_t record_disabled;
446 struct ring_buffer *buffer; 447 struct ring_buffer *buffer;
447 spinlock_t reader_lock; /* serialize readers */ 448 spinlock_t reader_lock; /* serialize readers */
448 arch_spinlock_t lock; 449 arch_spinlock_t lock;
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu {
462 unsigned long read; 463 unsigned long read;
463 u64 write_stamp; 464 u64 write_stamp;
464 u64 read_stamp; 465 u64 read_stamp;
465 atomic_t record_disabled;
466}; 466};
467 467
468struct ring_buffer { 468struct ring_buffer {
@@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void)
2242 2242
2243#endif 2243#endif
2244 2244
2245static DEFINE_PER_CPU(int, rb_need_resched);
2246
2247/** 2245/**
2248 * ring_buffer_lock_reserve - reserve a part of the buffer 2246 * ring_buffer_lock_reserve - reserve a part of the buffer
2249 * @buffer: the ring buffer to reserve from 2247 * @buffer: the ring buffer to reserve from
@@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2264{ 2262{
2265 struct ring_buffer_per_cpu *cpu_buffer; 2263 struct ring_buffer_per_cpu *cpu_buffer;
2266 struct ring_buffer_event *event; 2264 struct ring_buffer_event *event;
2267 int cpu, resched; 2265 int cpu;
2268 2266
2269 if (ring_buffer_flags != RB_BUFFERS_ON) 2267 if (ring_buffer_flags != RB_BUFFERS_ON)
2270 return NULL; 2268 return NULL;
2271 2269
2272 /* If we are tracing schedule, we don't want to recurse */ 2270 /* If we are tracing schedule, we don't want to recurse */
2273 resched = ftrace_preempt_disable(); 2271 preempt_disable_notrace();
2274 2272
2275 if (atomic_read(&buffer->record_disabled)) 2273 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck; 2274 goto out_nocheck;
@@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2295 if (!event) 2293 if (!event)
2296 goto out; 2294 goto out;
2297 2295
2298 /*
2299 * Need to store resched state on this cpu.
2300 * Only the first needs to.
2301 */
2302
2303 if (preempt_count() == 1)
2304 per_cpu(rb_need_resched, cpu) = resched;
2305
2306 return event; 2296 return event;
2307 2297
2308 out: 2298 out:
2309 trace_recursive_unlock(); 2299 trace_recursive_unlock();
2310 2300
2311 out_nocheck: 2301 out_nocheck:
2312 ftrace_preempt_enable(resched); 2302 preempt_enable_notrace();
2313 return NULL; 2303 return NULL;
2314} 2304}
2315EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2305EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2355 2345
2356 trace_recursive_unlock(); 2346 trace_recursive_unlock();
2357 2347
2358 /* 2348 preempt_enable_notrace();
2359 * Only the last preempt count needs to restore preemption.
2360 */
2361 if (preempt_count() == 1)
2362 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2363 else
2364 preempt_enable_no_resched_notrace();
2365 2349
2366 return 0; 2350 return 0;
2367} 2351}
@@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2469 2453
2470 trace_recursive_unlock(); 2454 trace_recursive_unlock();
2471 2455
2472 /* 2456 preempt_enable_notrace();
2473 * Only the last preempt count needs to restore preemption.
2474 */
2475 if (preempt_count() == 1)
2476 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2477 else
2478 preempt_enable_no_resched_notrace();
2479 2457
2480} 2458}
2481EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2459EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2501 struct ring_buffer_event *event; 2479 struct ring_buffer_event *event;
2502 void *body; 2480 void *body;
2503 int ret = -EBUSY; 2481 int ret = -EBUSY;
2504 int cpu, resched; 2482 int cpu;
2505 2483
2506 if (ring_buffer_flags != RB_BUFFERS_ON) 2484 if (ring_buffer_flags != RB_BUFFERS_ON)
2507 return -EBUSY; 2485 return -EBUSY;
2508 2486
2509 resched = ftrace_preempt_disable(); 2487 preempt_disable_notrace();
2510 2488
2511 if (atomic_read(&buffer->record_disabled)) 2489 if (atomic_read(&buffer->record_disabled))
2512 goto out; 2490 goto out;
@@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2536 2514
2537 ret = 0; 2515 ret = 0;
2538 out: 2516 out:
2539 ftrace_preempt_enable(resched); 2517 preempt_enable_notrace();
2540 2518
2541 return ret; 2519 return ret;
2542} 2520}
@@ -3868,6 +3846,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3868 rpos = reader->read; 3846 rpos = reader->read;
3869 pos += size; 3847 pos += size;
3870 3848
3849 if (rpos >= commit)
3850 break;
3851
3871 event = rb_reader_event(cpu_buffer); 3852 event = rb_reader_event(cpu_buffer);
3872 size = rb_event_length(event); 3853 size = rb_event_length(event);
3873 } while (len > size); 3854 } while (len > size);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..9ec59f541156 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
104static cpumask_var_t __read_mostly tracing_buffer_mask; 104cpumask_var_t __read_mostly tracing_buffer_mask;
105
106#define for_each_tracing_cpu(cpu) \
107 for_each_cpu(cpu, tracing_buffer_mask)
108 105
109/* 106/*
110 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 107 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 341/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME; 344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
348 345
349static int trace_stop_count; 346static int trace_stop_count;
350static DEFINE_SPINLOCK(tracing_start_lock); 347static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +425,7 @@ static const char *trace_options[] = {
428 "latency-format", 425 "latency-format",
429 "sleep-time", 426 "sleep-time",
430 "graph-time", 427 "graph-time",
428 "record-cmd",
431 NULL 429 NULL
432}; 430};
433 431
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
659 return; 657 return;
660 658
661 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
660 if (!current_trace->use_max_tr) {
661 WARN_ON_ONCE(1);
662 return;
663 }
662 arch_spin_lock(&ftrace_max_lock); 664 arch_spin_lock(&ftrace_max_lock);
663 665
664 tr->buffer = max_tr.buffer; 666 tr->buffer = max_tr.buffer;
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
685 return; 687 return;
686 688
687 WARN_ON_ONCE(!irqs_disabled()); 689 WARN_ON_ONCE(!irqs_disabled());
690 if (!current_trace->use_max_tr) {
691 WARN_ON_ONCE(1);
692 return;
693 }
694
688 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
689 696
690 ftrace_disable_cpu(); 697 ftrace_disable_cpu();
@@ -729,18 +736,11 @@ __acquires(kernel_lock)
729 return -1; 736 return -1;
730 } 737 }
731 738
732 if (strlen(type->name) > MAX_TRACER_SIZE) { 739 if (strlen(type->name) >= MAX_TRACER_SIZE) {
733 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); 740 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
734 return -1; 741 return -1;
735 } 742 }
736 743
737 /*
738 * When this gets called we hold the BKL which means that
739 * preemption is disabled. Various trace selftests however
740 * need to disable and enable preemption for successful tests.
741 * So we drop the BKL here and grab it after the tests again.
742 */
743 unlock_kernel();
744 mutex_lock(&trace_types_lock); 744 mutex_lock(&trace_types_lock);
745 745
746 tracing_selftest_running = true; 746 tracing_selftest_running = true;
@@ -822,7 +822,6 @@ __acquires(kernel_lock)
822#endif 822#endif
823 823
824 out_unlock: 824 out_unlock:
825 lock_kernel();
826 return ret; 825 return ret;
827} 826}
828 827
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1331 1330
1332#endif /* CONFIG_STACKTRACE */ 1331#endif /* CONFIG_STACKTRACE */
1333 1332
1334static void
1335ftrace_trace_special(void *__tr,
1336 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1337 int pc)
1338{
1339 struct ftrace_event_call *call = &event_special;
1340 struct ring_buffer_event *event;
1341 struct trace_array *tr = __tr;
1342 struct ring_buffer *buffer = tr->buffer;
1343 struct special_entry *entry;
1344
1345 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1346 sizeof(*entry), 0, pc);
1347 if (!event)
1348 return;
1349 entry = ring_buffer_event_data(event);
1350 entry->arg1 = arg1;
1351 entry->arg2 = arg2;
1352 entry->arg3 = arg3;
1353
1354 if (!filter_check_discard(call, entry, buffer, event))
1355 trace_buffer_unlock_commit(buffer, event, 0, pc);
1356}
1357
1358void
1359__trace_special(void *__tr, void *__data,
1360 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1361{
1362 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1363}
1364
1365void
1366ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1367{
1368 struct trace_array *tr = &global_trace;
1369 struct trace_array_cpu *data;
1370 unsigned long flags;
1371 int cpu;
1372 int pc;
1373
1374 if (tracing_disabled)
1375 return;
1376
1377 pc = preempt_count();
1378 local_irq_save(flags);
1379 cpu = raw_smp_processor_id();
1380 data = tr->data[cpu];
1381
1382 if (likely(atomic_inc_return(&data->disabled) == 1))
1383 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1384
1385 atomic_dec(&data->disabled);
1386 local_irq_restore(flags);
1387}
1388
1389/** 1333/**
1390 * trace_vbprintk - write binary msg to tracing buffer 1334 * trace_vbprintk - write binary msg to tracing buffer
1391 * 1335 *
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1404 struct bprint_entry *entry; 1348 struct bprint_entry *entry;
1405 unsigned long flags; 1349 unsigned long flags;
1406 int disable; 1350 int disable;
1407 int resched;
1408 int cpu, len = 0, size, pc; 1351 int cpu, len = 0, size, pc;
1409 1352
1410 if (unlikely(tracing_selftest_running || tracing_disabled)) 1353 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1414 pause_graph_tracing(); 1357 pause_graph_tracing();
1415 1358
1416 pc = preempt_count(); 1359 pc = preempt_count();
1417 resched = ftrace_preempt_disable(); 1360 preempt_disable_notrace();
1418 cpu = raw_smp_processor_id(); 1361 cpu = raw_smp_processor_id();
1419 data = tr->data[cpu]; 1362 data = tr->data[cpu];
1420 1363
@@ -1452,7 +1395,7 @@ out_unlock:
1452 1395
1453out: 1396out:
1454 atomic_dec_return(&data->disabled); 1397 atomic_dec_return(&data->disabled);
1455 ftrace_preempt_enable(resched); 1398 preempt_enable_notrace();
1456 unpause_graph_tracing(); 1399 unpause_graph_tracing();
1457 1400
1458 return len; 1401 return len;
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1539} 1482}
1540EXPORT_SYMBOL_GPL(trace_vprintk); 1483EXPORT_SYMBOL_GPL(trace_vprintk);
1541 1484
1542enum trace_file_type {
1543 TRACE_FILE_LAT_FMT = 1,
1544 TRACE_FILE_ANNOTATE = 2,
1545};
1546
1547static void trace_iterator_increment(struct trace_iterator *iter) 1485static void trace_iterator_increment(struct trace_iterator *iter)
1548{ 1486{
1549 /* Don't allow ftrace to trace into the ring buffers */ 1487 /* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1641} 1579}
1642 1580
1643/* Find the next real entry, and increment the iterator to the next entry */ 1581/* Find the next real entry, and increment the iterator to the next entry */
1644static void *find_next_entry_inc(struct trace_iterator *iter) 1582void *trace_find_next_entry_inc(struct trace_iterator *iter)
1645{ 1583{
1646 iter->ent = __find_next_entry(iter, &iter->cpu, 1584 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts); 1585 &iter->lost_events, &iter->ts);
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1676 return NULL; 1614 return NULL;
1677 1615
1678 if (iter->idx < 0) 1616 if (iter->idx < 0)
1679 ent = find_next_entry_inc(iter); 1617 ent = trace_find_next_entry_inc(iter);
1680 else 1618 else
1681 ent = iter; 1619 ent = iter;
1682 1620
1683 while (ent && iter->idx < i) 1621 while (ent && iter->idx < i)
1684 ent = find_next_entry_inc(iter); 1622 ent = trace_find_next_entry_inc(iter);
1685 1623
1686 iter->pos = *pos; 1624 iter->pos = *pos;
1687 1625
1688 return ent; 1626 return ent;
1689} 1627}
1690 1628
1691static void tracing_iter_reset(struct trace_iterator *iter, int cpu) 1629void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1692{ 1630{
1693 struct trace_array *tr = iter->tr; 1631 struct trace_array *tr = iter->tr;
1694 struct ring_buffer_event *event; 1632 struct ring_buffer_event *event;
@@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter)
2049} 1987}
2050 1988
2051/* Called with trace_event_read_lock() held. */ 1989/* Called with trace_event_read_lock() held. */
2052static enum print_line_t print_trace_line(struct trace_iterator *iter) 1990enum print_line_t print_trace_line(struct trace_iterator *iter)
2053{ 1991{
2054 enum print_line_t ret; 1992 enum print_line_t ret;
2055 1993
@@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = {
2394 .open = show_traces_open, 2332 .open = show_traces_open,
2395 .read = seq_read, 2333 .read = seq_read,
2396 .release = seq_release, 2334 .release = seq_release,
2335 .llseek = seq_lseek,
2397}; 2336};
2398 2337
2399/* 2338/*
@@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = {
2487 .open = tracing_open_generic, 2426 .open = tracing_open_generic,
2488 .read = tracing_cpumask_read, 2427 .read = tracing_cpumask_read,
2489 .write = tracing_cpumask_write, 2428 .write = tracing_cpumask_write,
2429 .llseek = generic_file_llseek,
2490}; 2430};
2491 2431
2492static int tracing_trace_options_show(struct seq_file *m, void *v) 2432static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2562 trace_flags |= mask; 2502 trace_flags |= mask;
2563 else 2503 else
2564 trace_flags &= ~mask; 2504 trace_flags &= ~mask;
2505
2506 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled);
2565} 2508}
2566 2509
2567static ssize_t 2510static ssize_t
@@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2653static const struct file_operations tracing_readme_fops = { 2596static const struct file_operations tracing_readme_fops = {
2654 .open = tracing_open_generic, 2597 .open = tracing_open_generic,
2655 .read = tracing_readme_read, 2598 .read = tracing_readme_read,
2599 .llseek = generic_file_llseek,
2656}; 2600};
2657 2601
2658static ssize_t 2602static ssize_t
@@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2703static const struct file_operations tracing_saved_cmdlines_fops = { 2647static const struct file_operations tracing_saved_cmdlines_fops = {
2704 .open = tracing_open_generic, 2648 .open = tracing_open_generic,
2705 .read = tracing_saved_cmdlines_read, 2649 .read = tracing_saved_cmdlines_read,
2650 .llseek = generic_file_llseek,
2706}; 2651};
2707 2652
2708static ssize_t 2653static ssize_t
@@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
2798 if (ret < 0) 2743 if (ret < 0)
2799 return ret; 2744 return ret;
2800 2745
2746 if (!current_trace->use_max_tr)
2747 goto out;
2748
2801 ret = ring_buffer_resize(max_tr.buffer, size); 2749 ret = ring_buffer_resize(max_tr.buffer, size);
2802 if (ret < 0) { 2750 if (ret < 0) {
2803 int r; 2751 int r;
@@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
2825 return ret; 2773 return ret;
2826 } 2774 }
2827 2775
2776 max_tr.entries = size;
2777 out:
2828 global_trace.entries = size; 2778 global_trace.entries = size;
2829 2779
2830 return ret; 2780 return ret;
2831} 2781}
2832 2782
2783
2833/** 2784/**
2834 * tracing_update_buffers - used by tracing facility to expand ring buffers 2785 * tracing_update_buffers - used by tracing facility to expand ring buffers
2835 * 2786 *
@@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf)
2890 trace_branch_disable(); 2841 trace_branch_disable();
2891 if (current_trace && current_trace->reset) 2842 if (current_trace && current_trace->reset)
2892 current_trace->reset(tr); 2843 current_trace->reset(tr);
2893 2844 if (current_trace && current_trace->use_max_tr) {
2845 /*
2846 * We don't free the ring buffer. Instead, resize it because
2847 * the max_tr ring buffer has some state (e.g. ring->clock) and
2848 * we want to preserve it.
2849 */
2850 ring_buffer_resize(max_tr.buffer, 1);
2851 max_tr.entries = 1;
2852 }
2894 destroy_trace_option_files(topts); 2853 destroy_trace_option_files(topts);
2895 2854
2896 current_trace = t; 2855 current_trace = t;
2897 2856
2898 topts = create_trace_option_files(current_trace); 2857 topts = create_trace_option_files(current_trace);
2858 if (current_trace->use_max_tr) {
2859 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
2860 if (ret < 0)
2861 goto out;
2862 max_tr.entries = global_trace.entries;
2863 }
2899 2864
2900 if (t->init) { 2865 if (t->init) {
2901 ret = tracer_init(t, tr); 2866 ret = tracer_init(t, tr);
@@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3032 if (iter->trace->pipe_open) 2997 if (iter->trace->pipe_open)
3033 iter->trace->pipe_open(iter); 2998 iter->trace->pipe_open(iter);
3034 2999
3000 nonseekable_open(inode, filp);
3035out: 3001out:
3036 mutex_unlock(&trace_types_lock); 3002 mutex_unlock(&trace_types_lock);
3037 return ret; 3003 return ret;
@@ -3211,7 +3177,7 @@ waitagain:
3211 3177
3212 trace_event_read_lock(); 3178 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file); 3179 trace_access_lock(iter->cpu_file);
3214 while (find_next_entry_inc(iter) != NULL) { 3180 while (trace_find_next_entry_inc(iter) != NULL) {
3215 enum print_line_t ret; 3181 enum print_line_t ret;
3216 int len = iter->seq.len; 3182 int len = iter->seq.len;
3217 3183
@@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3294 if (ret != TRACE_TYPE_NO_CONSUME) 3260 if (ret != TRACE_TYPE_NO_CONSUME)
3295 trace_consume(iter); 3261 trace_consume(iter);
3296 rem -= count; 3262 rem -= count;
3297 if (!find_next_entry_inc(iter)) { 3263 if (!trace_find_next_entry_inc(iter)) {
3298 rem = 0; 3264 rem = 0;
3299 iter->ent = NULL; 3265 iter->ent = NULL;
3300 break; 3266 break;
@@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3350 if (ret <= 0) 3316 if (ret <= 0)
3351 goto out_err; 3317 goto out_err;
3352 3318
3353 if (!iter->ent && !find_next_entry_inc(iter)) { 3319 if (!iter->ent && !trace_find_next_entry_inc(iter)) {
3354 ret = -EFAULT; 3320 ret = -EFAULT;
3355 goto out_err; 3321 goto out_err;
3356 } 3322 }
@@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3477 } 3443 }
3478 3444
3479 tracing_start(); 3445 tracing_start();
3480 max_tr.entries = global_trace.entries;
3481 mutex_unlock(&trace_types_lock); 3446 mutex_unlock(&trace_types_lock);
3482 3447
3483 return cnt; 3448 return cnt;
@@ -3498,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3498 size_t cnt, loff_t *fpos) 3463 size_t cnt, loff_t *fpos)
3499{ 3464{
3500 char *buf; 3465 char *buf;
3466 size_t written;
3501 3467
3502 if (tracing_disabled) 3468 if (tracing_disabled)
3503 return -EINVAL; 3469 return -EINVAL;
@@ -3519,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3519 } else 3485 } else
3520 buf[cnt] = '\0'; 3486 buf[cnt] = '\0';
3521 3487
3522 cnt = mark_printk("%s", buf); 3488 written = mark_printk("%s", buf);
3523 kfree(buf); 3489 kfree(buf);
3524 *fpos += cnt; 3490 *fpos += written;
3525 3491
3526 return cnt; 3492 /* don't tell userspace we wrote more - it might confuse them */
3493 if (written > cnt)
3494 written = cnt;
3495
3496 return written;
3527} 3497}
3528 3498
3529static int tracing_clock_show(struct seq_file *m, void *v) 3499static int tracing_clock_show(struct seq_file *m, void *v)
@@ -3590,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = {
3590 .open = tracing_open_generic, 3560 .open = tracing_open_generic,
3591 .read = tracing_max_lat_read, 3561 .read = tracing_max_lat_read,
3592 .write = tracing_max_lat_write, 3562 .write = tracing_max_lat_write,
3563 .llseek = generic_file_llseek,
3593}; 3564};
3594 3565
3595static const struct file_operations tracing_ctrl_fops = { 3566static const struct file_operations tracing_ctrl_fops = {
3596 .open = tracing_open_generic, 3567 .open = tracing_open_generic,
3597 .read = tracing_ctrl_read, 3568 .read = tracing_ctrl_read,
3598 .write = tracing_ctrl_write, 3569 .write = tracing_ctrl_write,
3570 .llseek = generic_file_llseek,
3599}; 3571};
3600 3572
3601static const struct file_operations set_tracer_fops = { 3573static const struct file_operations set_tracer_fops = {
3602 .open = tracing_open_generic, 3574 .open = tracing_open_generic,
3603 .read = tracing_set_trace_read, 3575 .read = tracing_set_trace_read,
3604 .write = tracing_set_trace_write, 3576 .write = tracing_set_trace_write,
3577 .llseek = generic_file_llseek,
3605}; 3578};
3606 3579
3607static const struct file_operations tracing_pipe_fops = { 3580static const struct file_operations tracing_pipe_fops = {
@@ -3610,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = {
3610 .read = tracing_read_pipe, 3583 .read = tracing_read_pipe,
3611 .splice_read = tracing_splice_read_pipe, 3584 .splice_read = tracing_splice_read_pipe,
3612 .release = tracing_release_pipe, 3585 .release = tracing_release_pipe,
3586 .llseek = no_llseek,
3613}; 3587};
3614 3588
3615static const struct file_operations tracing_entries_fops = { 3589static const struct file_operations tracing_entries_fops = {
3616 .open = tracing_open_generic, 3590 .open = tracing_open_generic,
3617 .read = tracing_entries_read, 3591 .read = tracing_entries_read,
3618 .write = tracing_entries_write, 3592 .write = tracing_entries_write,
3593 .llseek = generic_file_llseek,
3619}; 3594};
3620 3595
3621static const struct file_operations tracing_mark_fops = { 3596static const struct file_operations tracing_mark_fops = {
3622 .open = tracing_open_generic, 3597 .open = tracing_open_generic,
3623 .write = tracing_mark_write, 3598 .write = tracing_mark_write,
3599 .llseek = generic_file_llseek,
3624}; 3600};
3625 3601
3626static const struct file_operations trace_clock_fops = { 3602static const struct file_operations trace_clock_fops = {
@@ -3926,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3926static const struct file_operations tracing_stats_fops = { 3902static const struct file_operations tracing_stats_fops = {
3927 .open = tracing_open_generic, 3903 .open = tracing_open_generic,
3928 .read = tracing_stats_read, 3904 .read = tracing_stats_read,
3905 .llseek = generic_file_llseek,
3929}; 3906};
3930 3907
3931#ifdef CONFIG_DYNAMIC_FTRACE 3908#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3962,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3962static const struct file_operations tracing_dyn_info_fops = { 3939static const struct file_operations tracing_dyn_info_fops = {
3963 .open = tracing_open_generic, 3940 .open = tracing_open_generic,
3964 .read = tracing_read_dyn_info, 3941 .read = tracing_read_dyn_info,
3942 .llseek = generic_file_llseek,
3965}; 3943};
3966#endif 3944#endif
3967 3945
@@ -4115,6 +4093,7 @@ static const struct file_operations trace_options_fops = {
4115 .open = tracing_open_generic, 4093 .open = tracing_open_generic,
4116 .read = trace_options_read, 4094 .read = trace_options_read,
4117 .write = trace_options_write, 4095 .write = trace_options_write,
4096 .llseek = generic_file_llseek,
4118}; 4097};
4119 4098
4120static ssize_t 4099static ssize_t
@@ -4166,6 +4145,7 @@ static const struct file_operations trace_options_core_fops = {
4166 .open = tracing_open_generic, 4145 .open = tracing_open_generic,
4167 .read = trace_options_core_read, 4146 .read = trace_options_core_read,
4168 .write = trace_options_core_write, 4147 .write = trace_options_core_write,
4148 .llseek = generic_file_llseek,
4169}; 4149};
4170 4150
4171struct dentry *trace_create_file(const char *name, 4151struct dentry *trace_create_file(const char *name,
@@ -4355,9 +4335,6 @@ static __init int tracer_init_debugfs(void)
4355 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4335 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4356 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4336 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4357#endif 4337#endif
4358#ifdef CONFIG_SYSPROF_TRACER
4359 init_tracer_sysprof_debugfs(d_tracer);
4360#endif
4361 4338
4362 create_trace_options_dir(); 4339 create_trace_options_dir();
4363 4340
@@ -4414,7 +4391,7 @@ static struct notifier_block trace_die_notifier = {
4414 */ 4391 */
4415#define KERN_TRACE KERN_EMERG 4392#define KERN_TRACE KERN_EMERG
4416 4393
4417static void 4394void
4418trace_printk_seq(struct trace_seq *s) 4395trace_printk_seq(struct trace_seq *s)
4419{ 4396{
4420 /* Probably should print a warning here. */ 4397 /* Probably should print a warning here. */
@@ -4429,6 +4406,13 @@ trace_printk_seq(struct trace_seq *s)
4429 trace_seq_init(s); 4406 trace_seq_init(s);
4430} 4407}
4431 4408
4409void trace_init_global_iter(struct trace_iterator *iter)
4410{
4411 iter->tr = &global_trace;
4412 iter->trace = current_trace;
4413 iter->cpu_file = TRACE_PIPE_ALL_CPU;
4414}
4415
4432static void 4416static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) 4417__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4434{ 4418{
@@ -4454,8 +4438,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4454 if (disable_tracing) 4438 if (disable_tracing)
4455 ftrace_kill(); 4439 ftrace_kill();
4456 4440
4441 trace_init_global_iter(&iter);
4442
4457 for_each_tracing_cpu(cpu) { 4443 for_each_tracing_cpu(cpu) {
4458 atomic_inc(&global_trace.data[cpu]->disabled); 4444 atomic_inc(&iter.tr->data[cpu]->disabled);
4459 } 4445 }
4460 4446
4461 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 4447 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4504,7 +4490,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4504 iter.iter_flags |= TRACE_FILE_LAT_FMT; 4490 iter.iter_flags |= TRACE_FILE_LAT_FMT;
4505 iter.pos = -1; 4491 iter.pos = -1;
4506 4492
4507 if (find_next_entry_inc(&iter) != NULL) { 4493 if (trace_find_next_entry_inc(&iter) != NULL) {
4508 int ret; 4494 int ret;
4509 4495
4510 ret = print_trace_line(&iter); 4496 ret = print_trace_line(&iter);
@@ -4526,7 +4512,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4526 trace_flags |= old_userobj; 4512 trace_flags |= old_userobj;
4527 4513
4528 for_each_tracing_cpu(cpu) { 4514 for_each_tracing_cpu(cpu) {
4529 atomic_dec(&global_trace.data[cpu]->disabled); 4515 atomic_dec(&iter.tr->data[cpu]->disabled);
4530 } 4516 }
4531 tracing_on(); 4517 tracing_on();
4532 } 4518 }
@@ -4575,16 +4561,14 @@ __init static int tracer_alloc_buffers(void)
4575 4561
4576 4562
4577#ifdef CONFIG_TRACER_MAX_TRACE 4563#ifdef CONFIG_TRACER_MAX_TRACE
4578 max_tr.buffer = ring_buffer_alloc(ring_buf_size, 4564 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
4579 TRACE_BUFFER_FLAGS);
4580 if (!max_tr.buffer) { 4565 if (!max_tr.buffer) {
4581 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4566 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4582 WARN_ON(1); 4567 WARN_ON(1);
4583 ring_buffer_free(global_trace.buffer); 4568 ring_buffer_free(global_trace.buffer);
4584 goto out_free_cpumask; 4569 goto out_free_cpumask;
4585 } 4570 }
4586 max_tr.entries = ring_buffer_size(max_tr.buffer); 4571 max_tr.entries = 1;
4587 WARN_ON(max_tr.entries != global_trace.entries);
4588#endif 4572#endif
4589 4573
4590 /* Allocate the first page for all buffers */ 4574 /* Allocate the first page for all buffers */
@@ -4597,9 +4581,6 @@ __init static int tracer_alloc_buffers(void)
4597 4581
4598 register_tracer(&nop_trace); 4582 register_tracer(&nop_trace);
4599 current_trace = &nop_trace; 4583 current_trace = &nop_trace;
4600#ifdef CONFIG_BOOT_TRACER
4601 register_tracer(&boot_tracer);
4602#endif
4603 /* All seems OK, enable tracing */ 4584 /* All seems OK, enable tracing */
4604 tracing_disabled = 0; 4585 tracing_disabled = 0;
4605 4586
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..d39b3c5454a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h>
13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h> 12#include <linux/hw_breakpoint.h>
15
16#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
18 15
@@ -25,30 +22,17 @@ enum trace_type {
25 TRACE_STACK, 22 TRACE_STACK,
26 TRACE_PRINT, 23 TRACE_PRINT,
27 TRACE_BPRINT, 24 TRACE_BPRINT,
28 TRACE_SPECIAL,
29 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
30 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
31 TRACE_BRANCH, 27 TRACE_BRANCH,
32 TRACE_BOOT_CALL,
33 TRACE_BOOT_RET,
34 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 30 TRACE_USER_STACK,
37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE,
39 TRACE_BLK, 31 TRACE_BLK,
40 TRACE_KSYM,
41 32
42 __TRACE_LAST_TYPE, 33 __TRACE_LAST_TYPE,
43}; 34};
44 35
45enum kmemtrace_type_id {
46 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49};
50
51extern struct tracer boot_tracer;
52 36
53#undef __field 37#undef __field
54#define __field(type, item) type item; 38#define __field(type, item) type item;
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void);
204 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 188 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
205 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 189 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
206 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 190 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
207 IF_ASSIGN(var, ent, struct special_entry, 0); \
208 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 191 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
209 TRACE_MMIO_RW); \ 192 TRACE_MMIO_RW); \
210 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 193 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
211 TRACE_MMIO_MAP); \ 194 TRACE_MMIO_MAP); \
212 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
213 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
214 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ 195 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
215 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ 196 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
216 TRACE_GRAPH_ENT); \ 197 TRACE_GRAPH_ENT); \
217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 198 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
218 TRACE_GRAPH_RET); \ 199 TRACE_GRAPH_RET); \
219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
220 TRACE_KMEM_ALLOC); \
221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
222 TRACE_KMEM_FREE); \
223 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
224 __ftrace_bad_type(); \ 200 __ftrace_bad_type(); \
225 } while (0) 201 } while (0)
226 202
@@ -298,6 +274,7 @@ struct tracer {
298 struct tracer *next; 274 struct tracer *next;
299 int print_max; 275 int print_max;
300 struct tracer_flags *flags; 276 struct tracer_flags *flags;
277 int use_max_tr;
301}; 278};
302 279
303 280
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
318 const struct file_operations *fops); 295 const struct file_operations *fops);
319 296
320struct dentry *tracing_init_dentry(void); 297struct dentry *tracing_init_dentry(void);
321void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
322 298
323struct ring_buffer_event; 299struct ring_buffer_event;
324 300
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
338struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 314struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
339 int *ent_cpu, u64 *ent_ts); 315 int *ent_cpu, u64 *ent_ts);
340 316
317int trace_empty(struct trace_iterator *iter);
318
319void *trace_find_next_entry_inc(struct trace_iterator *iter);
320
321void trace_init_global_iter(struct trace_iterator *iter);
322
323void tracing_iter_reset(struct trace_iterator *iter, int cpu);
324
341void default_wait_pipe(struct trace_iterator *iter); 325void default_wait_pipe(struct trace_iterator *iter);
342void poll_wait_pipe(struct trace_iterator *iter); 326void poll_wait_pipe(struct trace_iterator *iter);
343 327
@@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
355 struct task_struct *wakee, 339 struct task_struct *wakee,
356 struct task_struct *cur, 340 struct task_struct *cur,
357 unsigned long flags, int pc); 341 unsigned long flags, int pc);
358void trace_special(struct trace_array *tr,
359 struct trace_array_cpu *data,
360 unsigned long arg1,
361 unsigned long arg2,
362 unsigned long arg3, int pc);
363void trace_function(struct trace_array *tr, 342void trace_function(struct trace_array *tr,
364 unsigned long ip, 343 unsigned long ip,
365 unsigned long parent_ip, 344 unsigned long parent_ip,
@@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void);
380int register_tracer(struct tracer *type); 359int register_tracer(struct tracer *type);
381void unregister_tracer(struct tracer *type); 360void unregister_tracer(struct tracer *type);
382int is_tracing_stopped(void); 361int is_tracing_stopped(void);
362enum trace_file_type {
363 TRACE_FILE_LAT_FMT = 1,
364 TRACE_FILE_ANNOTATE = 2,
365};
366
367extern cpumask_var_t __read_mostly tracing_buffer_mask;
383 368
384extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); 369#define for_each_tracing_cpu(cpu) \
370 for_each_cpu(cpu, tracing_buffer_mask)
385 371
386extern unsigned long nsecs_to_usecs(unsigned long nsecs); 372extern unsigned long nsecs_to_usecs(unsigned long nsecs);
387 373
@@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
452 struct trace_array *tr); 438 struct trace_array *tr);
453extern int trace_selftest_startup_sched_switch(struct tracer *trace, 439extern int trace_selftest_startup_sched_switch(struct tracer *trace,
454 struct trace_array *tr); 440 struct trace_array *tr);
455extern int trace_selftest_startup_sysprof(struct tracer *trace,
456 struct trace_array *tr);
457extern int trace_selftest_startup_branch(struct tracer *trace, 441extern int trace_selftest_startup_branch(struct tracer *trace,
458 struct trace_array *tr); 442 struct trace_array *tr);
459extern int trace_selftest_startup_ksym(struct tracer *trace,
460 struct trace_array *tr);
461#endif /* CONFIG_FTRACE_STARTUP_TEST */ 443#endif /* CONFIG_FTRACE_STARTUP_TEST */
462 444
463extern void *head_page(struct trace_array_cpu *data); 445extern void *head_page(struct trace_array_cpu *data);
@@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr,
471 unsigned long ip, const char *fmt, va_list args); 453 unsigned long ip, const char *fmt, va_list args);
472int trace_array_printk(struct trace_array *tr, 454int trace_array_printk(struct trace_array *tr,
473 unsigned long ip, const char *fmt, ...); 455 unsigned long ip, const char *fmt, ...);
456void trace_printk_seq(struct trace_seq *s);
457enum print_line_t print_trace_line(struct trace_iterator *iter);
474 458
475extern unsigned long trace_flags; 459extern unsigned long trace_flags;
476 460
@@ -617,6 +601,7 @@ enum trace_iterator_flags {
617 TRACE_ITER_LATENCY_FMT = 0x20000, 601 TRACE_ITER_LATENCY_FMT = 0x20000,
618 TRACE_ITER_SLEEP_TIME = 0x40000, 602 TRACE_ITER_SLEEP_TIME = 0x40000,
619 TRACE_ITER_GRAPH_TIME = 0x80000, 603 TRACE_ITER_GRAPH_TIME = 0x80000,
604 TRACE_ITER_RECORD_CMD = 0x100000,
620}; 605};
621 606
622/* 607/*
@@ -628,54 +613,6 @@ enum trace_iterator_flags {
628 613
629extern struct tracer nop_trace; 614extern struct tracer nop_trace;
630 615
631/**
632 * ftrace_preempt_disable - disable preemption scheduler safe
633 *
634 * When tracing can happen inside the scheduler, there exist
635 * cases where the tracing might happen before the need_resched
636 * flag is checked. If this happens and the tracer calls
637 * preempt_enable (after a disable), a schedule might take place
638 * causing an infinite recursion.
639 *
640 * To prevent this, we read the need_resched flag before
641 * disabling preemption. When we want to enable preemption we
642 * check the flag; if it is set, we call preempt_enable_no_resched.
643 * Otherwise, we call preempt_enable.
644 *
645 * The rationale for doing the above is that if need_resched is set
646 * and we have yet to reschedule, we are either in an atomic location
647 * (where we do not need to check for scheduling) or we are inside
648 * the scheduler and do not want to resched.
649 */
650static inline int ftrace_preempt_disable(void)
651{
652 int resched;
653
654 resched = need_resched();
655 preempt_disable_notrace();
656
657 return resched;
658}
659
660/**
661 * ftrace_preempt_enable - enable preemption scheduler safe
662 * @resched: the return value from ftrace_preempt_disable
663 *
664 * This is a scheduler safe way to enable preemption and not miss
665 * any preemption checks. The earlier disable saved the state of preemption.
666 * If resched is set, then we are either inside an atomic or
667 * are inside the scheduler (we would have already scheduled
668 * otherwise). In this case, we do not want to call normal
669 * preempt_enable, but preempt_enable_no_resched instead.
670 */
671static inline void ftrace_preempt_enable(int resched)
672{
673 if (resched)
674 preempt_enable_no_resched_notrace();
675 else
676 preempt_enable_notrace();
677}
678
679#ifdef CONFIG_BRANCH_TRACER 616#ifdef CONFIG_BRANCH_TRACER
680extern int enable_branch_tracing(struct trace_array *tr); 617extern int enable_branch_tracing(struct trace_array *tr);
681extern void disable_branch_tracing(void); 618extern void disable_branch_tracing(void);
@@ -766,6 +703,8 @@ struct filter_pred {
766 int pop_n; 703 int pop_n;
767}; 704};
768 705
706extern struct list_head ftrace_common_fields;
707
769extern enum regex_type 708extern enum regex_type
770filter_parse_regex(char *buff, int len, char **search, int *not); 709filter_parse_regex(char *buff, int len, char **search, int *not);
771extern void print_event_filter(struct ftrace_event_call *call, 710extern void print_event_filter(struct ftrace_event_call *call,
@@ -795,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
795 return 0; 734 return 0;
796} 735}
797 736
737extern void trace_event_enable_cmd_record(bool enable);
738
798extern struct mutex event_mutex; 739extern struct mutex event_mutex;
799extern struct list_head ftrace_events; 740extern struct list_head ftrace_events;
800 741
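The hunks above, together with those in ring_buffer.c and trace_clock.c, replace the ftrace_preempt_disable()/ftrace_preempt_enable() pair, whose saved need_resched state had to be threaded through every caller (and stored per-cpu as rb_need_resched in ring_buffer.c), with the plain notrace preemption helpers. A minimal sketch of the shape of a write path before and after this change; the function names are illustrative only:

/* before: the saved need_resched state is carried to the matching enable */
static void write_event_old_style(void)
{
	int resched;

	resched = ftrace_preempt_disable();
	/* ... reserve, fill and commit a ring buffer event ... */
	ftrace_preempt_enable(resched);
}

/* after: nothing to carry between disable and enable, so the per-cpu
 * rb_need_resched bookkeeping can be dropped as well
 */
static void write_event_new_style(void)
{
	preempt_disable_notrace();
	/* ... reserve, fill and commit a ring buffer event ... */
	preempt_enable_notrace();
}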
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12#include <linux/time.h>
13
14#include "trace.h"
15#include "trace_output.h"
16
17static struct trace_array *boot_trace;
18static bool pre_initcalls_finished;
19
20/* Tells the boot tracer that the pre_smp_initcalls are finished.
21 * So we are ready.
22 * It doesn't enable sched events tracing, however.
23 * You have to call enable_boot_trace to do so.
24 */
25void start_boot_trace(void)
26{
27 pre_initcalls_finished = true;
28}
29
30void enable_boot_trace(void)
31{
32 if (boot_trace && pre_initcalls_finished)
33 tracing_start_sched_switch_record();
34}
35
36void disable_boot_trace(void)
37{
38 if (boot_trace && pre_initcalls_finished)
39 tracing_stop_sched_switch_record();
40}
41
42static int boot_trace_init(struct trace_array *tr)
43{
44 boot_trace = tr;
45
46 if (!tr)
47 return 0;
48
49 tracing_reset_online_cpus(tr);
50
51 tracing_sched_switch_assign_trace(tr);
52 return 0;
53}
54
55static enum print_line_t
56initcall_call_print_line(struct trace_iterator *iter)
57{
58 struct trace_entry *entry = iter->ent;
59 struct trace_seq *s = &iter->seq;
60 struct trace_boot_call *field;
61 struct boot_trace_call *call;
62 u64 ts;
63 unsigned long nsec_rem;
64 int ret;
65
66 trace_assign_type(field, entry);
67 call = &field->boot_call;
68 ts = iter->ts;
69 nsec_rem = do_div(ts, NSEC_PER_SEC);
70
71 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
72 (unsigned long)ts, nsec_rem, call->func, call->caller);
73
74 if (!ret)
75 return TRACE_TYPE_PARTIAL_LINE;
76 else
77 return TRACE_TYPE_HANDLED;
78}
79
80static enum print_line_t
81initcall_ret_print_line(struct trace_iterator *iter)
82{
83 struct trace_entry *entry = iter->ent;
84 struct trace_seq *s = &iter->seq;
85 struct trace_boot_ret *field;
86 struct boot_trace_ret *init_ret;
87 u64 ts;
88 unsigned long nsec_rem;
89 int ret;
90
91 trace_assign_type(field, entry);
92 init_ret = &field->boot_ret;
93 ts = iter->ts;
94 nsec_rem = do_div(ts, NSEC_PER_SEC);
95
96 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
97 "returned %d after %llu msecs\n",
98 (unsigned long) ts,
99 nsec_rem,
100 init_ret->func, init_ret->result, init_ret->duration);
101
102 if (!ret)
103 return TRACE_TYPE_PARTIAL_LINE;
104 else
105 return TRACE_TYPE_HANDLED;
106}
107
108static enum print_line_t initcall_print_line(struct trace_iterator *iter)
109{
110 struct trace_entry *entry = iter->ent;
111
112 switch (entry->type) {
113 case TRACE_BOOT_CALL:
114 return initcall_call_print_line(iter);
115 case TRACE_BOOT_RET:
116 return initcall_ret_print_line(iter);
117 default:
118 return TRACE_TYPE_UNHANDLED;
119 }
120}
121
122struct tracer boot_tracer __read_mostly =
123{
124 .name = "initcall",
125 .init = boot_trace_init,
126 .reset = tracing_reset_online_cpus,
127 .print_line = initcall_print_line,
128};
129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{
132 struct ftrace_event_call *call = &event_boot_call;
133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace;
137
138 if (!tr || !pre_initcalls_finished)
139 return;
140
141 /* Get its name now since this function could
142 * disappear because it is in the .init section.
143 */
144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable();
146
147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
149 sizeof(*entry), 0, 0);
150 if (!event)
151 goto out;
152 entry = ring_buffer_event_data(event);
153 entry->boot_call = *bt;
154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
156 out:
157 preempt_enable();
158}
159
160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
161{
162 struct ftrace_event_call *call = &event_boot_ret;
163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
165 struct trace_boot_ret *entry;
166 struct trace_array *tr = boot_trace;
167
168 if (!tr || !pre_initcalls_finished)
169 return;
170
171 sprint_symbol(bt->func, (unsigned long)fn);
172 preempt_disable();
173
174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
176 sizeof(*entry), 0, 0);
177 if (!event)
178 goto out;
179 entry = ring_buffer_event_data(event);
180 entry->boot_ret = *bt;
181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
183 out:
184 preempt_enable();
185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
56 */ 55 */
57u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
58{ 57{
59 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
60} 59}
61 60
62 61
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
151); 151);
152 152
153/* 153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry: 154 * Stack-trace entry:
172 */ 155 */
173 156
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
271 __entry->map_id, __entry->opcode) 254 __entry->map_id, __entry->opcode)
272); 255);
273 256
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301 257
302#define TRACE_FUNC_SIZE 30 258#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20 259#define TRACE_FILE_SIZE 20
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 274 __entry->func, __entry->file, __entry->correct)
319); 275);
320 276
321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
322
323 TRACE_KMEM_ALLOC,
324
325 F_STRUCT(
326 __field( enum kmemtrace_type_id, type_id )
327 __field( unsigned long, call_site )
328 __field( const void *, ptr )
329 __field( size_t, bytes_req )
330 __field( size_t, bytes_alloc )
331 __field( gfp_t, gfp_flags )
332 __field( int, node )
333 ),
334
335 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
336 " flags:%x node:%d",
337 __entry->type_id, __entry->call_site, __entry->ptr,
338 __entry->bytes_req, __entry->bytes_alloc,
339 __entry->gfp_flags, __entry->node)
340);
341
342FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
343
344 TRACE_KMEM_FREE,
345
346 F_STRUCT(
347 __field( enum kmemtrace_type_id, type_id )
348 __field( unsigned long, call_site )
349 __field( const void *, ptr )
350 ),
351
352 F_printk("type:%u call_site:%lx ptr:%p",
353 __entry->type_id, __entry->call_site, __entry->ptr)
354);
355
356FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
357
358 TRACE_KSYM,
359
360 F_STRUCT(
361 __field( unsigned long, ip )
362 __field( unsigned char, type )
363 __array( char , cmd, TASK_COMM_LEN )
364 __field( unsigned long, addr )
365 ),
366
367 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
368 (void *)__entry->ip, (unsigned int)__entry->type,
369 (void *)__entry->addr, __entry->cmd)
370);
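trace_entries.h relies on an X-macro style: each FTRACE_ENTRY()/F_STRUCT() block is a single description that other files re-expand into a C struct, a field list and a print format (the hunks above delete the descriptions for the removed special, boot, kmemtrace and ksym entries). A small user-space sketch of the same technique follows; the entry name and fields are invented, and these are not the kernel macros themselves.

#include <stdio.h>

/* One description, expanded twice -- the FTRACE_ENTRY()/F_STRUCT() idea
 * in miniature. */
#define MY_ENTRY_FIELDS(F)			\
	F(unsigned long, arg1)			\
	F(unsigned long, arg2)			\
	F(int,           result)

/* Expansion 1: the struct definition. */
#define DEFINE_FIELD(type, name) type name;
struct my_entry {
	MY_ENTRY_FIELDS(DEFINE_FIELD)
};

/* Expansion 2: a print routine that knows every field by name. */
#define PRINT_FIELD(type, name) \
	printf(" %s=%ld", #name, (long)e->name);
static void my_entry_print(const struct my_entry *e)
{
	printf("my_entry:");
	MY_ENTRY_FIELDS(PRINT_FIELD)
	printf("\n");
}

int main(void)
{
	struct my_entry e = { .arg1 = 1, .arg2 = 2, .result = -1 };

	my_entry_print(&e);
	return 0;
}

Deleting an FTRACE_ENTRY() block therefore removes the struct, the field metadata and the formatter in one place, which is why the removals above are self-contained.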
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 8a2b73f7c068..000e6e85b445 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,8 +9,6 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
13
14static char *perf_trace_buf[4]; 12static char *perf_trace_buf[4];
15 13
16/* 14/*
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
56 } 54 }
57 } 55 }
58 56
59 if (tp_event->class->reg) 57 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret) 58 if (ret)
67 goto fail; 59 goto fail;
68 60
@@ -96,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event)
96 mutex_lock(&event_mutex); 88 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) { 89 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id && 90 if (tp_event->event.type == event_id &&
99 tp_event->class && 91 tp_event->class && tp_event->class->reg &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
104 break; 94 break;
@@ -138,18 +128,13 @@ void perf_trace_destroy(struct perf_event *p_event)
138 if (--tp_event->perf_refcount > 0) 128 if (--tp_event->perf_refcount > 0)
139 goto out; 129 goto out;
140 130
141 if (tp_event->class->reg) 131 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147 132
148 /* 133 /*
149 * Ensure our callback won't be called anymore. See 134 * Ensure our callback won't be called anymore. The buffers
150 * tracepoint_probe_unregister() and __DO_TRACE(). 135 * will be freed after that.
151 */ 136 */
152 synchronize_sched(); 137 tracepoint_synchronize_unregister();
153 138
154 free_percpu(tp_event->perf_events); 139 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL; 140 tp_event->perf_events = NULL;
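The perf glue no longer open-codes tracepoint_probe_register()/unregister(); every event class is expected to expose a single ->reg(event, type) callback and the caller just names the operation. The sketch below shows that shape in user-space C; the enum values, struct and function names are invented stand-ins, not the kernel definitions.

#include <stdio.h>

/* Dispatch through one ->reg() callback instead of branching on which
 * probe function an event happens to provide. */
enum reg_op { REG_REGISTER, REG_UNREGISTER, REG_PERF_REGISTER, REG_PERF_UNREGISTER };

struct event_class {
	const char *name;
	int (*reg)(struct event_class *cls, enum reg_op op);
};

static int default_reg(struct event_class *cls, enum reg_op op)
{
	switch (op) {
	case REG_REGISTER:        printf("%s: attach probe\n", cls->name);      return 0;
	case REG_UNREGISTER:      printf("%s: detach probe\n", cls->name);      return 0;
	case REG_PERF_REGISTER:   printf("%s: attach perf probe\n", cls->name); return 0;
	case REG_PERF_UNREGISTER: printf("%s: detach perf probe\n", cls->name); return 0;
	}
	return 0;
}

int main(void)
{
	struct event_class ev = { .name = "sched_switch", .reg = default_reg };

	/* Callers no longer care how registration is implemented. */
	ev.reg(&ev, REG_PERF_REGISTER);
	ev.reg(&ev, REG_PERF_UNREGISTER);
	return 0;
}

Concentrating the mechanism behind one callback is what lets the perf, ftrace and filter paths later test only for ->reg instead of checking several probe pointers.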
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..4c758f146328 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,6 +28,7 @@
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields);
31 32
32struct list_head * 33struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call) 34trace_get_fields(struct ftrace_event_call *event_call)
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call)
37 return event_call->class->get_fields(event_call); 38 return event_call->class->get_fields(event_call);
38} 39}
39 40
40int trace_define_field(struct ftrace_event_call *call, const char *type, 41static int __trace_define_field(struct list_head *head, const char *type,
41 const char *name, int offset, int size, int is_signed, 42 const char *name, int offset, int size,
42 int filter_type) 43 int is_signed, int filter_type)
43{ 44{
44 struct ftrace_event_field *field; 45 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
49 46
50 field = kzalloc(sizeof(*field), GFP_KERNEL); 47 field = kzalloc(sizeof(*field), GFP_KERNEL);
51 if (!field) 48 if (!field)
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
68 field->size = size; 65 field->size = size;
69 field->is_signed = is_signed; 66 field->is_signed = is_signed;
70 67
71 head = trace_get_fields(call);
72 list_add(&field->link, head); 68 list_add(&field->link, head);
73 69
74 return 0; 70 return 0;
@@ -80,17 +76,32 @@ err:
80 76
81 return -ENOMEM; 77 return -ENOMEM;
82} 78}
79
80int trace_define_field(struct ftrace_event_call *call, const char *type,
81 const char *name, int offset, int size, int is_signed,
82 int filter_type)
83{
84 struct list_head *head;
85
86 if (WARN_ON(!call->class))
87 return 0;
88
89 head = trace_get_fields(call);
90 return __trace_define_field(head, type, name, offset, size,
91 is_signed, filter_type);
92}
83EXPORT_SYMBOL_GPL(trace_define_field); 93EXPORT_SYMBOL_GPL(trace_define_field);
84 94
85#define __common_field(type, item) \ 95#define __common_field(type, item) \
86 ret = trace_define_field(call, #type, "common_" #item, \ 96 ret = __trace_define_field(&ftrace_common_fields, #type, \
87 offsetof(typeof(ent), item), \ 97 "common_" #item, \
88 sizeof(ent.item), \ 98 offsetof(typeof(ent), item), \
89 is_signed_type(type), FILTER_OTHER); \ 99 sizeof(ent.item), \
100 is_signed_type(type), FILTER_OTHER); \
90 if (ret) \ 101 if (ret) \
91 return ret; 102 return ret;
92 103
93static int trace_define_common_fields(struct ftrace_event_call *call) 104static int trace_define_common_fields(void)
94{ 105{
95 int ret; 106 int ret;
96 struct trace_entry ent; 107 struct trace_entry ent;
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call)
130} 141}
131EXPORT_SYMBOL_GPL(trace_event_raw_init); 142EXPORT_SYMBOL_GPL(trace_event_raw_init);
132 143
144int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
145{
146 switch (type) {
147 case TRACE_REG_REGISTER:
148 return tracepoint_probe_register(call->name,
149 call->class->probe,
150 call);
151 case TRACE_REG_UNREGISTER:
152 tracepoint_probe_unregister(call->name,
153 call->class->probe,
154 call);
155 return 0;
156
157#ifdef CONFIG_PERF_EVENTS
158 case TRACE_REG_PERF_REGISTER:
159 return tracepoint_probe_register(call->name,
160 call->class->perf_probe,
161 call);
162 case TRACE_REG_PERF_UNREGISTER:
163 tracepoint_probe_unregister(call->name,
164 call->class->perf_probe,
165 call);
166 return 0;
167#endif
168 }
169 return 0;
170}
171EXPORT_SYMBOL_GPL(ftrace_event_reg);
172
173void trace_event_enable_cmd_record(bool enable)
174{
175 struct ftrace_event_call *call;
176
177 mutex_lock(&event_mutex);
178 list_for_each_entry(call, &ftrace_events, list) {
179 if (!(call->flags & TRACE_EVENT_FL_ENABLED))
180 continue;
181
182 if (enable) {
183 tracing_start_cmdline_record();
184 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
185 } else {
186 tracing_stop_cmdline_record();
187 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
188 }
189 }
190 mutex_unlock(&event_mutex);
191}
192
133static int ftrace_event_enable_disable(struct ftrace_event_call *call, 193static int ftrace_event_enable_disable(struct ftrace_event_call *call,
134 int enable) 194 int enable)
135{ 195{
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
139 case 0: 199 case 0:
140 if (call->flags & TRACE_EVENT_FL_ENABLED) { 200 if (call->flags & TRACE_EVENT_FL_ENABLED) {
141 call->flags &= ~TRACE_EVENT_FL_ENABLED; 201 call->flags &= ~TRACE_EVENT_FL_ENABLED;
142 tracing_stop_cmdline_record(); 202 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
143 if (call->class->reg) 203 tracing_stop_cmdline_record();
144 call->class->reg(call, TRACE_REG_UNREGISTER); 204 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
145 else 205 }
146 tracepoint_probe_unregister(call->name, 206 call->class->reg(call, TRACE_REG_UNREGISTER);
147 call->class->probe,
148 call);
149 } 207 }
150 break; 208 break;
151 case 1: 209 case 1:
152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 210 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
153 tracing_start_cmdline_record(); 211 if (trace_flags & TRACE_ITER_RECORD_CMD) {
154 if (call->class->reg) 212 tracing_start_cmdline_record();
155 ret = call->class->reg(call, TRACE_REG_REGISTER); 213 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
156 else 214 }
157 ret = tracepoint_probe_register(call->name, 215 ret = call->class->reg(call, TRACE_REG_REGISTER);
158 call->class->probe,
159 call);
160 if (ret) { 216 if (ret) {
161 tracing_stop_cmdline_record(); 217 tracing_stop_cmdline_record();
162 pr_info("event trace: Could not enable event " 218 pr_info("event trace: Could not enable event "
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
194 mutex_lock(&event_mutex); 250 mutex_lock(&event_mutex);
195 list_for_each_entry(call, &ftrace_events, list) { 251 list_for_each_entry(call, &ftrace_events, list) {
196 252
197 if (!call->name || !call->class || 253 if (!call->name || !call->class || !call->class->reg)
198 (!call->class->probe && !call->class->reg))
199 continue; 254 continue;
200 255
201 if (match && 256 if (match &&
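The enable/disable hunk above also changes the cmdline-recording bookkeeping: an event now remembers, via its RECORDED_CMD flag, whether it was the one that started cmdline recording, and only stops it in that case. The ownership pattern, reduced to a few lines of user-space C with invented names:

#include <stdio.h>

#define FL_ENABLED      0x1
#define FL_RECORDED_CMD 0x2	/* "we turned cmdline recording on, so we must turn it off" */

static int cmdline_users;	/* stand-in for the start/stop refcount */

static void event_enable(unsigned int *flags, int record_cmd)
{
	if (*flags & FL_ENABLED)
		return;
	if (record_cmd) {
		cmdline_users++;
		*flags |= FL_RECORDED_CMD;
	}
	*flags |= FL_ENABLED;
}

static void event_disable(unsigned int *flags)
{
	if (!(*flags & FL_ENABLED))
		return;
	if (*flags & FL_RECORDED_CMD) {		/* only release what we took */
		cmdline_users--;
		*flags &= ~FL_RECORDED_CMD;
	}
	*flags &= ~FL_ENABLED;
}

int main(void)
{
	unsigned int a = 0, b = 0;

	event_enable(&a, 1);
	event_enable(&b, 0);
	event_disable(&a);
	event_disable(&b);
	printf("cmdline users left: %d\n", cmdline_users);	/* 0, nothing leaked */
	return 0;
}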
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
321 * The ftrace subsystem is for showing formats only. 376 * The ftrace subsystem is for showing formats only.
322 * They can not be enabled or disabled via the event files. 377 * They can not be enabled or disabled via the event files.
323 */ 378 */
324 if (call->class && (call->class->probe || call->class->reg)) 379 if (call->class && call->class->reg)
325 return call; 380 return call;
326 } 381 }
327 382
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
474 529
475 mutex_lock(&event_mutex); 530 mutex_lock(&event_mutex);
476 list_for_each_entry(call, &ftrace_events, list) { 531 list_for_each_entry(call, &ftrace_events, list) {
477 if (!call->name || !call->class || 532 if (!call->name || !call->class || !call->class->reg)
478 (!call->class->probe && !call->class->reg))
479 continue; 533 continue;
480 534
481 if (system && strcmp(call->class->system, system) != 0) 535 if (system && strcmp(call->class->system, system) != 0)
@@ -544,85 +598,165 @@ out:
544 return ret; 598 return ret;
545} 599}
546 600
547static ssize_t 601enum {
548event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 602 FORMAT_HEADER = 1,
549 loff_t *ppos) 603 FORMAT_PRINTFMT = 2,
604};
605
606static void *f_next(struct seq_file *m, void *v, loff_t *pos)
550{ 607{
551 struct ftrace_event_call *call = filp->private_data; 608 struct ftrace_event_call *call = m->private;
552 struct ftrace_event_field *field; 609 struct ftrace_event_field *field;
553 struct list_head *head; 610 struct list_head *head;
554 struct trace_seq *s;
555 int common_field_count = 5;
556 char *buf;
557 int r = 0;
558 611
559 if (*ppos) 612 (*pos)++;
560 return 0;
561 613
562 s = kmalloc(sizeof(*s), GFP_KERNEL); 614 switch ((unsigned long)v) {
563 if (!s) 615 case FORMAT_HEADER:
564 return -ENOMEM; 616 head = &ftrace_common_fields;
565 617
566 trace_seq_init(s); 618 if (unlikely(list_empty(head)))
619 return NULL;
567 620
568 trace_seq_printf(s, "name: %s\n", call->name); 621 field = list_entry(head->prev, struct ftrace_event_field, link);
569 trace_seq_printf(s, "ID: %d\n", call->event.type); 622 return field;
570 trace_seq_printf(s, "format:\n"); 623
624 case FORMAT_PRINTFMT:
625 /* all done */
626 return NULL;
627 }
571 628
572 head = trace_get_fields(call); 629 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
574 /*
575 * Smartly shows the array type(except dynamic array).
576 * Normal:
577 * field:TYPE VAR
578 * If TYPE := TYPE[LEN], it is shown:
579 * field:TYPE VAR[LEN]
580 */
581 const char *array_descriptor = strchr(field->type, '[');
582 630
583 if (!strncmp(field->type, "__data_loc", 10)) 631 /*
584 array_descriptor = NULL; 632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
585 636
586 if (!array_descriptor) { 637 field = v;
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" 638 /*
588 "\tsize:%u;\tsigned:%d;\n", 639 * If this is a common field, and at the end of the list, then
589 field->type, field->name, field->offset, 640 * continue with main list.
590 field->size, !!field->is_signed); 641 */
591 } else { 642 if (field->link.prev == &ftrace_common_fields) {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" 643 if (unlikely(list_empty(head)))
593 "\tsize:%u;\tsigned:%d;\n", 644 return NULL;
594 (int)(array_descriptor - field->type), 645 field = list_entry(head->prev, struct ftrace_event_field, link);
595 field->type, field->name, 646 /* Set the LSB to notify f_show to print an extra newline */
596 array_descriptor, field->offset, 647 field = (struct ftrace_event_field *)
597 field->size, !!field->is_signed); 648 ((unsigned long)field | 1);
598 } 649 return field;
650 }
599 651
600 if (--common_field_count == 0) 652 /* If we are done tell f_show to print the format */
601 r = trace_seq_printf(s, "\n"); 653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT;
602 655
603 if (!r) 656 field = list_entry(field->link.prev, struct ftrace_event_field, link);
604 break;
605 }
606 657
607 if (r) 658 return field;
608 r = trace_seq_printf(s, "\nprint fmt: %s\n", 659}
609 call->print_fmt);
610 660
611 if (!r) { 661static void *f_start(struct seq_file *m, loff_t *pos)
612 /* 662{
613 * ug! The format output is bigger than a PAGE!! 663 loff_t l = 0;
614 */ 664 void *p;
615 buf = "FORMAT TOO BIG\n"; 665
616 r = simple_read_from_buffer(ubuf, cnt, ppos, 666 /* Start by showing the header */
617 buf, strlen(buf)); 667 if (!*pos)
618 goto out; 668 return (void *)FORMAT_HEADER;
669
670 p = (void *)FORMAT_HEADER;
671 do {
672 p = f_next(m, p, &l);
673 } while (p && l < *pos);
674
675 return p;
676}
677
678static int f_show(struct seq_file *m, void *v)
679{
680 struct ftrace_event_call *call = m->private;
681 struct ftrace_event_field *field;
682 const char *array_descriptor;
683
684 switch ((unsigned long)v) {
685 case FORMAT_HEADER:
686 seq_printf(m, "name: %s\n", call->name);
687 seq_printf(m, "ID: %d\n", call->event.type);
688 seq_printf(m, "format:\n");
689 return 0;
690
691 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt);
694 return 0;
619 } 695 }
620 696
621 r = simple_read_from_buffer(ubuf, cnt, ppos, 697 /*
622 s->buffer, s->len); 698 * To separate common fields from event fields, the
623 out: 699 * LSB is set on the first event field. Clear it and
624 kfree(s); 700 * print a newline if it is set.
625 return r; 701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v;
708
709 /*
710 * Smartly shows the array type(except dynamic array).
711 * Normal:
712 * field:TYPE VAR
713 * If TYPE := TYPE[LEN], it is shown:
714 * field:TYPE VAR[LEN]
715 */
716 array_descriptor = strchr(field->type, '[');
717
718 if (!strncmp(field->type, "__data_loc", 10))
719 array_descriptor = NULL;
720
721 if (!array_descriptor)
722 seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
723 field->type, field->name, field->offset,
724 field->size, !!field->is_signed);
725 else
726 seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
727 (int)(array_descriptor - field->type),
728 field->type, field->name,
729 array_descriptor, field->offset,
730 field->size, !!field->is_signed);
731
732 return 0;
733}
734
735static void f_stop(struct seq_file *m, void *p)
736{
737}
738
739static const struct seq_operations trace_format_seq_ops = {
740 .start = f_start,
741 .next = f_next,
742 .stop = f_stop,
743 .show = f_show,
744};
745
746static int trace_format_open(struct inode *inode, struct file *file)
747{
748 struct ftrace_event_call *call = inode->i_private;
749 struct seq_file *m;
750 int ret;
751
752 ret = seq_open(file, &trace_format_seq_ops);
753 if (ret < 0)
754 return ret;
755
756 m = file->private_data;
757 m->private = call;
758
759 return 0;
626} 760}
627 761
628static ssize_t 762static ssize_t
@@ -820,8 +954,10 @@ static const struct file_operations ftrace_enable_fops = {
820}; 954};
821 955
822static const struct file_operations ftrace_event_format_fops = { 956static const struct file_operations ftrace_event_format_fops = {
823 .open = tracing_open_generic, 957 .open = trace_format_open,
824 .read = event_format_read, 958 .read = seq_read,
959 .llseek = seq_lseek,
960 .release = seq_release,
825}; 961};
826 962
827static const struct file_operations ftrace_event_id_fops = { 963static const struct file_operations ftrace_event_id_fops = {
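The two hunks above replace the old one-shot event_format_read() (which gave up with "FORMAT TOO BIG" once the output exceeded a page) with a seq_file iterator: sentinel cookies for the header and print-fmt lines, and the pointer LSB used to mark the first event-specific field so f_show() can emit a separating blank line. The sketch below reproduces that iterator shape in plain user-space C; the field names, the demo event and the print format are made up, only the sentinel/LSB trick is taken from the hunk.

#include <stdio.h>
#include <stdint.h>

struct field { const char *type, *name; };

static const struct field fields[] = {
	{ "unsigned short", "common_type" },
	{ "int",            "common_pid"  },
	{ "char[16]",       "comm"        },	/* event-specific fields start here */
	{ "int",            "prio"        },
};
#define FIRST_EVENT_FIELD 2
#define N(a) (sizeof(a) / sizeof((a)[0]))

#define FORMAT_HEADER   ((void *)1)
#define FORMAT_PRINTFMT ((void *)2)

static void *tag(const struct field *f)   { return (void *)((uintptr_t)f | 1); }
static const struct field *untag(void *v) { return (const struct field *)((uintptr_t)v & ~(uintptr_t)1); }

static void *f_next(void *v)
{
	const struct field *f;

	if (v == FORMAT_HEADER)
		return (void *)&fields[0];
	if (v == FORMAT_PRINTFMT)
		return NULL;				/* iteration finished */

	f = untag(v) + 1;
	if (f == fields + N(fields))
		return FORMAT_PRINTFMT;			/* emit the print fmt line last */
	if (f == &fields[FIRST_EVENT_FIELD])
		return tag(f);				/* LSB marks the common/event boundary */
	return (void *)f;
}

static void f_show(void *v)
{
	if (v == FORMAT_HEADER) {
		printf("name: demo_event\nformat:\n");
		return;
	}
	if (v == FORMAT_PRINTFMT) {
		printf("\nprint fmt: \"comm=%%s prio=%%d\"\n");
		return;
	}
	if ((uintptr_t)v & 1)
		printf("\n");				/* first event field: blank separator */
	printf("\tfield:%s %s;\n", untag(v)->type, untag(v)->name);
}

int main(void)
{
	for (void *v = FORMAT_HEADER; v; v = f_next(v))
		f_show(v);
	return 0;
}

Because the iterator yields one item at a time, the output size no longer matters: seq_read() simply calls back into next/show until the user buffer is full, which is the whole point of the conversion.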
@@ -963,35 +1099,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 return -1; 1099 return -1;
964 } 1100 }
965 1101
966 if (call->class->probe || call->class->reg) 1102 if (call->class->reg)
967 trace_create_file("enable", 0644, call->dir, call, 1103 trace_create_file("enable", 0644, call->dir, call,
968 enable); 1104 enable);
969 1105
970#ifdef CONFIG_PERF_EVENTS 1106#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg)) 1107 if (call->event.type && call->class->reg)
972 trace_create_file("id", 0444, call->dir, call, 1108 trace_create_file("id", 0444, call->dir, call,
973 id); 1109 id);
974#endif 1110#endif
975 1111
976 if (call->class->define_fields) { 1112 /*
977 /* 1113 * Other events may have the same class. Only update
978 * Other events may have the same class. Only update 1114 * the fields if they are not already defined.
979 * the fields if they are not already defined. 1115 */
980 */ 1116 head = trace_get_fields(call);
981 head = trace_get_fields(call); 1117 if (list_empty(head)) {
982 if (list_empty(head)) { 1118 ret = call->class->define_fields(call);
983 ret = trace_define_common_fields(call); 1119 if (ret < 0) {
984 if (!ret) 1120 pr_warning("Could not initialize trace point"
985 ret = call->class->define_fields(call); 1121 " events/%s\n", call->name);
986 if (ret < 0) { 1122 return ret;
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
991 } 1123 }
992 trace_create_file("filter", 0644, call->dir, call,
993 filter);
994 } 1124 }
1125 trace_create_file("filter", 0644, call->dir, call,
1126 filter);
995 1127
996 trace_create_file("format", 0444, call->dir, call, 1128 trace_create_file("format", 0444, call->dir, call,
997 format); 1129 format);
@@ -999,11 +1131,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
999 return 0; 1131 return 0;
1000} 1132}
1001 1133
1002static int __trace_add_event_call(struct ftrace_event_call *call) 1134static int
1135__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1136 const struct file_operations *id,
1137 const struct file_operations *enable,
1138 const struct file_operations *filter,
1139 const struct file_operations *format)
1003{ 1140{
1004 struct dentry *d_events; 1141 struct dentry *d_events;
1005 int ret; 1142 int ret;
1006 1143
1144 /* The linker may leave blanks */
1007 if (!call->name) 1145 if (!call->name)
1008 return -EINVAL; 1146 return -EINVAL;
1009 1147
@@ -1011,8 +1149,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1011 ret = call->class->raw_init(call); 1149 ret = call->class->raw_init(call);
1012 if (ret < 0) { 1150 if (ret < 0) {
1013 if (ret != -ENOSYS) 1151 if (ret != -ENOSYS)
1014 pr_warning("Could not initialize trace " 1152 pr_warning("Could not initialize trace events/%s\n",
1015 "events/%s\n", call->name); 1153 call->name);
1016 return ret; 1154 return ret;
1017 } 1155 }
1018 } 1156 }
@@ -1021,11 +1159,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1021 if (!d_events) 1159 if (!d_events)
1022 return -ENOENT; 1160 return -ENOENT;
1023 1161
1024 ret = event_create_dir(call, d_events, &ftrace_event_id_fops, 1162 ret = event_create_dir(call, d_events, id, enable, filter, format);
1025 &ftrace_enable_fops, &ftrace_event_filter_fops,
1026 &ftrace_event_format_fops);
1027 if (!ret) 1163 if (!ret)
1028 list_add(&call->list, &ftrace_events); 1164 list_add(&call->list, &ftrace_events);
1165 call->mod = mod;
1029 1166
1030 return ret; 1167 return ret;
1031} 1168}
@@ -1035,7 +1172,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
1035{ 1172{
1036 int ret; 1173 int ret;
1037 mutex_lock(&event_mutex); 1174 mutex_lock(&event_mutex);
1038 ret = __trace_add_event_call(call); 1175 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1176 &ftrace_enable_fops,
1177 &ftrace_event_filter_fops,
1178 &ftrace_event_format_fops);
1039 mutex_unlock(&event_mutex); 1179 mutex_unlock(&event_mutex);
1040 return ret; 1180 return ret;
1041} 1181}
@@ -1152,8 +1292,6 @@ static void trace_module_add_events(struct module *mod)
1152{ 1292{
1153 struct ftrace_module_file_ops *file_ops = NULL; 1293 struct ftrace_module_file_ops *file_ops = NULL;
1154 struct ftrace_event_call *call, *start, *end; 1294 struct ftrace_event_call *call, *start, *end;
1155 struct dentry *d_events;
1156 int ret;
1157 1295
1158 start = mod->trace_events; 1296 start = mod->trace_events;
1159 end = mod->trace_events + mod->num_trace_events; 1297 end = mod->trace_events + mod->num_trace_events;
@@ -1161,38 +1299,14 @@ static void trace_module_add_events(struct module *mod)
1161 if (start == end) 1299 if (start == end)
1162 return; 1300 return;
1163 1301
1164 d_events = event_trace_events_dir(); 1302 file_ops = trace_create_file_ops(mod);
1165 if (!d_events) 1303 if (!file_ops)
1166 return; 1304 return;
1167 1305
1168 for_each_event(call, start, end) { 1306 for_each_event(call, start, end) {
1169 /* The linker may leave blanks */ 1307 __trace_add_event_call(call, mod,
1170 if (!call->name)
1171 continue;
1172 if (call->class->raw_init) {
1173 ret = call->class->raw_init(call);
1174 if (ret < 0) {
1175 if (ret != -ENOSYS)
1176 pr_warning("Could not initialize trace "
1177 "point events/%s\n", call->name);
1178 continue;
1179 }
1180 }
1181 /*
1182 * This module has events, create file ops for this module
1183 * if not already done.
1184 */
1185 if (!file_ops) {
1186 file_ops = trace_create_file_ops(mod);
1187 if (!file_ops)
1188 return;
1189 }
1190 call->mod = mod;
1191 ret = event_create_dir(call, d_events,
1192 &file_ops->id, &file_ops->enable, 1308 &file_ops->id, &file_ops->enable,
1193 &file_ops->filter, &file_ops->format); 1309 &file_ops->filter, &file_ops->format);
1194 if (!ret)
1195 list_add(&call->list, &ftrace_events);
1196 } 1310 }
1197} 1311}
1198 1312
@@ -1319,25 +1433,14 @@ static __init int event_trace_init(void)
1319 trace_create_file("enable", 0644, d_events, 1433 trace_create_file("enable", 0644, d_events,
1320 NULL, &ftrace_system_enable_fops); 1434 NULL, &ftrace_system_enable_fops);
1321 1435
1436 if (trace_define_common_fields())
1437 pr_warning("tracing: Failed to allocate common fields");
1438
1322 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1439 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1323 /* The linker may leave blanks */ 1440 __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1324 if (!call->name)
1325 continue;
1326 if (call->class->raw_init) {
1327 ret = call->class->raw_init(call);
1328 if (ret < 0) {
1329 if (ret != -ENOSYS)
1330 pr_warning("Could not initialize trace "
1331 "point events/%s\n", call->name);
1332 continue;
1333 }
1334 }
1335 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1336 &ftrace_enable_fops, 1441 &ftrace_enable_fops,
1337 &ftrace_event_filter_fops, 1442 &ftrace_event_filter_fops,
1338 &ftrace_event_format_fops); 1443 &ftrace_event_format_fops);
1339 if (!ret)
1340 list_add(&call->list, &ftrace_events);
1341 } 1444 }
1342 1445
1343 while (true) { 1446 while (true) {
@@ -1524,12 +1627,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1524 struct ftrace_entry *entry; 1627 struct ftrace_entry *entry;
1525 unsigned long flags; 1628 unsigned long flags;
1526 long disabled; 1629 long disabled;
1527 int resched;
1528 int cpu; 1630 int cpu;
1529 int pc; 1631 int pc;
1530 1632
1531 pc = preempt_count(); 1633 pc = preempt_count();
1532 resched = ftrace_preempt_disable(); 1634 preempt_disable_notrace();
1533 cpu = raw_smp_processor_id(); 1635 cpu = raw_smp_processor_id();
1534 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1636 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1535 1637
@@ -1551,7 +1653,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1551 1653
1552 out: 1654 out:
1553 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1655 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1554 ftrace_preempt_enable(resched); 1656 preempt_enable_notrace();
1555} 1657}
1556 1658
1557static struct ftrace_ops trace_ops __initdata = 1659static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system,
497} 497}
498 498
499static struct ftrace_event_field * 499static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500__find_event_field(struct list_head *head, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
504 503
505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) { 504 list_for_each_entry(field, head, link) {
507 if (!strcmp(field->name, name)) 505 if (!strcmp(field->name, name))
508 return field; 506 return field;
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
511 return NULL; 509 return NULL;
512} 510}
513 511
512static struct ftrace_event_field *
513find_event_field(struct ftrace_event_call *call, char *name)
514{
515 struct ftrace_event_field *field;
516 struct list_head *head;
517
518 field = __find_event_field(&ftrace_common_fields, name);
519 if (field)
520 return field;
521
522 head = trace_get_fields(call);
523 return __find_event_field(head, name);
524}
525
514static void filter_free_pred(struct filter_pred *pred) 526static void filter_free_pred(struct filter_pred *pred)
515{ 527{
516 if (!pred) 528 if (!pred)
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system)
627 int err; 639 int err;
628 640
629 list_for_each_entry(call, &ftrace_events, list) { 641 list_for_each_entry(call, &ftrace_events, list) {
630 if (!call->class || !call->class->define_fields)
631 continue;
632
633 if (strcmp(call->class->system, system->name) != 0) 642 if (strcmp(call->class->system, system->name) != 0)
634 continue; 643 continue;
635 644
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
646 struct ftrace_event_call *call; 655 struct ftrace_event_call *call;
647 656
648 list_for_each_entry(call, &ftrace_events, list) { 657 list_for_each_entry(call, &ftrace_events, list) {
649 if (!call->class || !call->class->define_fields)
650 continue;
651
652 if (strcmp(call->class->system, system->name) != 0) 658 if (strcmp(call->class->system, system->name) != 0)
653 continue; 659 continue;
654 660
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system,
1251 list_for_each_entry(call, &ftrace_events, list) { 1257 list_for_each_entry(call, &ftrace_events, list) {
1252 struct event_filter *filter = call->filter; 1258 struct event_filter *filter = call->filter;
1253 1259
1254 if (!call->class || !call->class->define_fields)
1255 continue;
1256
1257 if (strcmp(call->class->system, system->name) != 0) 1260 if (strcmp(call->class->system, system->name) != 0)
1258 continue; 1261 continue;
1259 1262
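find_event_field() now tries the shared common-field list before the per-event list, so filters on common_pid and friends keep working even though the common fields are no longer duplicated into every event. The lookup order in a few lines of user-space C; the arrays and field names below are invented:

#include <stdio.h>
#include <string.h>

struct field { const char *name; };

static const struct field common_fields[] = { { "common_pid" }, { "common_type" } };
static const struct field event_fields[]  = { { "prev_comm" }, { "next_pid" } };

static const struct field *find_in(const struct field *tbl, int n, const char *name)
{
	for (int i = 0; i < n; i++)
		if (strcmp(tbl[i].name, name) == 0)
			return &tbl[i];
	return NULL;
}

static const struct field *find_event_field(const char *name)
{
	const struct field *f;

	/* common fields first, then the event's own fields */
	f = find_in(common_fields, 2, name);
	if (f)
		return f;
	return find_in(event_fields, 2, name);
}

int main(void)
{
	printf("%s\n", find_event_field("common_pid") ? "found" : "missing");
	printf("%s\n", find_event_field("next_pid")   ? "found" : "missing");
	printf("%s\n", find_event_field("bogus")      ? "found" : "missing");
	return 0;
}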
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 8536e2a65969..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
125 125
126#include "trace_entries.h" 126#include "trace_entries.h"
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->class->fields);
131 return 0;
132}
133
134#undef __entry 128#undef __entry
135#define __entry REC 129#define __entry REC
136 130
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
158struct ftrace_event_class event_class_ftrace_##call = { \ 152struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 153 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 154 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \ 155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 156}; \
163 \ 157 \
164struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
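trace_export.c drops a raw_init() callback whose only job was INIT_LIST_HEAD(); the field list is now initialized at build time with LIST_HEAD_INIT pointing the head at itself. A tiny user-space illustration of why an empty circular list can be set up by a static initializer (the struct here mimics the kernel's list_head, nothing more):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* Same idea as the kernel's LIST_HEAD_INIT: an empty circular list is a
 * head whose next and prev point back at itself, which a static
 * initializer can express directly. */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

static struct list_head fields = LIST_HEAD_INIT(fields);

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

int main(void)
{
	printf("statically initialized list is %s\n",
	       list_empty(&fields) ? "empty" : "not empty");
	return 0;
}

Because the head is self-referential from the start, no code has to run before the list is usable, which is exactly what lets the raw_init() callback go away.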
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..6f233698518e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter,
507 * if the output fails. 507 * if the output fails.
508 */ 508 */
509 data->ent = *curr; 509 data->ent = *curr;
510 data->ret = *next; 510 /*
511 * If the next event is not a return type, then
512 * we only care about what type it is. Otherwise we can
513 * safely copy the entire event.
514 */
515 if (next->ent.type == TRACE_GRAPH_RET)
516 data->ret = *next;
517 else
518 data->ret.ent.type = next->ent.type;
511 } 519 }
512 } 520 }
513 521
@@ -641,7 +649,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
641 649
642 /* Print nsecs (we don't want to exceed 7 numbers) */ 650 /* Print nsecs (we don't want to exceed 7 numbers) */
643 if (len < 7) { 651 if (len < 7) {
644 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); 652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
653 nsecs_rem);
645 ret = trace_seq_printf(s, ".%s", nsecs_str); 654 ret = trace_seq_printf(s, ".%s", nsecs_str);
646 if (!ret) 655 if (!ret)
647 return TRACE_TYPE_PARTIAL_LINE; 656 return TRACE_TYPE_PARTIAL_LINE;
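The one-line change in trace_print_graph_duration() clamps the size handed to snprintf() with min(sizeof(nsecs_str), 8UL - len): whatever 8 - len works out to, the bound passed to snprintf() must never exceed the destination buffer. A user-space sketch of the clamping idiom; the 5-byte buffer and the length value are chosen arbitrarily for illustration:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	char nsecs_str[5];			/* small on-stack buffer */
	size_t len = 2;				/* characters already emitted elsewhere */

	/* Unclamped, 8 - len == 6 would let snprintf assume more room than
	 * the 5-byte buffer has; clamped, the bound can never exceed it. */
	size_t bound = MIN(sizeof(nsecs_str), 8 - len);

	snprintf(nsecs_str, bound, "%03lu", 987UL);
	printf("bound=%zu result=\"%s\"\n", bound, nsecs_str);
	return 0;
}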
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..73a6b0601f2e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly =
649#endif 649#endif
650 .open = irqsoff_trace_open, 650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close, 651 .close = irqsoff_trace_close,
652 .use_max_tr = 1,
652}; 653};
653# define register_irqsoff(trace) register_tracer(&trace) 654# define register_irqsoff(trace) register_tracer(&trace)
654#else 655#else
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly =
681#endif 682#endif
682 .open = irqsoff_trace_open, 683 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close, 684 .close = irqsoff_trace_close,
685 .use_max_tr = 1,
684}; 686};
685# define register_preemptoff(trace) register_tracer(&trace) 687# define register_preemptoff(trace) register_tracer(&trace)
686#else 688#else
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
715#endif 717#endif
716 .open = irqsoff_trace_open, 718 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close, 719 .close = irqsoff_trace_close,
720 .use_max_tr = 1,
718}; 721};
719 722
720# define register_preemptirqsoff(trace) register_tracer(&trace) 723# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..7b8ecd751d93
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,136 @@
1/*
2 * kdb helper for dumping the ftrace buffer
3 *
4 * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
5 *
6 * ftrace_dump_buf based on ftrace_dump:
7 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
8 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
9 *
10 */
11#include <linux/init.h>
12#include <linux/kgdb.h>
13#include <linux/kdb.h>
14#include <linux/ftrace.h>
15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h"
18#include "trace_output.h"
19
20static void ftrace_dump_buf(int skip_lines, long cpu_file)
21{
22 /* use static because iter can be a bit big for the stack */
23 static struct trace_iterator iter;
24 unsigned int old_userobj;
25 int cnt = 0, cpu;
26
27 trace_init_global_iter(&iter);
28
29 for_each_tracing_cpu(cpu) {
30 atomic_inc(&iter.tr->data[cpu]->disabled);
31 }
32
33 old_userobj = trace_flags;
34
35 /* don't look at user memory in panic mode */
36 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
37
38 kdb_printf("Dumping ftrace buffer:\n");
39
40 /* reset all but tr, trace, and overruns */
41 memset(&iter.seq, 0,
42 sizeof(struct trace_iterator) -
43 offsetof(struct trace_iterator, seq));
44 iter.iter_flags |= TRACE_FILE_LAT_FMT;
45 iter.pos = -1;
46
47 if (cpu_file == TRACE_PIPE_ALL_CPU) {
48 for_each_tracing_cpu(cpu) {
49 iter.buffer_iter[cpu] =
50 ring_buffer_read_prepare(iter.tr->buffer, cpu);
51 ring_buffer_read_start(iter.buffer_iter[cpu]);
52 tracing_iter_reset(&iter, cpu);
53 }
54 } else {
55 iter.cpu_file = cpu_file;
56 iter.buffer_iter[cpu_file] =
57 ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
58 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
59 tracing_iter_reset(&iter, cpu_file);
60 }
61 if (!trace_empty(&iter))
62 trace_find_next_entry_inc(&iter);
63 while (!trace_empty(&iter)) {
64 if (!cnt)
65 kdb_printf("---------------------------------\n");
66 cnt++;
67
68 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
69 print_trace_line(&iter);
70 if (!skip_lines)
71 trace_printk_seq(&iter.seq);
72 else
73 skip_lines--;
74 if (KDB_FLAG(CMD_INTERRUPT))
75 goto out;
76 }
77
78 if (!cnt)
79 kdb_printf(" (ftrace buffer empty)\n");
80 else
81 kdb_printf("---------------------------------\n");
82
83out:
84 trace_flags = old_userobj;
85
86 for_each_tracing_cpu(cpu) {
87 atomic_dec(&iter.tr->data[cpu]->disabled);
88 }
89
90 for_each_tracing_cpu(cpu)
91 if (iter.buffer_iter[cpu])
92 ring_buffer_read_finish(iter.buffer_iter[cpu]);
93}
94
95/*
96 * kdb_ftdump - Dump the ftrace log buffer
97 */
98static int kdb_ftdump(int argc, const char **argv)
99{
100 int skip_lines = 0;
101 long cpu_file;
102 char *cp;
103
104 if (argc > 2)
105 return KDB_ARGCOUNT;
106
107 if (argc) {
108 skip_lines = simple_strtol(argv[1], &cp, 0);
109 if (*cp)
110 skip_lines = 0;
111 }
112
113 if (argc == 2) {
114 cpu_file = simple_strtol(argv[2], &cp, 0);
115 if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
116 !cpu_online(cpu_file))
117 return KDB_BADINT;
118 } else {
119 cpu_file = TRACE_PIPE_ALL_CPU;
120 }
121
122 kdb_trap_printk++;
123 ftrace_dump_buf(skip_lines, cpu_file);
124 kdb_trap_printk--;
125
126 return 0;
127}
128
129static __init int kdb_ftrace_register(void)
130{
131 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
132 "Dump ftrace log", 0, KDB_REPEAT_NONE);
133 return 0;
134}
135
136late_initcall(kdb_ftrace_register);
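kdb_ftdump() above validates its optional arguments with simple_strtol() and an end-pointer check before trusting them. The same parse-and-validate shape with the standard C library is shown below; NR_CPUS and the argument handling are stand-ins for illustration, not the kdb code.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 8	/* arbitrary stand-in */

int main(int argc, char **argv)
{
	long skip_lines = 0, cpu = -1;
	char *cp;

	if (argc > 3) {
		fprintf(stderr, "usage: %s [skip_lines] [cpu]\n", argv[0]);
		return 1;
	}
	if (argc >= 2) {
		skip_lines = strtol(argv[1], &cp, 0);
		if (*cp)
			skip_lines = 0;		/* trailing garbage: fall back to the default */
	}
	if (argc == 3) {
		cpu = strtol(argv[2], &cp, 0);
		if (*cp || cpu < 0 || cpu >= NR_CPUS) {
			fprintf(stderr, "bad cpu argument\n");
			return 1;
		}
	}
	printf("skip_lines=%ld cpu=%ld (-1 means all cpus)\n", skip_lines, cpu);
	return 0;
}

At the kdb prompt the new command is invoked the same way, as ftdump [skip_#lines] [cpu], matching the help string registered above.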
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..8b27c9849b42 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,8 @@
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <linux/uaccess.h>
33#include <asm/bitsperlong.h> 35#include <asm/bitsperlong.h>
34 36
35#include "trace.h" 37#include "trace.h"
@@ -38,6 +40,7 @@
38#define MAX_TRACE_ARGS 128 40#define MAX_TRACE_ARGS 128
39#define MAX_ARGSTR_LEN 63 41#define MAX_ARGSTR_LEN 63
40#define MAX_EVENT_NAME_LEN 64 42#define MAX_EVENT_NAME_LEN 64
43#define MAX_STRING_SIZE PATH_MAX
41#define KPROBE_EVENT_SYSTEM "kprobes" 44#define KPROBE_EVENT_SYSTEM "kprobes"
42 45
43/* Reserved field names */ 46/* Reserved field names */
@@ -58,14 +61,16 @@ const char *reserved_field_names[] = {
58}; 61};
59 62
60/* Printing function type */ 63/* Printing function type */
61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); 64typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
65 void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type 66#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type 67#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64 68
65/* Printing in basic type function template */ 69/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 70#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 71static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\ 72 const char *name, \
73 void *data, void *ent)\
69{ \ 74{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 75 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \ 76} \
@@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) 85DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) 86DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82 87
88/* data_rloc: data relative location, compatible with u32 */
89#define make_data_rloc(len, roffs) \
90 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
91#define get_rloc_len(dl) ((u32)(dl) >> 16)
92#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
93
94static inline void *get_rloc_data(u32 *dl)
95{
96 return (u8 *)dl + get_rloc_offs(*dl);
97}
98
99/* For data_loc conversion */
100static inline void *get_loc_data(u32 *dl, void *ent)
101{
102 return (u8 *)ent + get_rloc_offs(*dl);
103}
104
105/*
106 * Convert data_rloc to data_loc:
107 * data_rloc stores the offset from data_rloc itself, but data_loc
108 * stores the offset from event entry.
109 */
110#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
111
112/* For defining macros, define string/string_size types */
113typedef u32 string;
114typedef u32 string_size;
115
116/* Print type function for string type */
117static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
118 const char *name,
119 void *data, void *ent)
120{
121 int len = *(u32 *)data >> 16;
122
123 if (!len)
124 return trace_seq_printf(s, " %s=(fault)", name);
125 else
126 return trace_seq_printf(s, " %s=\"%s\"", name,
127 (const char *)get_loc_data(data, ent));
128}
129static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
130
83/* Data fetch function type */ 131/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 132typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85 133
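The new string support above stores each dynamic string as a u32 "data_rloc": the upper 16 bits carry the length and the lower 16 bits the offset of the data, relative to the u32 itself until it is later converted to an offset from the event entry. The packing helpers are easy to exercise in user space; the values below are arbitrary.

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Same bit layout as the hunk's make_data_rloc()/get_rloc_len()/
 * get_rloc_offs(): length in the high 16 bits, offset in the low 16. */
#define make_data_rloc(len, roffs) \
	(((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
#define get_rloc_len(dl)  ((uint32_t)(dl) >> 16)
#define get_rloc_offs(dl) ((uint32_t)(dl) & 0xffff)

int main(void)
{
	uint32_t dl = make_data_rloc(12, 40);	/* 12-byte string, 40 bytes away */

	assert(get_rloc_len(dl) == 12);
	assert(get_rloc_offs(dl) == 40);
	printf("dl=0x%08x len=%u offs=%u\n", dl, get_rloc_len(dl), get_rloc_offs(dl));
	return 0;
}

Packing both values into one u32 keeps the fixed part of the event the same size no matter how long the string turns out to be; only the variable tail grows.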
@@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
94 return fprm->fn(regs, fprm->data, dest); 142 return fprm->fn(regs, fprm->data, dest);
95} 143}
96 144
97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type 145#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
98/* 146/*
99 * Define macro for basic types - we don't need to define s* types, because 147 * Define macro for basic types - we don't need to define s* types, because
100 * we have to care only about bitwidth at recording time. 148 * we have to care only about bitwidth at recording time.
101 */ 149 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \ 150#define DEFINE_BASIC_FETCH_FUNCS(method) \
103DEFINE_FETCH_##kind(u8) \ 151DEFINE_FETCH_##method(u8) \
104DEFINE_FETCH_##kind(u16) \ 152DEFINE_FETCH_##method(u16) \
105DEFINE_FETCH_##kind(u32) \ 153DEFINE_FETCH_##method(u32) \
106DEFINE_FETCH_##kind(u64) 154DEFINE_FETCH_##method(u64)
107 155
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ 156#define CHECK_FETCH_FUNCS(method, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \ 157 (((FETCH_FUNC_NAME(method, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \ 158 (FETCH_FUNC_NAME(method, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \ 159 (FETCH_FUNC_NAME(method, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn)) 160 (FETCH_FUNC_NAME(method, u64) == fn) || \
161 (FETCH_FUNC_NAME(method, string) == fn) || \
162 (FETCH_FUNC_NAME(method, string_size) == fn)) \
163 && (fn != NULL))
113 164
114/* Data fetch function templates */ 165/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \ 166#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 167static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \ 168 void *offset, void *dest) \
118{ \ 169{ \
119 *(type *)dest = (type)regs_get_register(regs, \ 170 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \ 171 (unsigned int)((unsigned long)offset)); \
121} 172}
122DEFINE_BASIC_FETCH_FUNCS(reg) 173DEFINE_BASIC_FETCH_FUNCS(reg)
174/* No string on the register */
175#define fetch_reg_string NULL
176#define fetch_reg_string_size NULL
123 177
124#define DEFINE_FETCH_stack(type) \ 178#define DEFINE_FETCH_stack(type) \
125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 179static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
129 (unsigned int)((unsigned long)offset)); \ 183 (unsigned int)((unsigned long)offset)); \
130} 184}
131DEFINE_BASIC_FETCH_FUNCS(stack) 185DEFINE_BASIC_FETCH_FUNCS(stack)
186/* No string on the stack entry */
187#define fetch_stack_string NULL
188#define fetch_stack_string_size NULL
132 189
133#define DEFINE_FETCH_retval(type) \ 190#define DEFINE_FETCH_retval(type) \
134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 191static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
137 *(type *)dest = (type)regs_return_value(regs); \ 194 *(type *)dest = (type)regs_return_value(regs); \
138} 195}
139DEFINE_BASIC_FETCH_FUNCS(retval) 196DEFINE_BASIC_FETCH_FUNCS(retval)
197/* No string on the retval */
198#define fetch_retval_string NULL
199#define fetch_retval_string_size NULL
140 200
141#define DEFINE_FETCH_memory(type) \ 201#define DEFINE_FETCH_memory(type) \
142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 202static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
149 *(type *)dest = retval; \ 209 *(type *)dest = retval; \
150} 210}
151DEFINE_BASIC_FETCH_FUNCS(memory) 211DEFINE_BASIC_FETCH_FUNCS(memory)
212/*
213 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
214 * length and relative data location.
215 */
216static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
217 void *addr, void *dest)
218{
219 long ret;
220 int maxlen = get_rloc_len(*(u32 *)dest);
221 u8 *dst = get_rloc_data(dest);
222 u8 *src = addr;
223 mm_segment_t old_fs = get_fs();
224 if (!maxlen)
225 return;
226 /*
227 * Try to get string again, since the string can be changed while
228 * probing.
229 */
230 set_fs(KERNEL_DS);
231 pagefault_disable();
232 do
233 ret = __copy_from_user_inatomic(dst++, src++, 1);
234 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
235 dst[-1] = '\0';
236 pagefault_enable();
237 set_fs(old_fs);
238
239 if (ret < 0) { /* Failed to fetch string */
240 ((u8 *)get_rloc_data(dest))[0] = '\0';
241 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
242 } else
243 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
244 get_rloc_offs(*(u32 *)dest));
245}
246/* Return the length of string -- including null terminal byte */
247static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
248 void *addr, void *dest)
249{
250 int ret, len = 0;
251 u8 c;
252 mm_segment_t old_fs = get_fs();
253
254 set_fs(KERNEL_DS);
255 pagefault_disable();
256 do {
257 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
258 len++;
259 } while (c && ret == 0 && len < MAX_STRING_SIZE);
260 pagefault_enable();
261 set_fs(old_fs);
262
263 if (ret < 0) /* Failed to check the length */
264 *(u32 *)dest = 0;
265 else
266 *(u32 *)dest = len;
267}
152 268
153/* Memory fetching by symbol */ 269/* Memory fetching by symbol */
154struct symbol_cache { 270struct symbol_cache {
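The memory string fetch above copies byte by byte up to a maximum length, always terminates the destination, and reports the copied length including the trailing NUL. User space cannot disable page faults, so the sketch below only mirrors the bounded copy loop and that length convention; the fault-handling path (treating an unreadable source as an empty string) is left out, and the function name is invented.

#include <stdio.h>

/* Copy at most maxlen bytes, stop at the NUL, always terminate the
 * destination, and return the copied length including the terminator. */
static size_t bounded_strcpy(char *dst, const char *src, size_t maxlen)
{
	size_t n = 0;

	if (!maxlen)
		return 0;
	do {
		dst[n] = src[n];
		n++;
	} while (dst[n - 1] && n < maxlen);
	dst[n - 1] = '\0';
	return n;				/* length including the NUL byte */
}

int main(void)
{
	char buf[8];
	size_t n;

	n = bounded_strcpy(buf, "filename", sizeof(buf));
	printf("copied %zu bytes: \"%s\"\n", n, buf);	/* truncated to "filenam" */
	n = bounded_strcpy(buf, "ls", sizeof(buf));
	printf("copied %zu bytes: \"%s\"\n", n, buf);	/* 3 bytes: 'l', 's', '\0' */
	return 0;
}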
@@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
203 *(type *)dest = 0; \ 319 *(type *)dest = 0; \
204} 320}
205DEFINE_BASIC_FETCH_FUNCS(symbol) 321DEFINE_BASIC_FETCH_FUNCS(symbol)
322DEFINE_FETCH_symbol(string)
323DEFINE_FETCH_symbol(string_size)
206 324
207/* Dereference memory access function */ 325/* Dereference memory access function */
208struct deref_fetch_param { 326struct deref_fetch_param {
@@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
224 *(type *)dest = 0; \ 342 *(type *)dest = 0; \
225} 343}
226DEFINE_BASIC_FETCH_FUNCS(deref) 344DEFINE_BASIC_FETCH_FUNCS(deref)
345DEFINE_FETCH_deref(string)
346DEFINE_FETCH_deref(string_size)
227 347
228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 348static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
229{ 349{
230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) 350 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
231 free_deref_fetch_param(data->orig.data); 351 free_deref_fetch_param(data->orig.data);
232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) 352 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
233 free_symbol_cache(data->orig.data); 353 free_symbol_cache(data->orig.data);
234 kfree(data); 354 kfree(data);
235} 355}
@@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) 360#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) 361#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242 362
243#define ASSIGN_FETCH_FUNC(kind, type) \ 363/* Fetch types */
244 .kind = FETCH_FUNC_NAME(kind, type) 364enum {
245 365 FETCH_MTD_reg = 0,
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ 366 FETCH_MTD_stack,
247 {.name = #ptype, \ 367 FETCH_MTD_retval,
248 .size = sizeof(ftype), \ 368 FETCH_MTD_memory,
249 .is_signed = sign, \ 369 FETCH_MTD_symbol,
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \ 370 FETCH_MTD_deref,
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \ 371 FETCH_MTD_END,
252ASSIGN_FETCH_FUNC(reg, ftype), \ 372};
253ASSIGN_FETCH_FUNC(stack, ftype), \ 373
254ASSIGN_FETCH_FUNC(retval, ftype), \ 374#define ASSIGN_FETCH_FUNC(method, type) \
255ASSIGN_FETCH_FUNC(memory, ftype), \ 375 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
256ASSIGN_FETCH_FUNC(symbol, ftype), \ 376
257ASSIGN_FETCH_FUNC(deref, ftype), \ 377#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
378 {.name = _name, \
379 .size = _size, \
380 .is_signed = sign, \
381 .print = PRINT_TYPE_FUNC_NAME(ptype), \
382 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
383 .fmttype = _fmttype, \
384 .fetch = { \
385ASSIGN_FETCH_FUNC(reg, ftype), \
386ASSIGN_FETCH_FUNC(stack, ftype), \
387ASSIGN_FETCH_FUNC(retval, ftype), \
388ASSIGN_FETCH_FUNC(memory, ftype), \
389ASSIGN_FETCH_FUNC(symbol, ftype), \
390ASSIGN_FETCH_FUNC(deref, ftype), \
391 } \
258 } 392 }
259 393
394#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
395 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
396
397#define FETCH_TYPE_STRING 0
398#define FETCH_TYPE_STRSIZE 1
399
260/* Fetch type information table */ 400/* Fetch type information table */
261static const struct fetch_type { 401static const struct fetch_type {
262 const char *name; /* Name of type */ 402 const char *name; /* Name of type */
@@ -264,14 +404,16 @@ static const struct fetch_type {
264 int is_signed; /* Signed flag */ 404 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */ 405 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Fromat string */ 406 const char *fmt; /* Fromat string */
266 const char *fmt; /* Format string */ 406
407 const char *fmttype; /* Name in format file */
267 /* Fetch functions */ 408 /* Fetch functions */
268 fetch_func_t reg; 409 fetch_func_t fetch[FETCH_MTD_END];
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = { 410} fetch_type_table[] = {
411 /* Special types */
412 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
413 sizeof(u32), 1, "__data_loc char[]"),
414 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
415 string_size, sizeof(u32), 0, "u32"),
416 /* Basic types */
275 ASSIGN_FETCH_TYPE(u8, u8, 0), 417 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0), 418 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0), 419 ASSIGN_FETCH_TYPE(u32, u32, 0),
@@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
302 *(unsigned long *)dest = kernel_stack_pointer(regs); 444 *(unsigned long *)dest = kernel_stack_pointer(regs);
303} 445}
304 446
447static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
448 fetch_func_t orig_fn)
449{
450 int i;
451
452 if (type != &fetch_type_table[FETCH_TYPE_STRING])
453 return NULL; /* Only string type needs size function */
454 for (i = 0; i < FETCH_MTD_END; i++)
455 if (type->fetch[i] == orig_fn)
456 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
457
458 WARN_ON(1); /* This should not happen */
459 return NULL;
460}
461
305/** 462/**
306 * Kprobe event core functions 463 * Kprobe event core functions
307 */ 464 */
308 465
309struct probe_arg { 466struct probe_arg {
310 struct fetch_param fetch; 467 struct fetch_param fetch;
468 struct fetch_param fetch_size;
311 unsigned int offset; /* Offset from argument entry */ 469 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */ 470 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */ 471 const char *comm; /* Command of this argument */
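get_fetch_size_function() above pairs a string argument's data pass with its size pre-pass by locating the argument's fetch function in the "string" type's fetch[] table and returning the same slot from the "string_size" type. The table-pairing trick in user-space C; every name below is an invented stand-in and the functions do nothing real.

#include <stdio.h>

/* Two parallel tables indexed by the same "method" enum; given an entry
 * from the first, return the matching entry from the second. */
enum method { MTD_REG, MTD_STACK, MTD_MEMORY, MTD_END };

typedef int (*fetch_fn)(void);

static int fetch_reg(void)         { return 1; }
static int fetch_stack(void)       { return 2; }
static int fetch_memory(void)      { return 3; }
static int fetch_memory_size(void) { return 30; }

static const fetch_fn string_fetch[MTD_END] = {
	[MTD_REG]    = fetch_reg,
	[MTD_STACK]  = fetch_stack,
	[MTD_MEMORY] = fetch_memory,
};
static const fetch_fn string_size_fetch[MTD_END] = {
	[MTD_MEMORY] = fetch_memory_size,	/* only memory-based strings have a size pass */
};

static fetch_fn get_size_function(fetch_fn orig)
{
	for (int i = 0; i < MTD_END; i++)
		if (string_fetch[i] == orig)
			return string_size_fetch[i];
	return NULL;
}

int main(void)
{
	fetch_fn size_fn = get_size_function(fetch_memory);

	printf("size pass returns %d\n", size_fn ? size_fn() : -1);
	return 0;
}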
@@ -429,9 +587,9 @@ error:
429 587
430static void free_probe_arg(struct probe_arg *arg) 588static void free_probe_arg(struct probe_arg *arg)
431{ 589{
432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) 590 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data); 591 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) 592 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
435 free_symbol_cache(arg->fetch.data); 593 free_symbol_cache(arg->fetch.data);
436 kfree(arg->name); 594 kfree(arg->name);
437 kfree(arg->comm); 595 kfree(arg->comm);
@@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
548 706
549 if (strcmp(arg, "retval") == 0) { 707 if (strcmp(arg, "retval") == 0) {
550 if (is_return) 708 if (is_return)
551 f->fn = t->retval; 709 f->fn = t->fetch[FETCH_MTD_retval];
552 else 710 else
553 ret = -EINVAL; 711 ret = -EINVAL;
554 } else if (strncmp(arg, "stack", 5) == 0) { 712 } else if (strncmp(arg, "stack", 5) == 0) {
@@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
562 if (ret || param > PARAM_MAX_STACK) 720 if (ret || param > PARAM_MAX_STACK)
563 ret = -EINVAL; 721 ret = -EINVAL;
564 else { 722 else {
565 f->fn = t->stack; 723 f->fn = t->fetch[FETCH_MTD_stack];
566 f->data = (void *)param; 724 f->data = (void *)param;
567 } 725 }
568 } else 726 } else
@@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
588 case '%': /* named register */ 746 case '%': /* named register */
589 ret = regs_query_register_offset(arg + 1); 747 ret = regs_query_register_offset(arg + 1);
590 if (ret >= 0) { 748 if (ret >= 0) {
591 f->fn = t->reg; 749 f->fn = t->fetch[FETCH_MTD_reg];
592 f->data = (void *)(unsigned long)ret; 750 f->data = (void *)(unsigned long)ret;
593 ret = 0; 751 ret = 0;
594 } 752 }
@@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
598 ret = strict_strtoul(arg + 1, 0, &param); 756 ret = strict_strtoul(arg + 1, 0, &param);
599 if (ret) 757 if (ret)
600 break; 758 break;
601 f->fn = t->memory; 759 f->fn = t->fetch[FETCH_MTD_memory];
602 f->data = (void *)param; 760 f->data = (void *)param;
603 } else { 761 } else {
604 ret = split_symbol_offset(arg + 1, &offset); 762 ret = split_symbol_offset(arg + 1, &offset);
@@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
606 break; 764 break;
607 f->data = alloc_symbol_cache(arg + 1, offset); 765 f->data = alloc_symbol_cache(arg + 1, offset);
608 if (f->data) 766 if (f->data)
609 f->fn = t->symbol; 767 f->fn = t->fetch[FETCH_MTD_symbol];
610 } 768 }
611 break; 769 break;
612 case '+': /* deref memory */ 770 case '+': /* deref memory */
@@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
636 if (ret) 794 if (ret)
637 kfree(dprm); 795 kfree(dprm);
638 else { 796 else {
639 f->fn = t->deref; 797 f->fn = t->fetch[FETCH_MTD_deref];
640 f->data = (void *)dprm; 798 f->data = (void *)dprm;
641 } 799 }
642 } 800 }
643 break; 801 break;
644 } 802 }
645	if (!ret && !f->fn)			803	if (!ret && !f->fn) {	/* Parsed, but no fetch method found */
804 pr_info("%s type has no corresponding fetch method.\n",
805 t->name);
646 ret = -EINVAL; 806 ret = -EINVAL;
807 }
647 return ret; 808 return ret;
648} 809}
649 810
@@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return) 813 struct probe_arg *parg, int is_return)
653{ 814{
654 const char *t; 815 const char *t;
816 int ret;
655 817
656 if (strlen(arg) > MAX_ARGSTR_LEN) { 818 if (strlen(arg) > MAX_ARGSTR_LEN) {
657 pr_info("Argument is too long.: %s\n", arg); 819 pr_info("Argument is too long.: %s\n", arg);
@@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
674 } 836 }
675 parg->offset = tp->size; 837 parg->offset = tp->size;
676 tp->size += parg->type->size; 838 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 839 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
840 if (ret >= 0) {
841 parg->fetch_size.fn = get_fetch_size_function(parg->type,
842 parg->fetch.fn);
843 parg->fetch_size.data = parg->fetch.data;
844 }
845 return ret;
678} 846}
679 847
680/* Return 1 if name is reserved or already used by another argument */ 848/* Return 1 if name is reserved or already used by another argument */
@@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv)
757 pr_info("Delete command needs an event name.\n"); 925 pr_info("Delete command needs an event name.\n");
758 return -EINVAL; 926 return -EINVAL;
759 } 927 }
928 mutex_lock(&probe_lock);
760 tp = find_probe_event(event, group); 929 tp = find_probe_event(event, group);
761 if (!tp) { 930 if (!tp) {
931 mutex_unlock(&probe_lock);
762 pr_info("Event %s/%s doesn't exist.\n", group, event); 932 pr_info("Event %s/%s doesn't exist.\n", group, event);
763 return -ENOENT; 933 return -ENOENT;
764 } 934 }
765 /* delete an event */ 935 /* delete an event */
766 unregister_trace_probe(tp); 936 unregister_trace_probe(tp);
767 free_trace_probe(tp); 937 free_trace_probe(tp);
938 mutex_unlock(&probe_lock);
768 return 0; 939 return 0;
769 } 940 }
770 941
@@ -1043,6 +1214,54 @@ static const struct file_operations kprobe_profile_ops = {
1043 .release = seq_release, 1214 .release = seq_release,
1044}; 1215};
1045 1216
1217/* Sum up total data length for dynamic arrays (strings) */
1218static __kprobes int __get_data_size(struct trace_probe *tp,
1219 struct pt_regs *regs)
1220{
1221 int i, ret = 0;
1222 u32 len;
1223
1224 for (i = 0; i < tp->nr_args; i++)
1225 if (unlikely(tp->args[i].fetch_size.fn)) {
1226 call_fetch(&tp->args[i].fetch_size, regs, &len);
1227 ret += len;
1228 }
1229
1230 return ret;
1231}
1232
1233/* Store the value of each argument */
1234static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
1235 struct pt_regs *regs,
1236 u8 *data, int maxlen)
1237{
1238 int i;
1239 u32 end = tp->size;
1240 u32 *dl; /* Data (relative) location */
1241
1242 for (i = 0; i < tp->nr_args; i++) {
1243 if (unlikely(tp->args[i].fetch_size.fn)) {
1244 /*
1245 * First, we set the relative location and
1246 * maximum data length to *dl
1247 */
1248 dl = (u32 *)(data + tp->args[i].offset);
1249 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
1250 /* Then try to fetch string or dynamic array data */
1251 call_fetch(&tp->args[i].fetch, regs, dl);
1252 /* Reduce maximum length */
1253 end += get_rloc_len(*dl);
1254 maxlen -= get_rloc_len(*dl);
1255 /* Trick here, convert data_rloc to data_loc */
1256 *dl = convert_rloc_to_loc(*dl,
1257 ent_size + tp->args[i].offset);
1258 } else
1259 /* Just fetching data normally */
1260 call_fetch(&tp->args[i].fetch, regs,
1261 data + tp->args[i].offset);
1262 }
1263}
1264
1046/* Kprobe handler */ 1265/* Kprobe handler */
1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1266static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1048{ 1267{
@@ -1050,8 +1269,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1050 struct kprobe_trace_entry_head *entry; 1269 struct kprobe_trace_entry_head *entry;
1051 struct ring_buffer_event *event; 1270 struct ring_buffer_event *event;
1052 struct ring_buffer *buffer; 1271 struct ring_buffer *buffer;
1053 u8 *data; 1272 int size, dsize, pc;
1054 int size, i, pc;
1055 unsigned long irq_flags; 1273 unsigned long irq_flags;
1056 struct ftrace_event_call *call = &tp->call; 1274 struct ftrace_event_call *call = &tp->call;
1057 1275
@@ -1060,7 +1278,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1060 local_save_flags(irq_flags); 1278 local_save_flags(irq_flags);
1061 pc = preempt_count(); 1279 pc = preempt_count();
1062 1280
1063 size = sizeof(*entry) + tp->size; 1281 dsize = __get_data_size(tp, regs);
1282 size = sizeof(*entry) + tp->size + dsize;
1064 1283
1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1284 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1066 size, irq_flags, pc); 1285 size, irq_flags, pc);
@@ -1069,9 +1288,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1069 1288
1070 entry = ring_buffer_event_data(event); 1289 entry = ring_buffer_event_data(event);
1071 entry->ip = (unsigned long)kp->addr; 1290 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1]; 1291 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1073 for (i = 0; i < tp->nr_args; i++)
1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1075 1292
1076 if (!filter_current_check_discard(buffer, call, entry, event)) 1293 if (!filter_current_check_discard(buffer, call, entry, event))
1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1294 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1085,15 +1302,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1085 struct kretprobe_trace_entry_head *entry; 1302 struct kretprobe_trace_entry_head *entry;
1086 struct ring_buffer_event *event; 1303 struct ring_buffer_event *event;
1087 struct ring_buffer *buffer; 1304 struct ring_buffer *buffer;
1088 u8 *data; 1305 int size, pc, dsize;
1089 int size, i, pc;
1090 unsigned long irq_flags; 1306 unsigned long irq_flags;
1091 struct ftrace_event_call *call = &tp->call; 1307 struct ftrace_event_call *call = &tp->call;
1092 1308
1093 local_save_flags(irq_flags); 1309 local_save_flags(irq_flags);
1094 pc = preempt_count(); 1310 pc = preempt_count();
1095 1311
1096 size = sizeof(*entry) + tp->size; 1312 dsize = __get_data_size(tp, regs);
1313 size = sizeof(*entry) + tp->size + dsize;
1097 1314
1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1315 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1099 size, irq_flags, pc); 1316 size, irq_flags, pc);
@@ -1103,9 +1320,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1103 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1104 entry->func = (unsigned long)tp->rp.kp.addr; 1321 entry->func = (unsigned long)tp->rp.kp.addr;
1105 entry->ret_ip = (unsigned long)ri->ret_addr; 1322 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1]; 1323 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1107 for (i = 0; i < tp->nr_args; i++)
1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1109 1324
1110 if (!filter_current_check_discard(buffer, call, entry, event)) 1325 if (!filter_current_check_discard(buffer, call, entry, event))
1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1326 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1137,7 +1352,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1137 data = (u8 *)&field[1]; 1352 data = (u8 *)&field[1];
1138 for (i = 0; i < tp->nr_args; i++) 1353 for (i = 0; i < tp->nr_args; i++)
1139 if (!tp->args[i].type->print(s, tp->args[i].name, 1354 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset)) 1355 data + tp->args[i].offset, field))
1141 goto partial; 1356 goto partial;
1142 1357
1143 if (!trace_seq_puts(s, "\n")) 1358 if (!trace_seq_puts(s, "\n"))
@@ -1179,7 +1394,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1179 data = (u8 *)&field[1]; 1394 data = (u8 *)&field[1];
1180 for (i = 0; i < tp->nr_args; i++) 1395 for (i = 0; i < tp->nr_args; i++)
1181 if (!tp->args[i].type->print(s, tp->args[i].name, 1396 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset)) 1397 data + tp->args[i].offset, field))
1183 goto partial; 1398 goto partial;
1184 1399
1185 if (!trace_seq_puts(s, "\n")) 1400 if (!trace_seq_puts(s, "\n"))
@@ -1214,11 +1429,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1214 } 1429 }
1215} 1430}
1216 1431
1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1218{
1219 return 0;
1220}
1221
1222#undef DEFINE_FIELD 1432#undef DEFINE_FIELD
1223#define DEFINE_FIELD(type, item, name, is_signed) \ 1433#define DEFINE_FIELD(type, item, name, is_signed) \
1224 do { \ 1434 do { \
@@ -1239,7 +1449,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1449 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1240 /* Set argument names as fields */ 1450 /* Set argument names as fields */
1241 for (i = 0; i < tp->nr_args; i++) { 1451 for (i = 0; i < tp->nr_args; i++) {
1242 ret = trace_define_field(event_call, tp->args[i].type->name, 1452 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1243 tp->args[i].name, 1453 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset, 1454 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size, 1455 tp->args[i].type->size,
@@ -1261,7 +1471,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1471 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1262 /* Set argument names as fields */ 1472 /* Set argument names as fields */
1263 for (i = 0; i < tp->nr_args; i++) { 1473 for (i = 0; i < tp->nr_args; i++) {
1264 ret = trace_define_field(event_call, tp->args[i].type->name, 1474 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1265 tp->args[i].name, 1475 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset, 1476 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size, 1477 tp->args[i].type->size,
@@ -1301,8 +1511,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1511 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1302 1512
1303 for (i = 0; i < tp->nr_args; i++) { 1513 for (i = 0; i < tp->nr_args; i++) {
1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", 1514 if (strcmp(tp->args[i].type->name, "string") == 0)
1305 tp->args[i].name); 1515 pos += snprintf(buf + pos, LEN_OR_ZERO,
1516 ", __get_str(%s)",
1517 tp->args[i].name);
1518 else
1519 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1520 tp->args[i].name);
1306 } 1521 }
1307 1522
1308#undef LEN_OR_ZERO 1523#undef LEN_OR_ZERO
@@ -1339,11 +1554,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1339 struct ftrace_event_call *call = &tp->call; 1554 struct ftrace_event_call *call = &tp->call;
1340 struct kprobe_trace_entry_head *entry; 1555 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head; 1556 struct hlist_head *head;
1342 u8 *data; 1557 int size, __size, dsize;
1343 int size, __size, i;
1344 int rctx; 1558 int rctx;
1345 1559
1346 __size = sizeof(*entry) + tp->size; 1560 dsize = __get_data_size(tp, regs);
1561 __size = sizeof(*entry) + tp->size + dsize;
1347 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1562 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1348 size -= sizeof(u32); 1563 size -= sizeof(u32);
1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1564 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1355,9 +1570,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1355 return; 1570 return;
1356 1571
1357 entry->ip = (unsigned long)kp->addr; 1572 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1]; 1573 memset(&entry[1], 0, dsize);
1359 for (i = 0; i < tp->nr_args; i++) 1574 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1361 1575
1362 head = this_cpu_ptr(call->perf_events); 1576 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1577 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
@@ -1371,11 +1585,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1371 struct ftrace_event_call *call = &tp->call; 1585 struct ftrace_event_call *call = &tp->call;
1372 struct kretprobe_trace_entry_head *entry; 1586 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head; 1587 struct hlist_head *head;
1374 u8 *data; 1588 int size, __size, dsize;
1375 int size, __size, i;
1376 int rctx; 1589 int rctx;
1377 1590
1378 __size = sizeof(*entry) + tp->size; 1591 dsize = __get_data_size(tp, regs);
1592 __size = sizeof(*entry) + tp->size + dsize;
1379 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1593 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1380 size -= sizeof(u32); 1594 size -= sizeof(u32);
1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1595 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1388,9 +1602,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1388 1602
1389 entry->func = (unsigned long)tp->rp.kp.addr; 1603 entry->func = (unsigned long)tp->rp.kp.addr;
1390 entry->ret_ip = (unsigned long)ri->ret_addr; 1604 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1]; 1605 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1392 for (i = 0; i < tp->nr_args; i++)
1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1394 1606
1395 head = this_cpu_ptr(call->perf_events); 1607 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1608 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
@@ -1486,15 +1698,12 @@ static int register_probe_event(struct trace_probe *tp)
1486 int ret; 1698 int ret;
1487 1699
1488 /* Initialize ftrace_event_call */ 1700 /* Initialize ftrace_event_call */
1701 INIT_LIST_HEAD(&call->class->fields);
1489 if (probe_is_return(tp)) { 1702 if (probe_is_return(tp)) {
1490 INIT_LIST_HEAD(&call->class->fields);
1491 call->event.funcs = &kretprobe_funcs; 1703 call->event.funcs = &kretprobe_funcs;
1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields; 1704 call->class->define_fields = kretprobe_event_define_fields;
1494 } else { 1705 } else {
1495 INIT_LIST_HEAD(&call->class->fields);
1496 call->event.funcs = &kprobe_funcs; 1706 call->event.funcs = &kprobe_funcs;
1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields; 1707 call->class->define_fields = kprobe_event_define_fields;
1499 } 1708 }
1500 if (set_print_fmt(tp) < 0) 1709 if (set_print_fmt(tp) < 0)
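The trace_kprobe.c hunks above add dynamic string support: __get_data_size() first sums the extra bytes the strings will need, and store_trace_args() then copies each string into the area behind the fixed-size argument slots, leaving a packed u32 "data location" word in the slot itself. The make_data_rloc()/get_rloc_len()/convert_rloc_to_loc() helpers it calls are defined earlier in trace_kprobe.c and are not part of this diff; the stand-alone sketch below only assumes the conventional packing (length in the high 16 bits, offset in the low 16 bits) to show how such a record would be written and read back.

/*
 * Stand-alone model of the "data location" word used for string
 * arguments above. The 16/16 split mirrors the kernel's
 * make_data_rloc()/get_rloc_len() helpers (not shown in this hunk);
 * offsets here are taken from the start of the record for simplicity,
 * where the kernel stores entry-relative offsets and converts them
 * at the end of store_trace_args().
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t make_data_loc(uint32_t len, uint32_t offs)
{
	return (len << 16) | (offs & 0xffff);
}

static uint32_t get_loc_len(uint32_t dl)  { return dl >> 16; }
static uint32_t get_loc_offs(uint32_t dl) { return dl & 0xffff; }

int main(void)
{
	uint8_t record[64] = { 0 };          /* fake event record     */
	const char *src = "hello";
	uint32_t fixed = sizeof(uint32_t);   /* one u32 argument slot */
	uint32_t len = (uint32_t)strlen(src) + 1;
	uint32_t dl;

	/* "Fetch": copy the string into the dynamic area behind the slot */
	memcpy(record + fixed, src, len);
	/* ... and record its length and offset in the slot itself. */
	dl = make_data_loc(len, fixed);
	memcpy(record, &dl, sizeof(dl));

	/* A reader unpacks the word to find the string again. */
	memcpy(&dl, record, sizeof(dl));
	printf("len=%u offs=%u str=%s\n", get_loc_len(dl),
	       get_loc_offs(dl), (char *)record + get_loc_offs(dl));
	return 0;
}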
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37#define KSYM_TRACER_OP_LEN 3 /* rw- */
38
39struct trace_ksym {
40 struct perf_event **ksym_hbp;
41 struct perf_event_attr attr;
42#ifdef CONFIG_PROFILE_KSYM_TRACER
43 atomic64_t counter;
44#endif
45 struct hlist_node ksym_hlist;
46};
47
48static struct trace_array *ksym_trace_array;
49
50static unsigned int ksym_tracing_enabled;
51
52static HLIST_HEAD(ksym_filter_head);
53
54static DEFINE_MUTEX(ksym_tracer_mutex);
55
56#ifdef CONFIG_PROFILE_KSYM_TRACER
57
58#define MAX_UL_INT 0xffffffff
59
60void ksym_collect_stats(unsigned long hbp_hit_addr)
61{
62 struct hlist_node *node;
63 struct trace_ksym *entry;
64
65 rcu_read_lock();
66 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
67 if (entry->attr.bp_addr == hbp_hit_addr) {
68 atomic64_inc(&entry->counter);
69 break;
70 }
71 }
72 rcu_read_unlock();
73}
74#endif /* CONFIG_PROFILE_KSYM_TRACER */
75
76void ksym_hbp_handler(struct perf_event *hbp, int nmi,
77 struct perf_sample_data *data,
78 struct pt_regs *regs)
79{
80 struct ring_buffer_event *event;
81 struct ksym_trace_entry *entry;
82 struct ring_buffer *buffer;
83 int pc;
84
85 if (!ksym_tracing_enabled)
86 return;
87
88 buffer = ksym_trace_array->buffer;
89
90 pc = preempt_count();
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
93 sizeof(*entry), 0, pc);
94 if (!event)
95 return;
96
97 entry = ring_buffer_event_data(event);
98 entry->ip = instruction_pointer(regs);
99 entry->type = hw_breakpoint_type(hbp);
100 entry->addr = hw_breakpoint_addr(hbp);
101 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
102
103#ifdef CONFIG_PROFILE_KSYM_TRACER
104 ksym_collect_stats(hw_breakpoint_addr(hbp));
105#endif /* CONFIG_PROFILE_KSYM_TRACER */
106
107 trace_buffer_unlock_commit(buffer, event, 0, pc);
108}
109
110/* Valid access types are represented as
111 *
112 * rw- : Set Read/Write Access Breakpoint
113 * -w- : Set Write Access Breakpoint
114 * --- : Clear Breakpoints
115 * --x : Set Execution Break points (Not available yet)
116 *
117 */
118static int ksym_trace_get_access_type(char *str)
119{
120 int access = 0;
121
122 if (str[0] == 'r')
123 access |= HW_BREAKPOINT_R;
124
125 if (str[1] == 'w')
126 access |= HW_BREAKPOINT_W;
127
128 if (str[2] == 'x')
129 access |= HW_BREAKPOINT_X;
130
131 switch (access) {
132 case HW_BREAKPOINT_R:
133 case HW_BREAKPOINT_W:
134 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
135 return access;
136 default:
137 return -EINVAL;
138 }
139}
140
141/*
142 * There can be several possible malformed requests and we attempt to capture
143 * all of them. We enumerate some of the rules
144 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
145 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
146 * <module>:<ksym_name>:<op>.
147 * 2. No delimiter symbol ':' in the input string
148 * 3. Spurious operator symbols or symbols not in their respective positions
149 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
150 * 5. Kernel symbol not a part of /proc/kallsyms
151 * 6. Duplicate requests
152 */
153static int parse_ksym_trace_str(char *input_string, char **ksymname,
154 unsigned long *addr)
155{
156 int ret;
157
158 *ksymname = strsep(&input_string, ":");
159 *addr = kallsyms_lookup_name(*ksymname);
160
161 /* Check for malformed request: (2), (1) and (5) */
162 if ((!input_string) ||
163 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
164 (*addr == 0))
165 return -EINVAL;;
166
167 ret = ksym_trace_get_access_type(input_string);
168
169 return ret;
170}
171
172int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
173{
174 struct trace_ksym *entry;
175 int ret = -ENOMEM;
176
177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
178 if (!entry)
179 return -ENOMEM;
180
181 hw_breakpoint_init(&entry->attr);
182
183 entry->attr.bp_type = op;
184 entry->attr.bp_addr = addr;
185 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
186
187 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
188 ksym_hbp_handler);
189
190 if (IS_ERR(entry->ksym_hbp)) {
191 ret = PTR_ERR(entry->ksym_hbp);
192 if (ret == -ENOSPC) {
193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
199 goto err;
200 }
201
202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
203
204 return 0;
205
206err:
207 kfree(entry);
208
209 return ret;
210}
211
212static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
213 size_t count, loff_t *ppos)
214{
215 struct trace_ksym *entry;
216 struct hlist_node *node;
217 struct trace_seq *s;
218 ssize_t cnt = 0;
219 int ret;
220
221 s = kmalloc(sizeof(*s), GFP_KERNEL);
222 if (!s)
223 return -ENOMEM;
224 trace_seq_init(s);
225
226 mutex_lock(&ksym_tracer_mutex);
227
228 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
229 ret = trace_seq_printf(s, "%pS:",
230 (void *)(unsigned long)entry->attr.bp_addr);
231 if (entry->attr.bp_type == HW_BREAKPOINT_R)
232 ret = trace_seq_puts(s, "r--\n");
233 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
234 ret = trace_seq_puts(s, "-w-\n");
235 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
236 ret = trace_seq_puts(s, "rw-\n");
237 WARN_ON_ONCE(!ret);
238 }
239
240 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
241
242 mutex_unlock(&ksym_tracer_mutex);
243
244 kfree(s);
245
246 return cnt;
247}
248
249static void __ksym_trace_reset(void)
250{
251 struct trace_ksym *entry;
252 struct hlist_node *node, *node1;
253
254 mutex_lock(&ksym_tracer_mutex);
255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
256 ksym_hlist) {
257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
258 hlist_del_rcu(&(entry->ksym_hlist));
259 synchronize_rcu();
260 kfree(entry);
261 }
262 mutex_unlock(&ksym_tracer_mutex);
263}
264
265static ssize_t ksym_trace_filter_write(struct file *file,
266 const char __user *buffer,
267 size_t count, loff_t *ppos)
268{
269 struct trace_ksym *entry;
270 struct hlist_node *node;
271 char *buf, *input_string, *ksymname = NULL;
272 unsigned long ksym_addr = 0;
273 int ret, op, changed = 0;
274
275 buf = kzalloc(count + 1, GFP_KERNEL);
276 if (!buf)
277 return -ENOMEM;
278
279 ret = -EFAULT;
280 if (copy_from_user(buf, buffer, count))
281 goto out;
282
283 buf[count] = '\0';
284 input_string = strstrip(buf);
285
286 /*
287 * Clear all breakpoints if:
288 * 1: echo > ksym_trace_filter
289 * 2: echo 0 > ksym_trace_filter
290 * 3: echo "*:---" > ksym_trace_filter
291 */
292 if (!input_string[0] || !strcmp(input_string, "0") ||
293 !strcmp(input_string, "*:---")) {
294 __ksym_trace_reset();
295 ret = 0;
296 goto out;
297 }
298
299 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
300 if (ret < 0)
301 goto out;
302
303 mutex_lock(&ksym_tracer_mutex);
304
305 ret = -EINVAL;
306 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
307 if (entry->attr.bp_addr == ksym_addr) {
308 /* Check for malformed request: (6) */
309 if (entry->attr.bp_type != op)
310 changed = 1;
311 else
312 goto out_unlock;
313 break;
314 }
315 }
316 if (changed) {
317 unregister_wide_hw_breakpoint(entry->ksym_hbp);
318 entry->attr.bp_type = op;
319 ret = 0;
320 if (op > 0) {
321 entry->ksym_hbp =
322 register_wide_hw_breakpoint(&entry->attr,
323 ksym_hbp_handler);
324 if (IS_ERR(entry->ksym_hbp))
325 ret = PTR_ERR(entry->ksym_hbp);
326 else
327 goto out_unlock;
328 }
329 /* Error or "symbol:---" case: drop it */
330 hlist_del_rcu(&(entry->ksym_hlist));
331 synchronize_rcu();
332 kfree(entry);
333 goto out_unlock;
334 } else {
335 /* Check for malformed request: (4) */
336 if (op)
337 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
338 }
339out_unlock:
340 mutex_unlock(&ksym_tracer_mutex);
341out:
342 kfree(buf);
343 return !ret ? count : ret;
344}
345
346static const struct file_operations ksym_tracing_fops = {
347 .open = tracing_open_generic,
348 .read = ksym_trace_filter_read,
349 .write = ksym_trace_filter_write,
350};
351
352static void ksym_trace_reset(struct trace_array *tr)
353{
354 ksym_tracing_enabled = 0;
355 __ksym_trace_reset();
356}
357
358static int ksym_trace_init(struct trace_array *tr)
359{
360 int cpu, ret = 0;
361
362 for_each_online_cpu(cpu)
363 tracing_reset(tr, cpu);
364 ksym_tracing_enabled = 1;
365 ksym_trace_array = tr;
366
367 return ret;
368}
369
370static void ksym_trace_print_header(struct seq_file *m)
371{
372 seq_puts(m,
373 "# TASK-PID CPU# Symbol "
374 "Type Function\n");
375 seq_puts(m,
376 "# | | | "
377 " | |\n");
378}
379
380static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
381{
382 struct trace_entry *entry = iter->ent;
383 struct trace_seq *s = &iter->seq;
384 struct ksym_trace_entry *field;
385 char str[KSYM_SYMBOL_LEN];
386 int ret;
387
388 if (entry->type != TRACE_KSYM)
389 return TRACE_TYPE_UNHANDLED;
390
391 trace_assign_type(field, entry);
392
393 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
394 entry->pid, iter->cpu, (char *)field->addr);
395 if (!ret)
396 return TRACE_TYPE_PARTIAL_LINE;
397
398 switch (field->type) {
399 case HW_BREAKPOINT_R:
400 ret = trace_seq_printf(s, " R ");
401 break;
402 case HW_BREAKPOINT_W:
403 ret = trace_seq_printf(s, " W ");
404 break;
405 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
406 ret = trace_seq_printf(s, " RW ");
407 break;
408 default:
409 return TRACE_TYPE_PARTIAL_LINE;
410 }
411
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 sprint_symbol(str, field->ip);
416 ret = trace_seq_printf(s, "%s\n", str);
417 if (!ret)
418 return TRACE_TYPE_PARTIAL_LINE;
419
420 return TRACE_TYPE_HANDLED;
421}
422
423struct tracer ksym_tracer __read_mostly =
424{
425 .name = "ksym_tracer",
426 .init = ksym_trace_init,
427 .reset = ksym_trace_reset,
428#ifdef CONFIG_FTRACE_SELFTEST
429 .selftest = trace_selftest_startup_ksym,
430#endif
431 .print_header = ksym_trace_print_header,
432 .print_line = ksym_trace_output
433};
434
435#ifdef CONFIG_PROFILE_KSYM_TRACER
436static int ksym_profile_show(struct seq_file *m, void *v)
437{
438 struct hlist_node *node;
439 struct trace_ksym *entry;
440 int access_type = 0;
441 char fn_name[KSYM_NAME_LEN];
442
443 seq_puts(m, " Access Type ");
444 seq_puts(m, " Symbol Counter\n");
445 seq_puts(m, " ----------- ");
446 seq_puts(m, " ------ -------\n");
447
448 rcu_read_lock();
449 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
450
451 access_type = entry->attr.bp_type;
452
453 switch (access_type) {
454 case HW_BREAKPOINT_R:
455 seq_puts(m, " R ");
456 break;
457 case HW_BREAKPOINT_W:
458 seq_puts(m, " W ");
459 break;
460 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
461 seq_puts(m, " RW ");
462 break;
463 default:
464 seq_puts(m, " NA ");
465 }
466
467 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
468 seq_printf(m, " %-36s", fn_name);
469 else
470 seq_printf(m, " %-36s", "<NA>");
471 seq_printf(m, " %15llu\n",
472 (unsigned long long)atomic64_read(&entry->counter));
473 }
474 rcu_read_unlock();
475
476 return 0;
477}
478
479static int ksym_profile_open(struct inode *node, struct file *file)
480{
481 return single_open(file, ksym_profile_show, NULL);
482}
483
484static const struct file_operations ksym_profile_fops = {
485 .open = ksym_profile_open,
486 .read = seq_read,
487 .llseek = seq_lseek,
488 .release = single_release,
489};
490#endif /* CONFIG_PROFILE_KSYM_TRACER */
491
492__init static int init_ksym_trace(void)
493{
494 struct dentry *d_tracer;
495
496 d_tracer = tracing_init_dentry();
497
498 trace_create_file("ksym_trace_filter", 0644, d_tracer,
499 NULL, &ksym_tracing_fops);
500
501#ifdef CONFIG_PROFILE_KSYM_TRACER
502 trace_create_file("ksym_profile", 0444, d_tracer,
503 NULL, &ksym_profile_fops);
504#endif
505
506 return register_tracer(&ksym_tracer);
507}
508device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 20
24static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = {
1069 .funcs = &trace_wake_funcs, 1066 .funcs = &trace_wake_funcs,
1070}; 1067};
1071 1068
1072/* TRACE_SPECIAL */
1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1074 int flags, struct trace_event *event)
1075{
1076 struct special_entry *field;
1077
1078 trace_assign_type(field, iter->ent);
1079
1080 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
1081 field->arg1,
1082 field->arg2,
1083 field->arg3))
1084 return TRACE_TYPE_PARTIAL_LINE;
1085
1086 return TRACE_TYPE_HANDLED;
1087}
1088
1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1090 int flags, struct trace_event *event)
1091{
1092 struct special_entry *field;
1093 struct trace_seq *s = &iter->seq;
1094
1095 trace_assign_type(field, iter->ent);
1096
1097 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1098 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1099 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1100
1101 return TRACE_TYPE_HANDLED;
1102}
1103
1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1105 int flags, struct trace_event *event)
1106{
1107 struct special_entry *field;
1108 struct trace_seq *s = &iter->seq;
1109
1110 trace_assign_type(field, iter->ent);
1111
1112 SEQ_PUT_FIELD_RET(s, field->arg1);
1113 SEQ_PUT_FIELD_RET(s, field->arg2);
1114 SEQ_PUT_FIELD_RET(s, field->arg3);
1115
1116 return TRACE_TYPE_HANDLED;
1117}
1118
1119static struct trace_event_functions trace_special_funcs = {
1120 .trace = trace_special_print,
1121 .raw = trace_special_print,
1122 .hex = trace_special_hex,
1123 .binary = trace_special_bin,
1124};
1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1131/* TRACE_STACK */ 1069/* TRACE_STACK */
1132 1070
1133static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1071static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1161 1099
1162static struct trace_event_functions trace_stack_funcs = { 1100static struct trace_event_functions trace_stack_funcs = {
1163 .trace = trace_stack_print, 1101 .trace = trace_stack_print,
1164 .raw = trace_special_print,
1165 .hex = trace_special_hex,
1166 .binary = trace_special_bin,
1167}; 1102};
1168 1103
1169static struct trace_event trace_stack_event = { 1104static struct trace_event trace_stack_event = {
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1194 1129
1195static struct trace_event_functions trace_user_stack_funcs = { 1130static struct trace_event_functions trace_user_stack_funcs = {
1196 .trace = trace_user_stack_print, 1131 .trace = trace_user_stack_print,
1197 .raw = trace_special_print,
1198 .hex = trace_special_hex,
1199 .binary = trace_special_bin,
1200}; 1132};
1201 1133
1202static struct trace_event trace_user_stack_event = { 1134static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = {
1314 &trace_fn_event, 1246 &trace_fn_event,
1315 &trace_ctx_event, 1247 &trace_ctx_event,
1316 &trace_wake_event, 1248 &trace_wake_event,
1317 &trace_special_event,
1318 &trace_stack_event, 1249 &trace_stack_event,
1319 &trace_user_stack_event, 1250 &trace_user_stack_event,
1320 &trace_bprint_event, 1251 &trace_bprint_event,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..4086eae6e81b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
46 struct trace_array_cpu *data; 46 struct trace_array_cpu *data;
47 unsigned long flags; 47 unsigned long flags;
48 long disabled; 48 long disabled;
49 int resched;
50 int cpu; 49 int cpu;
51 int pc; 50 int pc;
52 51
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
54 return; 53 return;
55 54
56 pc = preempt_count(); 55 pc = preempt_count();
57 resched = ftrace_preempt_disable(); 56 preempt_disable_notrace();
58 57
59 cpu = raw_smp_processor_id(); 58 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 59 if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
74 out: 73 out:
75 atomic_dec(&data->disabled); 74 atomic_dec(&data->disabled);
76 out_enable: 75 out_enable:
77 ftrace_preempt_enable(resched); 76 preempt_enable_notrace();
78} 77}
79 78
80static struct ftrace_ops trace_ops __read_mostly = 79static struct ftrace_ops trace_ops __read_mostly =
@@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
383#ifdef CONFIG_FTRACE_SELFTEST 382#ifdef CONFIG_FTRACE_SELFTEST
384 .selftest = trace_selftest_startup_wakeup, 383 .selftest = trace_selftest_startup_wakeup,
385#endif 384#endif
385 .use_max_tr = 1,
386}; 386};
387 387
388static struct tracer wakeup_rt_tracer __read_mostly = 388static struct tracer wakeup_rt_tracer __read_mostly =
@@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
397#ifdef CONFIG_FTRACE_SELFTEST 397#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 398 .selftest = trace_selftest_startup_wakeup,
399#endif 399#endif
400 .use_max_tr = 1,
400}; 401};
401 402
402__init static int init_wakeup_tracer(void) 403__init static int init_wakeup_tracer(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_WAKE: 13 case TRACE_WAKE:
14 case TRACE_STACK: 14 case TRACE_STACK:
15 case TRACE_PRINT: 15 case TRACE_PRINT:
16 case TRACE_SPECIAL:
17 case TRACE_BRANCH: 16 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
20 case TRACE_KSYM:
21 return 1; 19 return 1;
22 } 20 }
23 return 0; 21 return 0;
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
691} 689}
692#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 690#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
693 691
694#ifdef CONFIG_SYSPROF_TRACER
695int
696trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
697{
698 unsigned long count;
699 int ret;
700
701 /* start the tracing */
702 ret = tracer_init(trace, tr);
703 if (ret) {
704 warn_failed_init_tracer(trace, ret);
705 return ret;
706 }
707
708 /* Sleep for a 1/10 of a second */
709 msleep(100);
710 /* stop the tracing. */
711 tracing_stop();
712 /* check the trace buffer */
713 ret = trace_test_buffer(tr, &count);
714 trace->reset(tr);
715 tracing_start();
716
717 if (!ret && !count) {
718 printk(KERN_CONT ".. no entries found ..");
719 ret = -1;
720 }
721
722 return ret;
723}
724#endif /* CONFIG_SYSPROF_TRACER */
725
726#ifdef CONFIG_BRANCH_TRACER 692#ifdef CONFIG_BRANCH_TRACER
727int 693int
728trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) 694trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 721}
756#endif /* CONFIG_BRANCH_TRACER */ 722#endif /* CONFIG_BRANCH_TRACER */
757 723
758#ifdef CONFIG_KSYM_TRACER
759static int ksym_selftest_dummy;
760
761int
762trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
763{
764 unsigned long count;
765 int ret;
766
767 /* start the tracing */
768 ret = tracer_init(trace, tr);
769 if (ret) {
770 warn_failed_init_tracer(trace, ret);
771 return ret;
772 }
773
774 ksym_selftest_dummy = 0;
775 /* Register the read-write tracing request */
776
777 ret = process_new_ksym_entry("ksym_selftest_dummy",
778 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
779 (unsigned long)(&ksym_selftest_dummy));
780
781 if (ret < 0) {
782 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
783 goto ret_path;
784 }
785 /* Perform a read and a write operation over the dummy variable to
786 * trigger the tracer
787 */
788 if (ksym_selftest_dummy == 0)
789 ksym_selftest_dummy++;
790
791 /* stop the tracing. */
792 tracing_stop();
793 /* check the trace buffer */
794 ret = trace_test_buffer(tr, &count);
795 trace->reset(tr);
796 tracing_start();
797
798 /* read & write operations - one each is performed on the dummy variable
799 * triggering two entries in the trace buffer
800 */
801 if (!ret && count != 2) {
802 printk(KERN_CONT "Ksym tracer startup test failed");
803 ret = -1;
804 }
805
806ret_path:
807 return ret;
808}
809#endif /* CONFIG_KSYM_TRACER */
810
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..a6b7e0e0f3eb 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
@@ -249,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i)
249{ 249{
250 unsigned long addr = stack_dump_trace[i]; 250 unsigned long addr = stack_dump_trace[i];
251 251
252 return seq_printf(m, "%pF\n", (void *)addr); 252 return seq_printf(m, "%pS\n", (void *)addr);
253} 253}
254 254
255static void print_disabled(struct seq_file *m) 255static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
26static struct list_head * 29static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call) 30syscall_get_enter_fields(struct ftrace_event_call *call)
28{ 31{
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34static struct list_head * 37static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call) 38syscall_get_exit_fields(struct ftrace_event_call *call)
36{ 39{
37 struct syscall_metadata *entry = call->data; 40 return &syscall_exit_fields;
38
39 return &entry->exit_fields;
40} 41}
41 42
42struct trace_event_functions enter_syscall_print_funcs = { 43struct trace_event_functions enter_syscall_print_funcs = {
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
97};
98
99static int
100trace_kernel(struct pt_regs *regs, struct trace_array *tr,
101 struct trace_array_cpu *data)
102{
103 struct backtrace_info info;
104 unsigned long bp;
105 char *stack;
106
107 info.tr = tr;
108 info.data = data;
109 info.pos = 1;
110
111 __trace_special(info.tr, info.data, 1, regs->ip, 0);
112
113 stack = ((char *)regs + sizeof(struct pt_regs));
114#ifdef CONFIG_FRAME_POINTER
115 bp = regs->bp;
116#else
117 bp = 0;
118#endif
119
120 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
121
122 return info.pos;
123}
124
125static void timer_notify(struct pt_regs *regs, int cpu)
126{
127 struct trace_array_cpu *data;
128 struct stack_frame frame;
129 struct trace_array *tr;
130 const void __user *fp;
131 int is_user;
132 int i;
133
134 if (!regs)
135 return;
136
137 tr = sysprof_trace;
138 data = tr->data[cpu];
139 is_user = user_mode(regs);
140
141 if (!current || current->pid == 0)
142 return;
143
144 if (is_user && current->state != TASK_RUNNING)
145 return;
146
147 __trace_special(tr, data, 0, 0, current->pid);
148
149 if (!is_user)
150 i = trace_kernel(regs, tr, data);
151 else
152 i = 0;
153
154 /*
155 * Trace user stack if we are not a kernel thread
156 */
157 if (current->mm && i < sample_max_depth) {
158 regs = (struct pt_regs *)current->thread.sp0 - 1;
159
160 fp = (void __user *)regs->bp;
161
162 __trace_special(tr, data, 2, regs->ip, 0);
163
164 while (i < sample_max_depth) {
165 frame.next_fp = NULL;
166 frame.return_address = 0;
167 if (!copy_stack_frame(fp, &frame))
168 break;
169 if ((unsigned long)fp < regs->sp)
170 break;
171
172 __trace_special(tr, data, 2, frame.return_address,
173 (unsigned long)fp);
174 fp = frame.next_fp;
175
176 i++;
177 }
178
179 }
180
181 /*
182 * Special trace entry if we overflow the max depth:
183 */
184 if (i == sample_max_depth)
185 __trace_special(tr, data, -1, -1, -1);
186
187 __trace_special(tr, data, 3, current->pid, i);
188}
189
190static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
191{
192 /* trace here */
193 timer_notify(get_irq_regs(), smp_processor_id());
194
195 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
196
197 return HRTIMER_RESTART;
198}
199
200static void start_stack_timer(void *unused)
201{
202 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
203
204 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 hrtimer->function = stack_trace_timer_fn;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
208 HRTIMER_MODE_REL_PINNED);
209}
210
211static void start_stack_timers(void)
212{
213 on_each_cpu(start_stack_timer, NULL, 1);
214}
215
216static void stop_stack_timer(int cpu)
217{
218 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
219
220 hrtimer_cancel(hrtimer);
221}
222
223static void stop_stack_timers(void)
224{
225 int cpu;
226
227 for_each_online_cpu(cpu)
228 stop_stack_timer(cpu);
229}
230
231static void stop_stack_trace(struct trace_array *tr)
232{
233 mutex_lock(&sample_timer_lock);
234 stop_stack_timers();
235 tracer_enabled = 0;
236 mutex_unlock(&sample_timer_lock);
237}
238
239static int stack_trace_init(struct trace_array *tr)
240{
241 sysprof_trace = tr;
242
243 tracing_start_cmdline_record();
244
245 mutex_lock(&sample_timer_lock);
246 start_stack_timers();
247 tracer_enabled = 1;
248 mutex_unlock(&sample_timer_lock);
249 return 0;
250}
251
252static void stack_trace_reset(struct trace_array *tr)
253{
254 tracing_stop_cmdline_record();
255 stop_stack_trace(tr);
256}
257
258static struct tracer stack_trace __read_mostly =
259{
260 .name = "sysprof",
261 .init = stack_trace_init,
262 .reset = stack_trace_reset,
263#ifdef CONFIG_FTRACE_SELFTEST
264 .selftest = trace_selftest_startup_sysprof,
265#endif
266};
267
268__init static int init_stack_trace(void)
269{
270 return register_tracer(&stack_trace);
271}
272device_initcall(init_stack_trace);
273
274#define MAX_LONG_DIGITS 22
275
276static ssize_t
277sysprof_sample_read(struct file *filp, char __user *ubuf,
278 size_t cnt, loff_t *ppos)
279{
280 char buf[MAX_LONG_DIGITS];
281 int r;
282
283 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
284
285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
286}
287
288static ssize_t
289sysprof_sample_write(struct file *filp, const char __user *ubuf,
290 size_t cnt, loff_t *ppos)
291{
292 char buf[MAX_LONG_DIGITS];
293 unsigned long val;
294
295 if (cnt > MAX_LONG_DIGITS-1)
296 cnt = MAX_LONG_DIGITS-1;
297
298 if (copy_from_user(&buf, ubuf, cnt))
299 return -EFAULT;
300
301 buf[cnt] = 0;
302
303 val = simple_strtoul(buf, NULL, 10);
304 /*
305 * Enforce a minimum sample period of 100 usecs:
306 */
307 if (val < 100)
308 val = 100;
309
310 mutex_lock(&sample_timer_lock);
311 stop_stack_timers();
312 sample_period = val * 1000;
313 start_stack_timers();
314 mutex_unlock(&sample_timer_lock);
315
316 return cnt;
317}
318
319static const struct file_operations sysprof_sample_fops = {
320 .read = sysprof_sample_read,
321 .write = sysprof_sample_write,
322};
323
324void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
325{
326
327 trace_create_file("sysprof_sample_period", 0644,
328 d_tracer, NULL, &sysprof_sample_fops);
329}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
13 14
14/* 15/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
82 schedule_work(&ns->destroyer); 83 schedule_work(&ns->destroyer);
83} 84}
84EXPORT_SYMBOL(free_user_ns); 85EXPORT_SYMBOL(free_user_ns);
86
87uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
88{
89 struct user_namespace *tmp;
90
91 if (likely(to == cred->user->user_ns))
92 return uid;
93
94
95 /* Is cred->user the creator of the target user_ns
96	 * or the creator of one of its parents?
97 */
98 for ( tmp = to; tmp != &init_user_ns;
99 tmp = tmp->creator->user_ns ) {
100 if (cred->user == tmp->creator) {
101 return (uid_t)0;
102 }
103 }
104
105 /* No useful relationship so no mapping */
106 return overflowuid;
107}
108
109gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
110{
111 struct user_namespace *tmp;
112
113 if (likely(to == cred->user->user_ns))
114 return gid;
115
116 /* Is cred->user the creator of the target user_ns
117	 * or the creator of one of its parents?
118 */
119 for ( tmp = to; tmp != &init_user_ns;
120 tmp = tmp->creator->user_ns ) {
121 if (cred->user == tmp->creator) {
122 return (gid_t)0;
123 }
124 }
125
126 /* No useful relationship so no mapping */
127 return overflowgid;
128}
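The user_ns_map_uid()/user_ns_map_gid() helpers added above resolve an id for a viewer in a different user namespace: identical namespaces map the id through unchanged, a viewer that created the target namespace (or one of its ancestors) sees id 0, and any unrelated viewer gets the overflow id. A small user-space model of that creator walk follows; the toy struct names stand in for the kernel's user_struct and user_namespace, and the parent pointer plays the role of creator->user_ns.

#include <stdio.h>

#define OVERFLOWUID 65534u

/* Toy stand-ins for the kernel's user_struct / user_namespace. */
struct user;
struct user_ns {
	struct user *creator;       /* user that created this namespace  */
	struct user_ns *parent;     /* creator->user_ns in the real code */
};
struct user { struct user_ns *ns; };

static struct user_ns init_ns;      /* plays the role of init_user_ns */

static unsigned int map_uid(struct user_ns *to, struct user *asker,
			    unsigned int uid)
{
	struct user_ns *tmp;

	if (to == asker->ns)            /* same namespace: identity map */
		return uid;

	/* Did the asker create "to" or one of its ancestors? */
	for (tmp = to; tmp != &init_ns; tmp = tmp->parent)
		if (asker == tmp->creator)
			return 0;       /* asker is root from "to"'s view */

	return OVERFLOWUID;             /* no relationship: overflow uid */
}

int main(void)
{
	struct user root = { &init_ns };
	struct user_ns child_ns = { .creator = &root, .parent = &init_ns };
	struct user child_user = { &child_ns };

	printf("%u\n", map_uid(&child_ns, &child_user, 1000)); /* 1000  */
	printf("%u\n", map_uid(&child_ns, &root, 1000));       /* 0     */
	printf("%u\n", map_uid(&init_ns, &child_user, 1000));  /* 65534 */
	return 0;
}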
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..0d53c8e853b1
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,570 @@
1/*
2 * Detect hard and soft lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * this code detects hard lockups: incidents in which a CPU
7 * does not respond to anything except NMI.
8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well.
13 */
14
15#include <linux/mm.h>
16#include <linux/cpu.h>
17#include <linux/nmi.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/freezer.h>
21#include <linux/kthread.h>
22#include <linux/lockdep.h>
23#include <linux/notifier.h>
24#include <linux/module.h>
25#include <linux/sysctl.h>
26
27#include <asm/irq_regs.h>
28#include <linux/perf_event.h>
29
30int watchdog_enabled;
31int __read_mostly softlockup_thresh = 60;
32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif
45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog;
48
49
50/* boot commands */
51/*
52 * Should we panic when a soft-lockup or hard-lockup occurs:
53 */
54#ifdef CONFIG_HARDLOCKUP_DETECTOR
55static int hardlockup_panic;
56
57static int __init hardlockup_panic_setup(char *str)
58{
59 if (!strncmp(str, "panic", 5))
60 hardlockup_panic = 1;
61 return 1;
62}
63__setup("nmi_watchdog=", hardlockup_panic_setup);
64#endif
65
66unsigned int __read_mostly softlockup_panic =
67 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
68
69static int __init softlockup_panic_setup(char *str)
70{
71 softlockup_panic = simple_strtoul(str, NULL, 0);
72
73 return 1;
74}
75__setup("softlockup_panic=", softlockup_panic_setup);
76
77static int __init nowatchdog_setup(char *str)
78{
79 no_watchdog = 1;
80 return 1;
81}
82__setup("nowatchdog", nowatchdog_setup);
83
84/* deprecated */
85static int __init nosoftlockup_setup(char *str)
86{
87 no_watchdog = 1;
88 return 1;
89}
90__setup("nosoftlockup", nosoftlockup_setup);
91/* */
92
93
94/*
95 * Returns seconds, approximately. We don't need nanosecond
96 * resolution, and we don't need to waste time with a big divide when
97 * 2^30ns == 1.074s.
98 */
99static unsigned long get_timestamp(int this_cpu)
100{
101 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
102}
103
104static unsigned long get_sample_period(void)
105{
106 /*
107 * convert softlockup_thresh from seconds to ns
108 * the divide by 5 is to give hrtimer 5 chances to
109 * increment before the hardlockup detector generates
110 * a warning
111 */
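	/*
	 * With the softlockup_thresh default of 60 set above, this is
	 * 60 / 5 * NSEC_PER_SEC = 12 * 10^9 ns, i.e. the per-cpu hrtimer
	 * fires every 12 seconds, five times per 60 second threshold.
	 */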
112 return softlockup_thresh / 5 * NSEC_PER_SEC;
113}
114
115/* Commands for resetting the watchdog */
116static void __touch_watchdog(void)
117{
118 int this_cpu = smp_processor_id();
119
120 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
121}
122
123void touch_softlockup_watchdog(void)
124{
125 __get_cpu_var(watchdog_touch_ts) = 0;
126}
127EXPORT_SYMBOL(touch_softlockup_watchdog);
128
129void touch_all_softlockup_watchdogs(void)
130{
131 int cpu;
132
133 /*
134 * this is done lockless
135 * do we care if a 0 races with a timestamp?
136 * all it means is the softlock check starts one cycle later
137 */
138 for_each_online_cpu(cpu)
139 per_cpu(watchdog_touch_ts, cpu) = 0;
140}
141
142#ifdef CONFIG_HARDLOCKUP_DETECTOR
143void touch_nmi_watchdog(void)
144{
145 __get_cpu_var(watchdog_nmi_touch) = true;
146 touch_softlockup_watchdog();
147}
148EXPORT_SYMBOL(touch_nmi_watchdog);
149
150#endif
151
152void touch_softlockup_watchdog_sync(void)
153{
154 __raw_get_cpu_var(softlockup_touch_sync) = true;
155 __raw_get_cpu_var(watchdog_touch_ts) = 0;
156}
157
158#ifdef CONFIG_HARDLOCKUP_DETECTOR
159/* watchdog detector functions */
160static int is_hardlockup(void)
161{
162 unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
163
164 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
165 return 1;
166
167 __get_cpu_var(hrtimer_interrupts_saved) = hrint;
168 return 0;
169}
170#endif
171
172static int is_softlockup(unsigned long touch_ts)
173{
174 unsigned long now = get_timestamp(smp_processor_id());
175
176 /* Warn about unreasonable delays: */
177 if (time_after(now, touch_ts + softlockup_thresh))
178 return now - touch_ts;
179
180 return 0;
181}
182
183static int
184watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
185{
186 did_panic = 1;
187
188 return NOTIFY_DONE;
189}
190
191static struct notifier_block panic_block = {
192 .notifier_call = watchdog_panic,
193};
194
195#ifdef CONFIG_HARDLOCKUP_DETECTOR
196static struct perf_event_attr wd_hw_attr = {
197 .type = PERF_TYPE_HARDWARE,
198 .config = PERF_COUNT_HW_CPU_CYCLES,
199 .size = sizeof(struct perf_event_attr),
200 .pinned = 1,
201 .disabled = 1,
202};
203
204/* Callback function for perf event subsystem */
205void watchdog_overflow_callback(struct perf_event *event, int nmi,
206 struct perf_sample_data *data,
207 struct pt_regs *regs)
208{
209 /* Ensure the watchdog never gets throttled */
210 event->hw.interrupts = 0;
211
212 if (__get_cpu_var(watchdog_nmi_touch) == true) {
213 __get_cpu_var(watchdog_nmi_touch) = false;
214 return;
215 }
216
217 /* check for a hardlockup
218 * This is done by making sure our timer interrupt
219 * is incrementing. The timer interrupt should have
220	 * fired multiple times before we overflowed. If it hasn't
Wait, I should not have emitted this as an insert.
221 * then this is a good indication the cpu is stuck
222 */
223 if (is_hardlockup()) {
224 int this_cpu = smp_processor_id();
225
226 /* only print hardlockups once */
227 if (__get_cpu_var(hard_watchdog_warn) == true)
228 return;
229
230 if (hardlockup_panic)
231 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
232 else
233 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
234
235 __get_cpu_var(hard_watchdog_warn) = true;
236 return;
237 }
238
239 __get_cpu_var(hard_watchdog_warn) = false;
240 return;
241}
242static void watchdog_interrupt_count(void)
243{
244 __get_cpu_var(hrtimer_interrupts)++;
245}
246#else
247static inline void watchdog_interrupt_count(void) { return; }
248#endif /* CONFIG_HARDLOCKUP_DETECTOR */
249
250/* watchdog kicker functions */
251static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
252{
253 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
254 struct pt_regs *regs = get_irq_regs();
255 int duration;
256
257 /* kick the hardlockup detector */
258 watchdog_interrupt_count();
259
260 /* kick the softlockup detector */
261 wake_up_process(__get_cpu_var(softlockup_watchdog));
262
263 /* .. and repeat */
264 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
265
266 if (touch_ts == 0) {
267 if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
268 /*
269 * If the time stamp was touched atomically
270 * make sure the scheduler tick is up to date.
271 */
272 __get_cpu_var(softlockup_touch_sync) = false;
273 sched_clock_tick();
274 }
275 __touch_watchdog();
276 return HRTIMER_RESTART;
277 }
278
279 /* check for a softlockup
280 * This is done by making sure a high priority task is
281 * being scheduled. The task touches the watchdog to
282 * indicate it is getting cpu time. If it hasn't then
283 * this is a good indication some task is hogging the cpu
284 */
285 duration = is_softlockup(touch_ts);
286 if (unlikely(duration)) {
287 /* only warn once */
288 if (__get_cpu_var(soft_watchdog_warn) == true)
289 return HRTIMER_RESTART;
290
291 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
292 smp_processor_id(), duration,
293 current->comm, task_pid_nr(current));
294 print_modules();
295 print_irqtrace_events(current);
296 if (regs)
297 show_regs(regs);
298 else
299 dump_stack();
300
301 if (softlockup_panic)
302 panic("softlockup: hung tasks");
303 __get_cpu_var(soft_watchdog_warn) = true;
304 } else
305 __get_cpu_var(soft_watchdog_warn) = false;
306
307 return HRTIMER_RESTART;
308}
309
310
311/*
312 * The watchdog thread - touches the timestamp.
313 */
314static int watchdog(void *unused)
315{
316 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
317 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
318
319 sched_setscheduler(current, SCHED_FIFO, &param);
320
321 /* initialize timestamp */
322 __touch_watchdog();
323
324 /* kick off the timer for the hardlockup detector */
325 /* done here because hrtimer_start can only pin to smp_processor_id() */
326 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
327 HRTIMER_MODE_REL_PINNED);
328
329 set_current_state(TASK_INTERRUPTIBLE);
330 /*
331 * Run briefly once per second to reset the softlockup timestamp.
332 * If this gets delayed for more than 60 seconds then the
333 * debug-printout triggers in watchdog_timer_fn().
334 */
335 while (!kthread_should_stop()) {
336 __touch_watchdog();
337 schedule();
338
339 if (kthread_should_stop())
340 break;
341
342 set_current_state(TASK_INTERRUPTIBLE);
343 }
344 __set_current_state(TASK_RUNNING);
345
346 return 0;
347}
348
349
350#ifdef CONFIG_HARDLOCKUP_DETECTOR
351static int watchdog_nmi_enable(int cpu)
352{
353 struct perf_event_attr *wd_attr;
354 struct perf_event *event = per_cpu(watchdog_ev, cpu);
355
356 /* is it already setup and enabled? */
357 if (event && event->state > PERF_EVENT_STATE_OFF)
358 goto out;
359
360 /* it is setup but not enabled */
361 if (event != NULL)
362 goto out_enable;
363
364 /* Try to register using hardware perf events */
365 wd_attr = &wd_hw_attr;
366 wd_attr->sample_period = hw_nmi_get_sample_period();
367 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
368 if (!IS_ERR(event)) {
369 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
370 goto out_save;
371 }
372
373 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
374 return -1;
375
376 /* success path */
377out_save:
378 per_cpu(watchdog_ev, cpu) = event;
379out_enable:
380 perf_event_enable(per_cpu(watchdog_ev, cpu));
381out:
382 return 0;
383}
384
385static void watchdog_nmi_disable(int cpu)
386{
387 struct perf_event *event = per_cpu(watchdog_ev, cpu);
388
389 if (event) {
390 perf_event_disable(event);
391 per_cpu(watchdog_ev, cpu) = NULL;
392
393 /* should be in cleanup, but blocks oprofile */
394 perf_event_release_kernel(event);
395 }
396 return;
397}
398#else
399static int watchdog_nmi_enable(int cpu) { return 0; }
400static void watchdog_nmi_disable(int cpu) { return; }
401#endif /* CONFIG_HARDLOCKUP_DETECTOR */
402
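
The enable path above registers a pinned, per-cpu PERF_COUNT_HW_CPU_CYCLES event whose overflow callback performs the hardlockup check. A userspace analogue (not the in-kernel API) that builds the same attribute shape as wd_hw_attr and opens it with the perf_event_open(2) syscall; it typically needs root or a relaxed perf_event_paranoid setting:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        long fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.size = sizeof(attr);
        attr.pinned = 1;                /* same flags as wd_hw_attr above */
        attr.disabled = 1;

        /* pid == -1, cpu == 0: a per-cpu counter, like the watchdog's */
        fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
        printf("fd = %ld\n", fd);       /* < 0 if the PMU refuses or lacks perms */
        return 0;
}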
403/* prepare/enable/disable routines */
404static int watchdog_prepare_cpu(int cpu)
405{
406 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
407
408 WARN_ON(per_cpu(softlockup_watchdog, cpu));
409 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
410 hrtimer->function = watchdog_timer_fn;
411
412 return 0;
413}
414
415static int watchdog_enable(int cpu)
416{
417 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
418
419 /* enable the perf event */
420 if (watchdog_nmi_enable(cpu) != 0)
421 return -1;
422
423 /* create the watchdog thread */
424 if (!p) {
425 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
426 if (IS_ERR(p)) {
427 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
428 return -1;
429 }
430 kthread_bind(p, cpu);
431 per_cpu(watchdog_touch_ts, cpu) = 0;
432 per_cpu(softlockup_watchdog, cpu) = p;
433 wake_up_process(p);
434 }
435
436 return 0;
437}
438
439static void watchdog_disable(int cpu)
440{
441 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
442 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
443
444 /*
445 * cancel the timer first to stop incrementing the stats
446 * and waking up the kthread
447 */
448 hrtimer_cancel(hrtimer);
449
450 /* disable the perf event */
451 watchdog_nmi_disable(cpu);
452
453 /* stop the watchdog thread */
454 if (p) {
455 per_cpu(softlockup_watchdog, cpu) = NULL;
456 kthread_stop(p);
457 }
458
459 /* if any cpu succeeds, watchdog is considered enabled for the system */
460 watchdog_enabled = 1;
461}
462
463static void watchdog_enable_all_cpus(void)
464{
465 int cpu;
466 int result = 0;
467
468 for_each_online_cpu(cpu)
469 result += watchdog_enable(cpu);
470
471 if (result)
472 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
473
474}
475
476static void watchdog_disable_all_cpus(void)
477{
478 int cpu;
479
480 for_each_online_cpu(cpu)
481 watchdog_disable(cpu);
482
483 /* if all watchdogs are disabled, then they are disabled for the system */
484 watchdog_enabled = 0;
485}
486
487
488/* sysctl functions */
489#ifdef CONFIG_SYSCTL
490/*
491 * proc handler for /proc/sys/kernel/nmi_watchdog
492 */
493
494int proc_dowatchdog_enabled(struct ctl_table *table, int write,
495 void __user *buffer, size_t *length, loff_t *ppos)
496{
497 proc_dointvec(table, write, buffer, length, ppos);
498
499 if (watchdog_enabled)
500 watchdog_enable_all_cpus();
501 else
502 watchdog_disable_all_cpus();
503 return 0;
504}
505
506int proc_dowatchdog_thresh(struct ctl_table *table, int write,
507 void __user *buffer,
508 size_t *lenp, loff_t *ppos)
509{
510 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
511}
512#endif /* CONFIG_SYSCTL */
513
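
A standalone sketch (not part of the patch) of driving the handler above from userspace; the path follows the comment in the source and may differ between kernel versions:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/nmi_watchdog", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fputs("1\n", f);        /* 1 = enable on all cpus, 0 = disable */
        fclose(f);
        return 0;
}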
514
515/*
516 * Create/destroy watchdog threads as CPUs come and go:
517 */
518static int __cpuinit
519cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
520{
521 int hotcpu = (unsigned long)hcpu;
522
523 switch (action) {
524 case CPU_UP_PREPARE:
525 case CPU_UP_PREPARE_FROZEN:
526 if (watchdog_prepare_cpu(hotcpu))
527 return NOTIFY_BAD;
528 break;
529 case CPU_ONLINE:
530 case CPU_ONLINE_FROZEN:
531 if (watchdog_enable(hotcpu))
532 return NOTIFY_BAD;
533 break;
534#ifdef CONFIG_HOTPLUG_CPU
535 case CPU_UP_CANCELED:
536 case CPU_UP_CANCELED_FROZEN:
537 watchdog_disable(hotcpu);
538 break;
539 case CPU_DEAD:
540 case CPU_DEAD_FROZEN:
541 watchdog_disable(hotcpu);
542 break;
543#endif /* CONFIG_HOTPLUG_CPU */
544 }
545 return NOTIFY_OK;
546}
547
548static struct notifier_block __cpuinitdata cpu_nfb = {
549 .notifier_call = cpu_callback
550};
551
552static int __init spawn_watchdog_task(void)
553{
554 void *cpu = (void *)(long)smp_processor_id();
555 int err;
556
557 if (no_watchdog)
558 return 0;
559
560 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
561 WARN_ON(err == NOTIFY_BAD);
562
563 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
564 register_cpu_notifier(&cpu_nfb);
565
566 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
567
568 return 0;
569}
570early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..8bd600c020e5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,41 +33,290 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <linux/idr.h>
37
36#define CREATE_TRACE_POINTS 38#define CREATE_TRACE_POINTS
37#include <trace/events/workqueue.h> 39#include <trace/events/workqueue.h>
38 40
41#include "workqueue_sched.h"
42
43enum {
44 /* global_cwq flags */
45 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
46 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
47 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
48 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
49 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
50
51 /* worker flags */
52 WORKER_STARTED = 1 << 0, /* started */
53 WORKER_DIE = 1 << 1, /* die die die */
54 WORKER_IDLE = 1 << 2, /* is idle */
55 WORKER_PREP = 1 << 3, /* preparing to run works */
56 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
57 WORKER_REBIND = 1 << 5, /* mom is home, come back */
58 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
59 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
60
61 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
62 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
63
64 /* gcwq->trustee_state */
65 TRUSTEE_START = 0, /* start */
66 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
67 TRUSTEE_BUTCHER = 2, /* butcher workers */
68 TRUSTEE_RELEASE = 3, /* release workers */
69 TRUSTEE_DONE = 4, /* trustee is done */
70
71 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
72 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
73 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
74
75 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
76 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
77
78 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
79 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
 80	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */
81 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
82
83 /*
 84	 * Rescue workers are used only in emergencies and shared by
85 * all cpus. Give -20.
86 */
87 RESCUER_NICE_LEVEL = -20,
88};
89
39/* 90/*
40 * The per-CPU workqueue (if single thread, we always use the first 91 * Structure fields follow one of the following exclusion rules.
41 * possible cpu). 92 *
93 * I: Set during initialization and read-only afterwards.
94 *
95 * P: Preemption protected. Disabling preemption is enough and should
96 * only be modified and accessed from the local cpu.
97 *
98 * L: gcwq->lock protected. Access with gcwq->lock held.
99 *
100 * X: During normal operation, modification requires gcwq->lock and
101 * should be done only from local cpu. Either disabling preemption
102 * on local cpu or grabbing gcwq->lock is enough for read access.
103 * If GCWQ_DISASSOCIATED is set, it's identical to L.
104 *
105 * F: wq->flush_mutex protected.
106 *
107 * W: workqueue_lock protected.
42 */ 108 */
43struct cpu_workqueue_struct {
44 109
45 spinlock_t lock; 110struct global_cwq;
111
112/*
113 * The poor guys doing the actual heavy lifting. All on-duty workers
114 * are either serving the manager role, on idle list or on busy hash.
115 */
116struct worker {
117 /* on idle list while idle, on busy hash table while busy */
118 union {
119 struct list_head entry; /* L: while idle */
120 struct hlist_node hentry; /* L: while busy */
121 };
122
123 struct work_struct *current_work; /* L: work being processed */
124 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
125 struct list_head scheduled; /* L: scheduled works */
126 struct task_struct *task; /* I: worker task */
127 struct global_cwq *gcwq; /* I: the associated gcwq */
128 /* 64 bytes boundary on 64bit, 32 on 32bit */
129 unsigned long last_active; /* L: last active timestamp */
130 unsigned int flags; /* X: flags */
131 int id; /* I: worker id */
132 struct work_struct rebind_work; /* L: rebind worker to cpu */
133};
134
135/*
136 * Global per-cpu workqueue. There's one and only one for each cpu
137 * and all works are queued and processed here regardless of their
138 * target workqueues.
139 */
140struct global_cwq {
141 spinlock_t lock; /* the gcwq lock */
142 struct list_head worklist; /* L: list of pending works */
143 unsigned int cpu; /* I: the associated cpu */
144 unsigned int flags; /* L: GCWQ_* flags */
46 145
47 struct list_head worklist; 146 int nr_workers; /* L: total number of workers */
48 wait_queue_head_t more_work; 147 int nr_idle; /* L: currently idle ones */
49 struct work_struct *current_work;
50 148
51 struct workqueue_struct *wq; 149 /* workers are chained either in the idle_list or busy_hash */
52 struct task_struct *thread; 150 struct list_head idle_list; /* X: list of idle workers */
53} ____cacheline_aligned; 151 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
152 /* L: hash of busy workers */
153
154 struct timer_list idle_timer; /* L: worker idle timeout */
155 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
156
157 struct ida worker_ida; /* L: for worker IDs */
158
159 struct task_struct *trustee; /* L: for gcwq shutdown */
160 unsigned int trustee_state; /* L: trustee state */
161 wait_queue_head_t trustee_wait; /* trustee wait */
162 struct worker *first_idle; /* L: first idle worker */
163} ____cacheline_aligned_in_smp;
164
165/*
166 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
167 * work_struct->data are used for flags and thus cwqs need to be
168 * aligned at two's power of the number of flag bits.
169 */
170struct cpu_workqueue_struct {
171 struct global_cwq *gcwq; /* I: the associated gcwq */
172 struct workqueue_struct *wq; /* I: the owning workqueue */
173 int work_color; /* L: current color */
174 int flush_color; /* L: flushing color */
175 int nr_in_flight[WORK_NR_COLORS];
176 /* L: nr of in_flight works */
177 int nr_active; /* L: nr of active works */
178 int max_active; /* L: max active works */
179 struct list_head delayed_works; /* L: delayed works */
180};
181
182/*
183 * Structure used to wait for workqueue flush.
184 */
185struct wq_flusher {
186 struct list_head list; /* F: list of flushers */
187 int flush_color; /* F: flush color waiting for */
188 struct completion done; /* flush completion */
189};
190
191/*
192 * All cpumasks are assumed to be always set on UP and thus can't be
193 * used to determine whether there's something to be done.
194 */
195#ifdef CONFIG_SMP
196typedef cpumask_var_t mayday_mask_t;
197#define mayday_test_and_set_cpu(cpu, mask) \
198 cpumask_test_and_set_cpu((cpu), (mask))
199#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
200#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
201#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp))
202#define free_mayday_mask(mask) free_cpumask_var((mask))
203#else
204typedef unsigned long mayday_mask_t;
205#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
206#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
207#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
208#define alloc_mayday_mask(maskp, gfp) true
209#define free_mayday_mask(mask) do { } while (0)
210#endif
54 211
55/* 212/*
56 * The externally visible workqueue abstraction is an array of 213 * The externally visible workqueue abstraction is an array of
57 * per-CPU workqueues: 214 * per-CPU workqueues:
58 */ 215 */
59struct workqueue_struct { 216struct workqueue_struct {
60 struct cpu_workqueue_struct *cpu_wq; 217 unsigned int flags; /* I: WQ_* flags */
61 struct list_head list; 218 union {
62 const char *name; 219 struct cpu_workqueue_struct __percpu *pcpu;
63 int singlethread; 220 struct cpu_workqueue_struct *single;
64 int freezeable; /* Freeze threads during suspend */ 221 unsigned long v;
65 int rt; 222 } cpu_wq; /* I: cwq's */
223 struct list_head list; /* W: list of all workqueues */
224
225 struct mutex flush_mutex; /* protects wq flushing */
226 int work_color; /* F: current work color */
227 int flush_color; /* F: current flush color */
228 atomic_t nr_cwqs_to_flush; /* flush in progress */
229 struct wq_flusher *first_flusher; /* F: first flusher */
230 struct list_head flusher_queue; /* F: flush waiters */
231 struct list_head flusher_overflow; /* F: flush overflow list */
232
233 mayday_mask_t mayday_mask; /* cpus requesting rescue */
234 struct worker *rescuer; /* I: rescue worker */
235
236 int saved_max_active; /* W: saved cwq max_active */
237 const char *name; /* I: workqueue name */
66#ifdef CONFIG_LOCKDEP 238#ifdef CONFIG_LOCKDEP
67 struct lockdep_map lockdep_map; 239 struct lockdep_map lockdep_map;
68#endif 240#endif
69}; 241};
70 242
243struct workqueue_struct *system_wq __read_mostly;
244struct workqueue_struct *system_long_wq __read_mostly;
245struct workqueue_struct *system_nrt_wq __read_mostly;
246struct workqueue_struct *system_unbound_wq __read_mostly;
247EXPORT_SYMBOL_GPL(system_wq);
248EXPORT_SYMBOL_GPL(system_long_wq);
249EXPORT_SYMBOL_GPL(system_nrt_wq);
250EXPORT_SYMBOL_GPL(system_unbound_wq);
251
252#define for_each_busy_worker(worker, i, pos, gcwq) \
253 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
254 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
255
256static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
257 unsigned int sw)
258{
259 if (cpu < nr_cpu_ids) {
260 if (sw & 1) {
261 cpu = cpumask_next(cpu, mask);
262 if (cpu < nr_cpu_ids)
263 return cpu;
264 }
265 if (sw & 2)
266 return WORK_CPU_UNBOUND;
267 }
268 return WORK_CPU_NONE;
269}
270
271static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
272 struct workqueue_struct *wq)
273{
274 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
275}
276
277/*
278 * CPU iterators
279 *
280 * An extra gcwq is defined for an invalid cpu number
281 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
282 * specific CPU. The following iterators are similar to
283 * for_each_*_cpu() iterators but also considers the unbound gcwq.
284 *
285 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
286 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
287 * for_each_cwq_cpu() : possible CPUs for bound workqueues,
288 * WORK_CPU_UNBOUND for unbound workqueues
289 */
290#define for_each_gcwq_cpu(cpu) \
291 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
292 (cpu) < WORK_CPU_NONE; \
293 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
294
295#define for_each_online_gcwq_cpu(cpu) \
296 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
297 (cpu) < WORK_CPU_NONE; \
298 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
299
300#define for_each_cwq_cpu(cpu, wq) \
301 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
302 (cpu) < WORK_CPU_NONE; \
303 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
304
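
The iterators above walk the possible (or online) CPUs and then visit one extra slot, WORK_CPU_UNBOUND, which hosts the gcwq for unbound workqueues. A standalone model (not from the patch) of the stepping logic, using a tiny bitmap in place of a struct cpumask and hypothetical CPU_UNBOUND/CPU_NONE sentinels:

#include <stdio.h>

#define NR_CPUS     4
#define CPU_UNBOUND NR_CPUS
#define CPU_NONE    (NR_CPUS + 1)

static int next_gcwq_cpu(int cpu, unsigned int mask, unsigned int sw)
{
        if (cpu < NR_CPUS) {
                if (sw & 1) {                           /* scan the real cpus */
                        for (cpu++; cpu < NR_CPUS; cpu++)
                                if (mask & (1u << cpu))
                                        return cpu;
                }
                if (sw & 2)                             /* then the unbound slot */
                        return CPU_UNBOUND;
        }
        return CPU_NONE;
}

int main(void)
{
        unsigned int possible = 0x5;    /* cpus 0 and 2 are possible */
        int cpu;

        for (cpu = next_gcwq_cpu(-1, possible, 3); cpu < CPU_NONE;
             cpu = next_gcwq_cpu(cpu, possible, 3))
                printf("%d\n", cpu);    /* prints 0, 2, then 4 (the unbound slot) */
        return 0;
}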
305#ifdef CONFIG_LOCKDEP
306/**
307 * in_workqueue_context() - in context of specified workqueue?
308 * @wq: the workqueue of interest
309 *
310 * Checks lockdep state to see if the current task is executing from
311 * within a workqueue item. This function exists only if lockdep is
312 * enabled.
313 */
314int in_workqueue_context(struct workqueue_struct *wq)
315{
316 return lock_is_held(&wq->lockdep_map);
317}
318#endif
319
71#ifdef CONFIG_DEBUG_OBJECTS_WORK 320#ifdef CONFIG_DEBUG_OBJECTS_WORK
72 321
73static struct debug_obj_descr work_debug_descr; 322static struct debug_obj_descr work_debug_descr;
@@ -107,7 +356,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
107 * statically initialized. We just make sure that it 356 * statically initialized. We just make sure that it
108 * is tracked in the object tracker. 357 * is tracked in the object tracker.
109 */ 358 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { 359 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr); 360 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr); 361 debug_object_activate(work, &work_debug_descr);
113 return 0; 362 return 0;
@@ -181,94 +430,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
181/* Serializes the accesses to the list of workqueues. */ 430/* Serializes the accesses to the list of workqueues. */
182static DEFINE_SPINLOCK(workqueue_lock); 431static DEFINE_SPINLOCK(workqueue_lock);
183static LIST_HEAD(workqueues); 432static LIST_HEAD(workqueues);
433static bool workqueue_freezing; /* W: have wqs started freezing? */
434
435/*
436 * The almighty global cpu workqueues. nr_running is the only field
437 * which is expected to be used frequently by other cpus via
438 * try_to_wake_up(). Put it in a separate cacheline.
439 */
440static DEFINE_PER_CPU(struct global_cwq, global_cwq);
441static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
184 442
185static int singlethread_cpu __read_mostly;
186static const struct cpumask *cpu_singlethread_map __read_mostly;
187/* 443/*
188 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 444 * Global cpu workqueue and nr_running counter for unbound gcwq. The
189 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 445 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
190 * which comes in between can't use for_each_online_cpu(). We could 446 * workers have WORKER_UNBOUND set.
191 * use cpu_possible_map, the cpumask below is more a documentation
192 * than optimization.
193 */ 447 */
194static cpumask_var_t cpu_populated_map __read_mostly; 448static struct global_cwq unbound_global_cwq;
449static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
450
451static int worker_thread(void *__worker);
452
453static struct global_cwq *get_gcwq(unsigned int cpu)
454{
455 if (cpu != WORK_CPU_UNBOUND)
456 return &per_cpu(global_cwq, cpu);
457 else
458 return &unbound_global_cwq;
459}
460
461static atomic_t *get_gcwq_nr_running(unsigned int cpu)
462{
463 if (cpu != WORK_CPU_UNBOUND)
464 return &per_cpu(gcwq_nr_running, cpu);
465 else
466 return &unbound_gcwq_nr_running;
467}
195 468
196/* If it's single threaded, it isn't in the list of workqueues. */ 469static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
197static inline int is_wq_single_threaded(struct workqueue_struct *wq) 470 struct workqueue_struct *wq)
198{ 471{
199 return wq->singlethread; 472 if (!(wq->flags & WQ_UNBOUND)) {
473 if (likely(cpu < nr_cpu_ids)) {
474#ifdef CONFIG_SMP
475 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
476#else
477 return wq->cpu_wq.single;
478#endif
479 }
480 } else if (likely(cpu == WORK_CPU_UNBOUND))
481 return wq->cpu_wq.single;
482 return NULL;
200} 483}
201 484
202static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) 485static unsigned int work_color_to_flags(int color)
203{ 486{
204 return is_wq_single_threaded(wq) 487 return color << WORK_STRUCT_COLOR_SHIFT;
205 ? cpu_singlethread_map : cpu_populated_map;
206} 488}
207 489
208static 490static int get_work_color(struct work_struct *work)
209struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
210{ 491{
211 if (unlikely(is_wq_single_threaded(wq))) 492 return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
212 cpu = singlethread_cpu; 493 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
213 return per_cpu_ptr(wq->cpu_wq, cpu); 494}
495
496static int work_next_color(int color)
497{
498 return (color + 1) % WORK_NR_COLORS;
214} 499}
215 500
216/* 501/*
217 * Set the workqueue on which a work item is to be run 502 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
218 * - Must *only* be called if the pending flag is set 503 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
504 * cleared and the work data contains the cpu number it was last on.
505 *
506 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
507 * cwq, cpu or clear work->data. These functions should only be
508 * called while the work is owned - ie. while the PENDING bit is set.
509 *
510 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
511 * corresponding to a work. gcwq is available once the work has been
512 * queued anywhere after initialization. cwq is available only from
513 * queueing until execution starts.
219 */ 514 */
220static inline void set_wq_data(struct work_struct *work, 515static inline void set_work_data(struct work_struct *work, unsigned long data,
221 struct cpu_workqueue_struct *cwq) 516 unsigned long flags)
222{ 517{
223 unsigned long new;
224
225 BUG_ON(!work_pending(work)); 518 BUG_ON(!work_pending(work));
519 atomic_long_set(&work->data, data | flags | work_static(work));
520}
226 521
227 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); 522static void set_work_cwq(struct work_struct *work,
228 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 523 struct cpu_workqueue_struct *cwq,
229 atomic_long_set(&work->data, new); 524 unsigned long extra_flags)
525{
526 set_work_data(work, (unsigned long)cwq,
527 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
528}
529
530static void set_work_cpu(struct work_struct *work, unsigned int cpu)
531{
532 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
533}
534
535static void clear_work_data(struct work_struct *work)
536{
537 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
538}
539
540static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
541{
542 unsigned long data = atomic_long_read(&work->data);
543
544 if (data & WORK_STRUCT_CWQ)
545 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
546 else
547 return NULL;
548}
549
550static struct global_cwq *get_work_gcwq(struct work_struct *work)
551{
552 unsigned long data = atomic_long_read(&work->data);
553 unsigned int cpu;
554
555 if (data & WORK_STRUCT_CWQ)
556 return ((struct cpu_workqueue_struct *)
557 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
558
559 cpu = data >> WORK_STRUCT_FLAG_BITS;
560 if (cpu == WORK_CPU_NONE)
561 return NULL;
562
563 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
564 return get_gcwq(cpu);
565}
566
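
The accessors above rely on one atomic_long word doing double duty: while queued it carries the cwq pointer plus flag bits, once execution starts it carries the last CPU number shifted past the flag bits. A standalone model (not from the patch) of that packing; the bit positions and an 8-bit flag field are illustrative, not the real WORK_STRUCT_* layout:

#include <stdio.h>

#define WS_PENDING  (1UL << 0)
#define WS_CWQ      (1UL << 1)
#define FLAG_BITS   8UL
#define FLAG_MASK   ((1UL << FLAG_BITS) - 1)

int main(void)
{
        unsigned long cwq = 0x1000;     /* pretend cwq address, suitably aligned */
        unsigned long data;

        data = cwq | WS_PENDING | WS_CWQ;               /* queued: pointer + flags */
        printf("cwq back: %#lx\n", data & ~FLAG_MASK);  /* 0x1000 */

        data = (3UL << FLAG_BITS) | WS_PENDING;         /* running: last cpu = 3 */
        printf("last cpu: %lu\n", data >> FLAG_BITS);   /* 3 */
        return 0;
}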
567/*
568 * Policy functions. These define the policies on how the global
569 * worker pool is managed. Unless noted otherwise, these functions
570 * assume that they're being called with gcwq->lock held.
571 */
572
573static bool __need_more_worker(struct global_cwq *gcwq)
574{
575 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
576 gcwq->flags & GCWQ_HIGHPRI_PENDING;
230} 577}
231 578
232/* 579/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. 580 * Need to wake up a worker? Called from anything but currently
581 * running workers.
234 */ 582 */
235static inline void clear_wq_data(struct work_struct *work) 583static bool need_more_worker(struct global_cwq *gcwq)
236{ 584{
237 unsigned long flags = *work_data_bits(work) & 585 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240} 586}
241 587
242static inline 588/* Can I start working? Called from busy but !running workers. */
243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 589static bool may_start_working(struct global_cwq *gcwq)
244{ 590{
245 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 591 return gcwq->nr_idle;
246} 592}
247 593
594/* Do I need to keep working? Called from currently running workers. */
595static bool keep_working(struct global_cwq *gcwq)
596{
597 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
598
599 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
600}
601
602/* Do we need a new worker? Called from manager. */
603static bool need_to_create_worker(struct global_cwq *gcwq)
604{
605 return need_more_worker(gcwq) && !may_start_working(gcwq);
606}
607
608/* Do I need to be the manager? */
609static bool need_to_manage_workers(struct global_cwq *gcwq)
610{
611 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
612}
613
614/* Do we have too many workers and should some go away? */
615static bool too_many_workers(struct global_cwq *gcwq)
616{
617 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
618 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
619 int nr_busy = gcwq->nr_workers - nr_idle;
620
621 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
622}
623
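
A standalone check (not from the patch) of the idle-worker policy above: keep at most two idle workers plus one extra idle worker per MAX_IDLE_WORKERS_RATIO (4) busy workers, counting a managing worker as idle:

#include <stdio.h>
#include <stdbool.h>

#define MAX_IDLE_WORKERS_RATIO 4

static bool model_too_many(int nr_workers, int nr_idle, bool managing)
{
        int idle = nr_idle + managing;          /* manager is considered idle */
        int busy = nr_workers - idle;

        return idle > 2 && (idle - 2) * MAX_IDLE_WORKERS_RATIO >= busy;
}

int main(void)
{
        printf("%d\n", model_too_many(10, 2, false));   /* 0: only 2 idle */
        printf("%d\n", model_too_many(10, 5, false));   /* 1: (5-2)*4=12 >= 5 busy */
        return 0;
}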
624/*
625 * Wake up functions.
626 */
627
628/* Return the first worker. Safe with preemption disabled */
629static struct worker *first_worker(struct global_cwq *gcwq)
630{
631 if (unlikely(list_empty(&gcwq->idle_list)))
632 return NULL;
633
634 return list_first_entry(&gcwq->idle_list, struct worker, entry);
635}
636
637/**
638 * wake_up_worker - wake up an idle worker
639 * @gcwq: gcwq to wake worker for
640 *
641 * Wake up the first idle worker of @gcwq.
642 *
643 * CONTEXT:
644 * spin_lock_irq(gcwq->lock).
645 */
646static void wake_up_worker(struct global_cwq *gcwq)
647{
648 struct worker *worker = first_worker(gcwq);
649
650 if (likely(worker))
651 wake_up_process(worker->task);
652}
653
654/**
655 * wq_worker_waking_up - a worker is waking up
656 * @task: task waking up
657 * @cpu: CPU @task is waking up to
658 *
659 * This function is called during try_to_wake_up() when a worker is
660 * being awoken.
661 *
662 * CONTEXT:
663 * spin_lock_irq(rq->lock)
664 */
665void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
666{
667 struct worker *worker = kthread_data(task);
668
669 if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
670 atomic_inc(get_gcwq_nr_running(cpu));
671}
672
673/**
674 * wq_worker_sleeping - a worker is going to sleep
675 * @task: task going to sleep
676 * @cpu: CPU in question, must be the current CPU number
677 *
678 * This function is called during schedule() when a busy worker is
679 * going to sleep. Worker on the same cpu can be woken up by
680 * returning pointer to its task.
681 *
682 * CONTEXT:
683 * spin_lock_irq(rq->lock)
684 *
685 * RETURNS:
686 * Worker task on @cpu to wake up, %NULL if none.
687 */
688struct task_struct *wq_worker_sleeping(struct task_struct *task,
689 unsigned int cpu)
690{
691 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
692 struct global_cwq *gcwq = get_gcwq(cpu);
693 atomic_t *nr_running = get_gcwq_nr_running(cpu);
694
695 if (unlikely(worker->flags & WORKER_NOT_RUNNING))
696 return NULL;
697
698 /* this can only happen on the local cpu */
699 BUG_ON(cpu != raw_smp_processor_id());
700
701 /*
702 * The counterpart of the following dec_and_test, implied mb,
703 * worklist not empty test sequence is in insert_work().
704 * Please read comment there.
705 *
706 * NOT_RUNNING is clear. This means that trustee is not in
707 * charge and we're running on the local cpu w/ rq lock held
 708	 * and preemption disabled, which in turn means that no one else
709 * could be manipulating idle_list, so dereferencing idle_list
710 * without gcwq lock is safe.
711 */
712 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
713 to_wakeup = first_worker(gcwq);
714 return to_wakeup ? to_wakeup->task : NULL;
715}
716
717/**
718 * worker_set_flags - set worker flags and adjust nr_running accordingly
719 * @worker: self
720 * @flags: flags to set
721 * @wakeup: wakeup an idle worker if necessary
722 *
723 * Set @flags in @worker->flags and adjust nr_running accordingly. If
724 * nr_running becomes zero and @wakeup is %true, an idle worker is
725 * woken up.
726 *
727 * CONTEXT:
728 * spin_lock_irq(gcwq->lock)
729 */
730static inline void worker_set_flags(struct worker *worker, unsigned int flags,
731 bool wakeup)
732{
733 struct global_cwq *gcwq = worker->gcwq;
734
735 WARN_ON_ONCE(worker->task != current);
736
737 /*
738 * If transitioning into NOT_RUNNING, adjust nr_running and
739 * wake up an idle worker as necessary if requested by
740 * @wakeup.
741 */
742 if ((flags & WORKER_NOT_RUNNING) &&
743 !(worker->flags & WORKER_NOT_RUNNING)) {
744 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
745
746 if (wakeup) {
747 if (atomic_dec_and_test(nr_running) &&
748 !list_empty(&gcwq->worklist))
749 wake_up_worker(gcwq);
750 } else
751 atomic_dec(nr_running);
752 }
753
754 worker->flags |= flags;
755}
756
757/**
758 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
759 * @worker: self
760 * @flags: flags to clear
761 *
762 * Clear @flags in @worker->flags and adjust nr_running accordingly.
763 *
764 * CONTEXT:
765 * spin_lock_irq(gcwq->lock)
766 */
767static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
768{
769 struct global_cwq *gcwq = worker->gcwq;
770 unsigned int oflags = worker->flags;
771
772 WARN_ON_ONCE(worker->task != current);
773
774 worker->flags &= ~flags;
775
776 /* if transitioning out of NOT_RUNNING, increment nr_running */
777 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
778 if (!(worker->flags & WORKER_NOT_RUNNING))
779 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
780}
781
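
A standalone model (not from the patch) of the nr_running bookkeeping in the two helpers above: the counter tracks workers that carry none of the NOT_RUNNING flags, so it is decremented when a worker first gains such a flag and incremented only when it sheds the last one. The single-bit NOT_RUNNING mask stands in for the real WORKER_NOT_RUNNING set:

#include <stdio.h>

#define NOT_RUNNING 0x1         /* stand-in for the WORKER_NOT_RUNNING mask */

static int nr_running;

static void set_flags(unsigned int *flags, unsigned int new)
{
        if ((new & NOT_RUNNING) && !(*flags & NOT_RUNNING))
                nr_running--;           /* transitioning into NOT_RUNNING */
        *flags |= new;
}

static void clr_flags(unsigned int *flags, unsigned int clr)
{
        unsigned int old = *flags;

        *flags &= ~clr;
        if ((clr & NOT_RUNNING) && (old & NOT_RUNNING) &&
            !(*flags & NOT_RUNNING))
                nr_running++;           /* transitioning out of NOT_RUNNING */
}

int main(void)
{
        unsigned int flags = 0;

        nr_running = 1;                         /* one running worker */
        set_flags(&flags, NOT_RUNNING);         /* e.g. going CPU_INTENSIVE */
        printf("%d\n", nr_running);             /* 0 */
        clr_flags(&flags, NOT_RUNNING);
        printf("%d\n", nr_running);             /* 1 */
        return 0;
}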
782/**
783 * busy_worker_head - return the busy hash head for a work
784 * @gcwq: gcwq of interest
785 * @work: work to be hashed
786 *
787 * Return hash head of @gcwq for @work.
788 *
789 * CONTEXT:
790 * spin_lock_irq(gcwq->lock).
791 *
792 * RETURNS:
793 * Pointer to the hash head.
794 */
795static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
796 struct work_struct *work)
797{
798 const int base_shift = ilog2(sizeof(struct work_struct));
799 unsigned long v = (unsigned long)work;
800
801 /* simple shift and fold hash, do we need something better? */
802 v >>= base_shift;
803 v += v >> BUSY_WORKER_HASH_ORDER;
804 v &= BUSY_WORKER_HASH_MASK;
805
806 return &gcwq->busy_hash[v];
807}
808
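
A standalone sketch (not from the patch) of the shift-and-fold hash above, mapping a work_struct address to one of the 64 busy-hash buckets; the base shift of 5 assumes a work_struct that rounds to a 32-byte object:

#include <stdio.h>

#define BUSY_WORKER_HASH_ORDER 6
#define BUSY_WORKER_HASH_SIZE  (1 << BUSY_WORKER_HASH_ORDER)
#define BUSY_WORKER_HASH_MASK  (BUSY_WORKER_HASH_SIZE - 1)

static unsigned long busy_hash_bucket(unsigned long addr, int base_shift)
{
        unsigned long v = addr >> base_shift;   /* drop the alignment bits */

        v += v >> BUSY_WORKER_HASH_ORDER;       /* fold higher bits back in */
        return v & BUSY_WORKER_HASH_MASK;       /* bucket index 0..63 */
}

int main(void)
{
        printf("%lu\n", busy_hash_bucket(0xffff880012345680UL, 5));
        return 0;
}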
809/**
810 * __find_worker_executing_work - find worker which is executing a work
811 * @gcwq: gcwq of interest
812 * @bwh: hash head as returned by busy_worker_head()
813 * @work: work to find worker for
814 *
815 * Find a worker which is executing @work on @gcwq. @bwh should be
816 * the hash head obtained by calling busy_worker_head() with the same
817 * work.
818 *
819 * CONTEXT:
820 * spin_lock_irq(gcwq->lock).
821 *
822 * RETURNS:
823 * Pointer to worker which is executing @work if found, NULL
824 * otherwise.
825 */
826static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
827 struct hlist_head *bwh,
828 struct work_struct *work)
829{
830 struct worker *worker;
831 struct hlist_node *tmp;
832
833 hlist_for_each_entry(worker, tmp, bwh, hentry)
834 if (worker->current_work == work)
835 return worker;
836 return NULL;
837}
838
839/**
840 * find_worker_executing_work - find worker which is executing a work
841 * @gcwq: gcwq of interest
842 * @work: work to find worker for
843 *
844 * Find a worker which is executing @work on @gcwq. This function is
845 * identical to __find_worker_executing_work() except that this
846 * function calculates @bwh itself.
847 *
848 * CONTEXT:
849 * spin_lock_irq(gcwq->lock).
850 *
851 * RETURNS:
852 * Pointer to worker which is executing @work if found, NULL
853 * otherwise.
854 */
855static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
856 struct work_struct *work)
857{
858 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
859 work);
860}
861
862/**
863 * gcwq_determine_ins_pos - find insertion position
864 * @gcwq: gcwq of interest
865 * @cwq: cwq a work is being queued for
866 *
867 * A work for @cwq is about to be queued on @gcwq, determine insertion
868 * position for the work. If @cwq is for HIGHPRI wq, the work is
869 * queued at the head of the queue but in FIFO order with respect to
870 * other HIGHPRI works; otherwise, at the end of the queue. This
871 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
872 * there are HIGHPRI works pending.
873 *
874 * CONTEXT:
875 * spin_lock_irq(gcwq->lock).
876 *
877 * RETURNS:
 878 * Pointer to insertion position.
879 */
880static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
881 struct cpu_workqueue_struct *cwq)
882{
883 struct work_struct *twork;
884
885 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
886 return &gcwq->worklist;
887
888 list_for_each_entry(twork, &gcwq->worklist, entry) {
889 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
890
891 if (!(tcwq->wq->flags & WQ_HIGHPRI))
892 break;
893 }
894
895 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
896 return &twork->entry;
897}
898
899/**
900 * insert_work - insert a work into gcwq
901 * @cwq: cwq @work belongs to
902 * @work: work to insert
903 * @head: insertion point
904 * @extra_flags: extra WORK_STRUCT_* flags to set
905 *
906 * Insert @work which belongs to @cwq into @gcwq after @head.
907 * @extra_flags is or'd to work_struct flags.
908 *
909 * CONTEXT:
910 * spin_lock_irq(gcwq->lock).
911 */
248static void insert_work(struct cpu_workqueue_struct *cwq, 912static void insert_work(struct cpu_workqueue_struct *cwq,
249 struct work_struct *work, struct list_head *head) 913 struct work_struct *work, struct list_head *head,
914 unsigned int extra_flags)
250{ 915{
251 trace_workqueue_insertion(cwq->thread, work); 916 struct global_cwq *gcwq = cwq->gcwq;
917
918 /* we own @work, set data and link */
919 set_work_cwq(work, cwq, extra_flags);
252 920
253 set_wq_data(work, cwq);
254 /* 921 /*
255 * Ensure that we get the right work->data if we see the 922 * Ensure that we get the right work->data if we see the
256 * result of list_add() below, see try_to_grab_pending(). 923 * result of list_add() below, see try_to_grab_pending().
257 */ 924 */
258 smp_wmb(); 925 smp_wmb();
926
259 list_add_tail(&work->entry, head); 927 list_add_tail(&work->entry, head);
260 wake_up(&cwq->more_work); 928
929 /*
930 * Ensure either worker_sched_deactivated() sees the above
931 * list_add_tail() or we see zero nr_running to avoid workers
932 * lying around lazily while there are works to be processed.
933 */
934 smp_mb();
935
936 if (__need_more_worker(gcwq))
937 wake_up_worker(gcwq);
261} 938}
262 939
263static void __queue_work(struct cpu_workqueue_struct *cwq, 940static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
264 struct work_struct *work) 941 struct work_struct *work)
265{ 942{
943 struct global_cwq *gcwq;
944 struct cpu_workqueue_struct *cwq;
945 struct list_head *worklist;
266 unsigned long flags; 946 unsigned long flags;
267 947
268 debug_work_activate(work); 948 debug_work_activate(work);
269 spin_lock_irqsave(&cwq->lock, flags); 949
270 insert_work(cwq, work, &cwq->worklist); 950 /* determine gcwq to use */
271 spin_unlock_irqrestore(&cwq->lock, flags); 951 if (!(wq->flags & WQ_UNBOUND)) {
952 struct global_cwq *last_gcwq;
953
954 if (unlikely(cpu == WORK_CPU_UNBOUND))
955 cpu = raw_smp_processor_id();
956
957 /*
958 * It's multi cpu. If @wq is non-reentrant and @work
959 * was previously on a different cpu, it might still
960 * be running there, in which case the work needs to
961 * be queued on that cpu to guarantee non-reentrance.
962 */
963 gcwq = get_gcwq(cpu);
964 if (wq->flags & WQ_NON_REENTRANT &&
965 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
966 struct worker *worker;
967
968 spin_lock_irqsave(&last_gcwq->lock, flags);
969
970 worker = find_worker_executing_work(last_gcwq, work);
971
972 if (worker && worker->current_cwq->wq == wq)
973 gcwq = last_gcwq;
974 else {
975 /* meh... not running there, queue here */
976 spin_unlock_irqrestore(&last_gcwq->lock, flags);
977 spin_lock_irqsave(&gcwq->lock, flags);
978 }
979 } else
980 spin_lock_irqsave(&gcwq->lock, flags);
981 } else {
982 gcwq = get_gcwq(WORK_CPU_UNBOUND);
983 spin_lock_irqsave(&gcwq->lock, flags);
984 }
985
986 /* gcwq determined, get cwq and queue */
987 cwq = get_cwq(gcwq->cpu, wq);
988
989 BUG_ON(!list_empty(&work->entry));
990
991 cwq->nr_in_flight[cwq->work_color]++;
992
993 if (likely(cwq->nr_active < cwq->max_active)) {
994 cwq->nr_active++;
995 worklist = gcwq_determine_ins_pos(gcwq, cwq);
996 } else
997 worklist = &cwq->delayed_works;
998
999 insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));
1000
1001 spin_unlock_irqrestore(&gcwq->lock, flags);
272} 1002}
273 1003
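
A standalone model (not from the patch) of the admission step in __queue_work() above: a work goes on the active worklist only while nr_active < max_active, otherwise it is parked on the cwq's delayed list until a running work completes:

#include <stdio.h>

struct model_cwq {
        int nr_active;
        int max_active;
};

static const char *model_queue(struct model_cwq *cwq)
{
        if (cwq->nr_active < cwq->max_active) {
                cwq->nr_active++;
                return "worklist";
        }
        return "delayed_works";
}

int main(void)
{
        struct model_cwq cwq = { .nr_active = 0, .max_active = 2 };

        printf("%s\n", model_queue(&cwq));      /* worklist */
        printf("%s\n", model_queue(&cwq));      /* worklist */
        printf("%s\n", model_queue(&cwq));      /* delayed_works */
        return 0;
}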
274/** 1004/**
@@ -308,9 +1038,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
308{ 1038{
309 int ret = 0; 1039 int ret = 0;
310 1040
311 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1041 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
312 BUG_ON(!list_empty(&work->entry)); 1042 __queue_work(cpu, wq, work);
313 __queue_work(wq_per_cpu(wq, cpu), work);
314 ret = 1; 1043 ret = 1;
315 } 1044 }
316 return ret; 1045 return ret;
@@ -320,10 +1049,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
320static void delayed_work_timer_fn(unsigned long __data) 1049static void delayed_work_timer_fn(unsigned long __data)
321{ 1050{
322 struct delayed_work *dwork = (struct delayed_work *)__data; 1051 struct delayed_work *dwork = (struct delayed_work *)__data;
323 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 1052 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
324 struct workqueue_struct *wq = cwq->wq;
325 1053
326 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); 1054 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
327} 1055}
328 1056
329/** 1057/**
@@ -360,14 +1088,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
360 struct timer_list *timer = &dwork->timer; 1088 struct timer_list *timer = &dwork->timer;
361 struct work_struct *work = &dwork->work; 1089 struct work_struct *work = &dwork->work;
362 1090
363 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1091 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1092 unsigned int lcpu;
1093
364 BUG_ON(timer_pending(timer)); 1094 BUG_ON(timer_pending(timer));
365 BUG_ON(!list_empty(&work->entry)); 1095 BUG_ON(!list_empty(&work->entry));
366 1096
367 timer_stats_timer_set_start_info(&dwork->timer); 1097 timer_stats_timer_set_start_info(&dwork->timer);
368 1098
369 /* This stores cwq for the moment, for the timer_fn */ 1099 /*
370 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 1100 * This stores cwq for the moment, for the timer_fn.
1101 * Note that the work's gcwq is preserved to allow
1102 * reentrance detection for delayed works.
1103 */
1104 if (!(wq->flags & WQ_UNBOUND)) {
1105 struct global_cwq *gcwq = get_work_gcwq(work);
1106
1107 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1108 lcpu = gcwq->cpu;
1109 else
1110 lcpu = raw_smp_processor_id();
1111 } else
1112 lcpu = WORK_CPU_UNBOUND;
1113
1114 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1115
371 timer->expires = jiffies + delay; 1116 timer->expires = jiffies + delay;
372 timer->data = (unsigned long)dwork; 1117 timer->data = (unsigned long)dwork;
373 timer->function = delayed_work_timer_fn; 1118 timer->function = delayed_work_timer_fn;
@@ -382,80 +1127,878 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
382} 1127}
383EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1128EXPORT_SYMBOL_GPL(queue_delayed_work_on);
384 1129
385static void run_workqueue(struct cpu_workqueue_struct *cwq) 1130/**
1131 * worker_enter_idle - enter idle state
1132 * @worker: worker which is entering idle state
1133 *
1134 * @worker is entering idle state. Update stats and idle timer if
1135 * necessary.
1136 *
1137 * LOCKING:
1138 * spin_lock_irq(gcwq->lock).
1139 */
1140static void worker_enter_idle(struct worker *worker)
386{ 1141{
387 spin_lock_irq(&cwq->lock); 1142 struct global_cwq *gcwq = worker->gcwq;
388 while (!list_empty(&cwq->worklist)) { 1143
389 struct work_struct *work = list_entry(cwq->worklist.next, 1144 BUG_ON(worker->flags & WORKER_IDLE);
390 struct work_struct, entry); 1145 BUG_ON(!list_empty(&worker->entry) &&
391 work_func_t f = work->func; 1146 (worker->hentry.next || worker->hentry.pprev));
392#ifdef CONFIG_LOCKDEP 1147
1148 /* can't use worker_set_flags(), also called from start_worker() */
1149 worker->flags |= WORKER_IDLE;
1150 gcwq->nr_idle++;
1151 worker->last_active = jiffies;
1152
1153 /* idle_list is LIFO */
1154 list_add(&worker->entry, &gcwq->idle_list);
1155
1156 if (likely(!(worker->flags & WORKER_ROGUE))) {
1157 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1158 mod_timer(&gcwq->idle_timer,
1159 jiffies + IDLE_WORKER_TIMEOUT);
1160 } else
1161 wake_up_all(&gcwq->trustee_wait);
1162
1163 /* sanity check nr_running */
1164 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1165 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1166}
1167
1168/**
1169 * worker_leave_idle - leave idle state
1170 * @worker: worker which is leaving idle state
1171 *
1172 * @worker is leaving idle state. Update stats.
1173 *
1174 * LOCKING:
1175 * spin_lock_irq(gcwq->lock).
1176 */
1177static void worker_leave_idle(struct worker *worker)
1178{
1179 struct global_cwq *gcwq = worker->gcwq;
1180
1181 BUG_ON(!(worker->flags & WORKER_IDLE));
1182 worker_clr_flags(worker, WORKER_IDLE);
1183 gcwq->nr_idle--;
1184 list_del_init(&worker->entry);
1185}
1186
1187/**
1188 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1189 * @worker: self
1190 *
1191 * Works which are scheduled while the cpu is online must at least be
1192 * scheduled to a worker which is bound to the cpu so that if they are
1193 * flushed from cpu callbacks while cpu is going down, they are
1194 * guaranteed to execute on the cpu.
1195 *
1196 * This function is to be used by rogue workers and rescuers to bind
1197 * themselves to the target cpu and may race with cpu going down or
1198 * coming online. kthread_bind() can't be used because it may put the
1199 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1200 * verbatim as it's best effort and blocking and gcwq may be
1201 * [dis]associated in the meantime.
1202 *
1203 * This function tries set_cpus_allowed() and locks gcwq and verifies
1204 * the binding against GCWQ_DISASSOCIATED which is set during
1205 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1206 * idle state or fetches works without dropping lock, it can guarantee
1207 * the scheduling requirement described in the first paragraph.
1208 *
1209 * CONTEXT:
1210 * Might sleep. Called without any lock but returns with gcwq->lock
1211 * held.
1212 *
1213 * RETURNS:
1214 * %true if the associated gcwq is online (@worker is successfully
1215 * bound), %false if offline.
1216 */
1217static bool worker_maybe_bind_and_lock(struct worker *worker)
1218{
1219 struct global_cwq *gcwq = worker->gcwq;
1220 struct task_struct *task = worker->task;
1221
1222 while (true) {
393 /* 1223 /*
394 * It is permissible to free the struct work_struct 1224 * The following call may fail, succeed or succeed
395 * from inside the function that is called from it, 1225 * without actually migrating the task to the cpu if
396 * this we need to take into account for lockdep too. 1226 * it races with cpu hotunplug operation. Verify
397 * To avoid bogus "held lock freed" warnings as well 1227 * against GCWQ_DISASSOCIATED.
398 * as problems when looking into work->lockdep_map,
399 * make a copy and use that here.
400 */ 1228 */
401 struct lockdep_map lockdep_map = work->lockdep_map; 1229 if (!(gcwq->flags & GCWQ_DISASSOCIATED))
402#endif 1230 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
403 trace_workqueue_execution(cwq->thread, work); 1231
404 debug_work_deactivate(work); 1232 spin_lock_irq(&gcwq->lock);
405 cwq->current_work = work; 1233 if (gcwq->flags & GCWQ_DISASSOCIATED)
406 list_del_init(cwq->worklist.next); 1234 return false;
407 spin_unlock_irq(&cwq->lock); 1235 if (task_cpu(task) == gcwq->cpu &&
408 1236 cpumask_equal(&current->cpus_allowed,
409 BUG_ON(get_wq_data(work) != cwq); 1237 get_cpu_mask(gcwq->cpu)))
410 work_clear_pending(work); 1238 return true;
411 lock_map_acquire(&cwq->wq->lockdep_map); 1239 spin_unlock_irq(&gcwq->lock);
412 lock_map_acquire(&lockdep_map); 1240
413 f(work); 1241 /* CPU has come up inbetween, retry migration */
414 lock_map_release(&lockdep_map); 1242 cpu_relax();
415 lock_map_release(&cwq->wq->lockdep_map); 1243 }
416 1244}
417 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1245
418 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 1246/*
419 "%s/0x%08x/%d\n", 1247 * Function for worker->rebind_work used to rebind rogue busy workers
420 current->comm, preempt_count(), 1248 * to the associated cpu which is coming back online. This is
421 task_pid_nr(current)); 1249 * scheduled by cpu up but can race with other cpu hotplug operations
422 printk(KERN_ERR " last function: "); 1250 * and may be executed twice without intervening cpu down.
423 print_symbol("%s\n", (unsigned long)f); 1251 */
424 debug_show_held_locks(current); 1252static void worker_rebind_fn(struct work_struct *work)
425 dump_stack(); 1253{
1254 struct worker *worker = container_of(work, struct worker, rebind_work);
1255 struct global_cwq *gcwq = worker->gcwq;
1256
1257 if (worker_maybe_bind_and_lock(worker))
1258 worker_clr_flags(worker, WORKER_REBIND);
1259
1260 spin_unlock_irq(&gcwq->lock);
1261}
1262
1263static struct worker *alloc_worker(void)
1264{
1265 struct worker *worker;
1266
1267 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1268 if (worker) {
1269 INIT_LIST_HEAD(&worker->entry);
1270 INIT_LIST_HEAD(&worker->scheduled);
1271 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1272 /* on creation a worker is in !idle && prep state */
1273 worker->flags = WORKER_PREP;
1274 }
1275 return worker;
1276}
1277
1278/**
1279 * create_worker - create a new workqueue worker
1280 * @gcwq: gcwq the new worker will belong to
1281 * @bind: whether to set affinity to @cpu or not
1282 *
1283 * Create a new worker which is bound to @gcwq. The returned worker
1284 * can be started by calling start_worker() or destroyed using
1285 * destroy_worker().
1286 *
1287 * CONTEXT:
1288 * Might sleep. Does GFP_KERNEL allocations.
1289 *
1290 * RETURNS:
1291 * Pointer to the newly created worker.
1292 */
1293static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1294{
1295 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1296 struct worker *worker = NULL;
1297 int id = -1;
1298
1299 spin_lock_irq(&gcwq->lock);
1300 while (ida_get_new(&gcwq->worker_ida, &id)) {
1301 spin_unlock_irq(&gcwq->lock);
1302 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1303 goto fail;
1304 spin_lock_irq(&gcwq->lock);
1305 }
1306 spin_unlock_irq(&gcwq->lock);
1307
1308 worker = alloc_worker();
1309 if (!worker)
1310 goto fail;
1311
1312 worker->gcwq = gcwq;
1313 worker->id = id;
1314
1315 if (!on_unbound_cpu)
1316 worker->task = kthread_create(worker_thread, worker,
1317 "kworker/%u:%d", gcwq->cpu, id);
1318 else
1319 worker->task = kthread_create(worker_thread, worker,
1320 "kworker/u:%d", id);
1321 if (IS_ERR(worker->task))
1322 goto fail;
1323
1324 /*
1325 * A rogue worker will become a regular one if CPU comes
1326 * online later on. Make sure every worker has
1327 * PF_THREAD_BOUND set.
1328 */
1329 if (bind && !on_unbound_cpu)
1330 kthread_bind(worker->task, gcwq->cpu);
1331 else {
1332 worker->task->flags |= PF_THREAD_BOUND;
1333 if (on_unbound_cpu)
1334 worker->flags |= WORKER_UNBOUND;
1335 }
1336
1337 return worker;
1338fail:
1339 if (id >= 0) {
1340 spin_lock_irq(&gcwq->lock);
1341 ida_remove(&gcwq->worker_ida, id);
1342 spin_unlock_irq(&gcwq->lock);
1343 }
1344 kfree(worker);
1345 return NULL;
1346}
1347
1348/**
1349 * start_worker - start a newly created worker
1350 * @worker: worker to start
1351 *
1352 * Make the gcwq aware of @worker and start it.
1353 *
1354 * CONTEXT:
1355 * spin_lock_irq(gcwq->lock).
1356 */
1357static void start_worker(struct worker *worker)
1358{
1359 worker->flags |= WORKER_STARTED;
1360 worker->gcwq->nr_workers++;
1361 worker_enter_idle(worker);
1362 wake_up_process(worker->task);
1363}
1364
1365/**
1366 * destroy_worker - destroy a workqueue worker
1367 * @worker: worker to be destroyed
1368 *
1369 * Destroy @worker and adjust @gcwq stats accordingly.
1370 *
1371 * CONTEXT:
1372 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1373 */
1374static void destroy_worker(struct worker *worker)
1375{
1376 struct global_cwq *gcwq = worker->gcwq;
1377 int id = worker->id;
1378
1379 /* sanity check frenzy */
1380 BUG_ON(worker->current_work);
1381 BUG_ON(!list_empty(&worker->scheduled));
1382
1383 if (worker->flags & WORKER_STARTED)
1384 gcwq->nr_workers--;
1385 if (worker->flags & WORKER_IDLE)
1386 gcwq->nr_idle--;
1387
1388 list_del_init(&worker->entry);
1389 worker->flags |= WORKER_DIE;
1390
1391 spin_unlock_irq(&gcwq->lock);
1392
1393 kthread_stop(worker->task);
1394 kfree(worker);
1395
1396 spin_lock_irq(&gcwq->lock);
1397 ida_remove(&gcwq->worker_ida, id);
1398}
1399
1400static void idle_worker_timeout(unsigned long __gcwq)
1401{
1402 struct global_cwq *gcwq = (void *)__gcwq;
1403
1404 spin_lock_irq(&gcwq->lock);
1405
1406 if (too_many_workers(gcwq)) {
1407 struct worker *worker;
1408 unsigned long expires;
1409
1410 /* idle_list is kept in LIFO order, check the last one */
1411 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1412 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1413
1414 if (time_before(jiffies, expires))
1415 mod_timer(&gcwq->idle_timer, expires);
1416 else {
1417 /* it's been idle for too long, wake up manager */
1418 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1419 wake_up_worker(gcwq);
426 } 1420 }
1421 }
427 1422
428 spin_lock_irq(&cwq->lock); 1423 spin_unlock_irq(&gcwq->lock);
429 cwq->current_work = NULL; 1424}
1425
1426static bool send_mayday(struct work_struct *work)
1427{
1428 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1429 struct workqueue_struct *wq = cwq->wq;
1430 unsigned int cpu;
1431
1432 if (!(wq->flags & WQ_RESCUER))
1433 return false;
1434
1435 /* mayday mayday mayday */
1436 cpu = cwq->gcwq->cpu;
1437 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1438 if (cpu == WORK_CPU_UNBOUND)
1439 cpu = 0;
1440 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1441 wake_up_process(wq->rescuer->task);
1442 return true;
1443}
1444
1445static void gcwq_mayday_timeout(unsigned long __gcwq)
1446{
1447 struct global_cwq *gcwq = (void *)__gcwq;
1448 struct work_struct *work;
1449
1450 spin_lock_irq(&gcwq->lock);
1451
1452 if (need_to_create_worker(gcwq)) {
1453 /*
1454 * We've been trying to create a new worker but
1455 * haven't been successful. We might be hitting an
1456 * allocation deadlock. Send distress signals to
1457 * rescuers.
1458 */
1459 list_for_each_entry(work, &gcwq->worklist, entry)
1460 send_mayday(work);
430 } 1461 }
431 spin_unlock_irq(&cwq->lock); 1462
1463 spin_unlock_irq(&gcwq->lock);
1464
1465 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
432} 1466}
433 1467
434static int worker_thread(void *__cwq) 1468/**
1469 * maybe_create_worker - create a new worker if necessary
1470 * @gcwq: gcwq to create a new worker for
1471 *
1472 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1473 * have at least one idle worker on return from this function. If
1474 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1475 * sent to all rescuers with works scheduled on @gcwq to resolve
1476 * possible allocation deadlock.
1477 *
1478 * On return, need_to_create_worker() is guaranteed to be false and
1479 * may_start_working() true.
1480 *
1481 * LOCKING:
1482 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1483 * multiple times. Does GFP_KERNEL allocations. Called only from
1484 * manager.
1485 *
1486 * RETURNS:
1487 * false if no action was taken and gcwq->lock stayed locked, true
1488 * otherwise.
1489 */
1490static bool maybe_create_worker(struct global_cwq *gcwq)
435{ 1491{
436 struct cpu_workqueue_struct *cwq = __cwq; 1492 if (!need_to_create_worker(gcwq))
437 DEFINE_WAIT(wait); 1493 return false;
1494restart:
1495 spin_unlock_irq(&gcwq->lock);
1496
1497 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1498 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1499
1500 while (true) {
1501 struct worker *worker;
1502
1503 worker = create_worker(gcwq, true);
1504 if (worker) {
1505 del_timer_sync(&gcwq->mayday_timer);
1506 spin_lock_irq(&gcwq->lock);
1507 start_worker(worker);
1508 BUG_ON(need_to_create_worker(gcwq));
1509 return true;
1510 }
1511
1512 if (!need_to_create_worker(gcwq))
1513 break;
438 1514
439 if (cwq->wq->freezeable) 1515 __set_current_state(TASK_INTERRUPTIBLE);
440 set_freezable(); 1516 schedule_timeout(CREATE_COOLDOWN);
441 1517
442 for (;;) { 1518 if (!need_to_create_worker(gcwq))
443 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 1519 break;
444 if (!freezing(current) && 1520 }
445 !kthread_should_stop() &&
446 list_empty(&cwq->worklist))
447 schedule();
448 finish_wait(&cwq->more_work, &wait);
449 1521
450 try_to_freeze(); 1522 del_timer_sync(&gcwq->mayday_timer);
1523 spin_lock_irq(&gcwq->lock);
1524 if (need_to_create_worker(gcwq))
1525 goto restart;
1526 return true;
1527}
451 1528
452 if (kthread_should_stop()) 1529/**
1530 * maybe_destroy_worker - destroy workers which have been idle for a while
1531 * @gcwq: gcwq to destroy workers for
1532 *
1533 * Destroy @gcwq workers which have been idle for longer than
1534 * IDLE_WORKER_TIMEOUT.
1535 *
1536 * LOCKING:
1537 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1538 * multiple times. Called only from manager.
1539 *
1540 * RETURNS:
1541 * false if no action was taken and gcwq->lock stayed locked, true
1542 * otherwise.
1543 */
1544static bool maybe_destroy_workers(struct global_cwq *gcwq)
1545{
1546 bool ret = false;
1547
1548 while (too_many_workers(gcwq)) {
1549 struct worker *worker;
1550 unsigned long expires;
1551
1552 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1553 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1554
1555 if (time_before(jiffies, expires)) {
1556 mod_timer(&gcwq->idle_timer, expires);
453 break; 1557 break;
1558 }
454 1559
455 run_workqueue(cwq); 1560 destroy_worker(worker);
1561 ret = true;
456 } 1562 }
457 1563
458 return 0; 1564 return ret;
1565}
1566
1567/**
1568 * manage_workers - manage worker pool
1569 * @worker: self
1570 *
1571 * Assume the manager role and manage gcwq worker pool @worker belongs
1572 * to. At any given time, there can be only zero or one manager per
1573 * gcwq. The exclusion is handled automatically by this function.
1574 *
1575 * The caller can safely start processing works on false return. On
1576 * true return, it's guaranteed that need_to_create_worker() is false
1577 * and may_start_working() is true.
1578 *
1579 * CONTEXT:
1580 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1581 * multiple times. Does GFP_KERNEL allocations.
1582 *
1583 * RETURNS:
1584 * false if no action was taken and gcwq->lock stayed locked, true if
1585 * some action was taken.
1586 */
1587static bool manage_workers(struct worker *worker)
1588{
1589 struct global_cwq *gcwq = worker->gcwq;
1590 bool ret = false;
1591
1592 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1593 return ret;
1594
1595 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1596 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1597
1598 /*
1599 * Destroy and then create so that may_start_working() is true
1600 * on return.
1601 */
1602 ret |= maybe_destroy_workers(gcwq);
1603 ret |= maybe_create_worker(gcwq);
1604
1605 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1606
1607 /*
1608 * The trustee might be waiting to take over the manager
1609 * position, tell it we're done.
1610 */
1611 if (unlikely(gcwq->trustee))
1612 wake_up_all(&gcwq->trustee_wait);
1613
1614 return ret;
1615}
1616
1617/**
1618 * move_linked_works - move linked works to a list
1619 * @work: start of series of works to be scheduled
1620 * @head: target list to append @work to
 1621 * @nextp: out parameter for nested worklist walking
1622 *
1623 * Schedule linked works starting from @work to @head. Work series to
1624 * be scheduled starts at @work and includes any consecutive work with
1625 * WORK_STRUCT_LINKED set in its predecessor.
1626 *
1627 * If @nextp is not NULL, it's updated to point to the next work of
1628 * the last scheduled work. This allows move_linked_works() to be
1629 * nested inside outer list_for_each_entry_safe().
1630 *
1631 * CONTEXT:
1632 * spin_lock_irq(gcwq->lock).
1633 */
1634static void move_linked_works(struct work_struct *work, struct list_head *head,
1635 struct work_struct **nextp)
1636{
1637 struct work_struct *n;
1638
1639 /*
1640 * Linked worklist will always end before the end of the list,
1641 * use NULL for list head.
1642 */
1643 list_for_each_entry_safe_from(work, n, NULL, entry) {
1644 list_move_tail(&work->entry, head);
1645 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1646 break;
1647 }
1648
1649 /*
1650 * If we're already inside safe list traversal and have moved
1651 * multiple works to the scheduled queue, the next position
1652 * needs to be updated.
1653 */
1654 if (nextp)
1655 *nextp = n;
1656}
1657
1658static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1659{
1660 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1661 struct work_struct, entry);
1662 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1663
1664 move_linked_works(work, pos, NULL);
1665 cwq->nr_active++;
1666}
1667
1668/**
1669 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1670 * @cwq: cwq of interest
1671 * @color: color of work which left the queue
1672 *
 1673 * A work has either completed or been removed from the pending queue;
1674 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1675 *
1676 * CONTEXT:
1677 * spin_lock_irq(gcwq->lock).
1678 */
1679static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1680{
1681 /* ignore uncolored works */
1682 if (color == WORK_NO_COLOR)
1683 return;
1684
1685 cwq->nr_in_flight[color]--;
1686 cwq->nr_active--;
1687
1688 if (!list_empty(&cwq->delayed_works)) {
1689 /* one down, submit a delayed one */
1690 if (cwq->nr_active < cwq->max_active)
1691 cwq_activate_first_delayed(cwq);
1692 }
1693
1694 /* is flush in progress and are we at the flushing tip? */
1695 if (likely(cwq->flush_color != color))
1696 return;
1697
1698 /* are there still in-flight works? */
1699 if (cwq->nr_in_flight[color])
1700 return;
1701
1702 /* this cwq is done, clear flush_color */
1703 cwq->flush_color = -1;
1704
1705 /*
1706 * If this was the last cwq, wake up the first flusher. It
1707 * will handle the rest.
1708 */
1709 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1710 complete(&cwq->wq->first_flusher->done);
1711}
1712
1713/**
1714 * process_one_work - process single work
1715 * @worker: self
1716 * @work: work to process
1717 *
 1718 * Process @work. This function contains all the logic necessary to
1719 * process a single work including synchronization against and
1720 * interaction with other workers on the same cpu, queueing and
1721 * flushing. As long as context requirement is met, any worker can
1722 * call this function to process a work.
1723 *
1724 * CONTEXT:
1725 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1726 */
1727static void process_one_work(struct worker *worker, struct work_struct *work)
1728{
1729 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1730 struct global_cwq *gcwq = cwq->gcwq;
1731 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1732 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1733 work_func_t f = work->func;
1734 int work_color;
1735 struct worker *collision;
1736#ifdef CONFIG_LOCKDEP
1737 /*
1738 * It is permissible to free the struct work_struct from
 1739 * inside the function that is called from it; we need to
 1740 * take that into account for lockdep too. To avoid bogus "held
1741 * lock freed" warnings as well as problems when looking into
1742 * work->lockdep_map, make a copy and use that here.
1743 */
1744 struct lockdep_map lockdep_map = work->lockdep_map;
1745#endif
1746 /*
1747 * A single work shouldn't be executed concurrently by
1748 * multiple workers on a single cpu. Check whether anyone is
1749 * already processing the work. If so, defer the work to the
1750 * currently executing one.
1751 */
1752 collision = __find_worker_executing_work(gcwq, bwh, work);
1753 if (unlikely(collision)) {
1754 move_linked_works(work, &collision->scheduled, NULL);
1755 return;
1756 }
1757
1758 /* claim and process */
1759 debug_work_deactivate(work);
1760 hlist_add_head(&worker->hentry, bwh);
1761 worker->current_work = work;
1762 worker->current_cwq = cwq;
1763 work_color = get_work_color(work);
1764
1765 /* record the current cpu number in the work data and dequeue */
1766 set_work_cpu(work, gcwq->cpu);
1767 list_del_init(&work->entry);
1768
1769 /*
1770 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1771 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1772 */
1773 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1774 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1775 struct work_struct, entry);
1776
1777 if (!list_empty(&gcwq->worklist) &&
1778 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1779 wake_up_worker(gcwq);
1780 else
1781 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1782 }
1783
1784 /*
1785 * CPU intensive works don't participate in concurrency
1786 * management. They're the scheduler's responsibility.
1787 */
1788 if (unlikely(cpu_intensive))
1789 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1790
1791 spin_unlock_irq(&gcwq->lock);
1792
1793 work_clear_pending(work);
1794 lock_map_acquire(&cwq->wq->lockdep_map);
1795 lock_map_acquire(&lockdep_map);
1796 trace_workqueue_execute_start(work);
1797 f(work);
1798 /*
1799 * While we must be careful to not use "work" after this, the trace
1800 * point will only record its address.
1801 */
1802 trace_workqueue_execute_end(work);
1803 lock_map_release(&lockdep_map);
1804 lock_map_release(&cwq->wq->lockdep_map);
1805
1806 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1807 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1808 "%s/0x%08x/%d\n",
1809 current->comm, preempt_count(), task_pid_nr(current));
1810 printk(KERN_ERR " last function: ");
1811 print_symbol("%s\n", (unsigned long)f);
1812 debug_show_held_locks(current);
1813 dump_stack();
1814 }
1815
1816 spin_lock_irq(&gcwq->lock);
1817
1818 /* clear cpu intensive status */
1819 if (unlikely(cpu_intensive))
1820 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1821
1822 /* we're done with it, release */
1823 hlist_del_init(&worker->hentry);
1824 worker->current_work = NULL;
1825 worker->current_cwq = NULL;
1826 cwq_dec_nr_in_flight(cwq, work_color);
1827}
1828
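The lockdep copy above exists because a work function is allowed to free its own work_struct from inside the handler. A minimal sketch of that self-freeing pattern (the struct and function names are illustrative, not part of this patch):

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_request {
	struct work_struct work;
	int payload;
};

static void my_request_fn(struct work_struct *work)
{
	struct my_request *req = container_of(work, struct my_request, work);

	/* ... handle req->payload ... */
	kfree(req);		/* freeing the embedding object (and its work) here is fine */
}

/* submit side: INIT_WORK(&req->work, my_request_fn); schedule_work(&req->work); */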
1829/**
1830 * process_scheduled_works - process scheduled works
1831 * @worker: self
1832 *
1833 * Process all scheduled works. Please note that the scheduled list
1834 * may change while processing a work, so this function repeatedly
1835 * fetches a work from the top and executes it.
1836 *
1837 * CONTEXT:
1838 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1839 * multiple times.
1840 */
1841static void process_scheduled_works(struct worker *worker)
1842{
1843 while (!list_empty(&worker->scheduled)) {
1844 struct work_struct *work = list_first_entry(&worker->scheduled,
1845 struct work_struct, entry);
1846 process_one_work(worker, work);
1847 }
1848}
1849
1850/**
1851 * worker_thread - the worker thread function
1852 * @__worker: self
1853 *
1854 * The gcwq worker thread function. There's a single dynamic pool of
 1855 * these per cpu. These workers process all works regardless of
 1856 * their specific target workqueue. The only exception is works which
 1857 * belong to workqueues with a rescuer, which will be explained in
1858 * rescuer_thread().
1859 */
1860static int worker_thread(void *__worker)
1861{
1862 struct worker *worker = __worker;
1863 struct global_cwq *gcwq = worker->gcwq;
1864
1865 /* tell the scheduler that this is a workqueue worker */
1866 worker->task->flags |= PF_WQ_WORKER;
1867woke_up:
1868 spin_lock_irq(&gcwq->lock);
1869
1870 /* DIE can be set only while we're idle, checking here is enough */
1871 if (worker->flags & WORKER_DIE) {
1872 spin_unlock_irq(&gcwq->lock);
1873 worker->task->flags &= ~PF_WQ_WORKER;
1874 return 0;
1875 }
1876
1877 worker_leave_idle(worker);
1878recheck:
1879 /* no more worker necessary? */
1880 if (!need_more_worker(gcwq))
1881 goto sleep;
1882
1883 /* do we need to manage? */
1884 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1885 goto recheck;
1886
1887 /*
1888 * ->scheduled list can only be filled while a worker is
1889 * preparing to process a work or actually processing it.
1890 * Make sure nobody diddled with it while I was sleeping.
1891 */
1892 BUG_ON(!list_empty(&worker->scheduled));
1893
1894 /*
1895 * When control reaches this point, we're guaranteed to have
1896 * at least one idle worker or that someone else has already
1897 * assumed the manager role.
1898 */
1899 worker_clr_flags(worker, WORKER_PREP);
1900
1901 do {
1902 struct work_struct *work =
1903 list_first_entry(&gcwq->worklist,
1904 struct work_struct, entry);
1905
1906 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1907 /* optimization path, not strictly necessary */
1908 process_one_work(worker, work);
1909 if (unlikely(!list_empty(&worker->scheduled)))
1910 process_scheduled_works(worker);
1911 } else {
1912 move_linked_works(work, &worker->scheduled, NULL);
1913 process_scheduled_works(worker);
1914 }
1915 } while (keep_working(gcwq));
1916
1917 worker_set_flags(worker, WORKER_PREP, false);
1918sleep:
1919 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1920 goto recheck;
1921
1922 /*
1923 * gcwq->lock is held and there's no work to process and no
1924 * need to manage, sleep. Workers are woken up only while
1925 * holding gcwq->lock or from local cpu, so setting the
1926 * current state before releasing gcwq->lock is enough to
1927 * prevent losing any event.
1928 */
1929 worker_enter_idle(worker);
1930 __set_current_state(TASK_INTERRUPTIBLE);
1931 spin_unlock_irq(&gcwq->lock);
1932 schedule();
1933 goto woke_up;
1934}
1935
1936/**
1937 * rescuer_thread - the rescuer thread function
1938 * @__wq: the associated workqueue
1939 *
1940 * Workqueue rescuer thread function. There's one rescuer for each
1941 * workqueue which has WQ_RESCUER set.
1942 *
1943 * Regular work processing on a gcwq may block trying to create a new
 1944 * worker, which uses a GFP_KERNEL allocation and has a slight chance
 1945 * of developing into a deadlock if some works currently on the same
 1946 * queue need to be processed to satisfy the GFP_KERNEL allocation.
 1947 * This is the problem the rescuer solves.
 1948 *
 1949 * When such a condition is possible, the gcwq summons rescuers of all
 1950 * workqueues which have works queued on the gcwq and lets them process
1951 * those works so that forward progress can be guaranteed.
1952 *
1953 * This should happen rarely.
1954 */
1955static int rescuer_thread(void *__wq)
1956{
1957 struct workqueue_struct *wq = __wq;
1958 struct worker *rescuer = wq->rescuer;
1959 struct list_head *scheduled = &rescuer->scheduled;
1960 bool is_unbound = wq->flags & WQ_UNBOUND;
1961 unsigned int cpu;
1962
1963 set_user_nice(current, RESCUER_NICE_LEVEL);
1964repeat:
1965 set_current_state(TASK_INTERRUPTIBLE);
1966
1967 if (kthread_should_stop())
1968 return 0;
1969
1970 /*
 1971 * See whether any cpu is asking for help. Unbound
 1972 * workqueues use cpu 0 in mayday_mask for WORK_CPU_UNBOUND.
1973 */
1974 for_each_mayday_cpu(cpu, wq->mayday_mask) {
1975 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
1976 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
1977 struct global_cwq *gcwq = cwq->gcwq;
1978 struct work_struct *work, *n;
1979
1980 __set_current_state(TASK_RUNNING);
1981 mayday_clear_cpu(cpu, wq->mayday_mask);
1982
1983 /* migrate to the target cpu if possible */
1984 rescuer->gcwq = gcwq;
1985 worker_maybe_bind_and_lock(rescuer);
1986
1987 /*
1988 * Slurp in all works issued via this workqueue and
1989 * process'em.
1990 */
1991 BUG_ON(!list_empty(&rescuer->scheduled));
1992 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
1993 if (get_work_cwq(work) == cwq)
1994 move_linked_works(work, scheduled, &n);
1995
1996 process_scheduled_works(rescuer);
1997 spin_unlock_irq(&gcwq->lock);
1998 }
1999
2000 schedule();
2001 goto repeat;
459} 2002}
460 2003
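As a sketch of how a workqueue would opt into the rescuer described above (the queue name and init function are illustrative; WQ_RESCUER is the flag introduced by this series):

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *reclaim_wq;

static int __init reclaim_wq_init(void)
{
	/* a rescuer guarantees forward progress even under memory pressure */
	reclaim_wq = alloc_workqueue("reclaim", WQ_RESCUER, 1);
	return reclaim_wq ? 0 : -ENOMEM;
}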
461struct wq_barrier { 2004struct wq_barrier {
@@ -469,44 +2012,137 @@ static void wq_barrier_func(struct work_struct *work)
469 complete(&barr->done); 2012 complete(&barr->done);
470} 2013}
471 2014
2015/**
2016 * insert_wq_barrier - insert a barrier work
2017 * @cwq: cwq to insert barrier into
2018 * @barr: wq_barrier to insert
2019 * @target: target work to attach @barr to
2020 * @worker: worker currently executing @target, NULL if @target is not executing
2021 *
2022 * @barr is linked to @target such that @barr is completed only after
2023 * @target finishes execution. Please note that the ordering
2024 * guarantee is observed only with respect to @target and on the local
2025 * cpu.
2026 *
2027 * Currently, a queued barrier can't be canceled. This is because
2028 * try_to_grab_pending() can't determine whether the work to be
2029 * grabbed is at the head of the queue and thus can't clear LINKED
2030 * flag of the previous work while there must be a valid next work
2031 * after a work with LINKED flag set.
2032 *
2033 * Note that when @worker is non-NULL, @target may be modified
2034 * underneath us, so we can't reliably determine cwq from @target.
2035 *
2036 * CONTEXT:
2037 * spin_lock_irq(gcwq->lock).
2038 */
472static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2039static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
473 struct wq_barrier *barr, struct list_head *head) 2040 struct wq_barrier *barr,
2041 struct work_struct *target, struct worker *worker)
474{ 2042{
2043 struct list_head *head;
2044 unsigned int linked = 0;
2045
475 /* 2046 /*
476 * debugobject calls are safe here even with cwq->lock locked 2047 * debugobject calls are safe here even with gcwq->lock locked
477 * as we know for sure that this will not trigger any of the 2048 * as we know for sure that this will not trigger any of the
478 * checks and call back into the fixup functions where we 2049 * checks and call back into the fixup functions where we
479 * might deadlock. 2050 * might deadlock.
480 */ 2051 */
481 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2052 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
482 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 2053 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
483
484 init_completion(&barr->done); 2054 init_completion(&barr->done);
485 2055
2056 /*
2057 * If @target is currently being executed, schedule the
2058 * barrier to the worker; otherwise, put it after @target.
2059 */
2060 if (worker)
2061 head = worker->scheduled.next;
2062 else {
2063 unsigned long *bits = work_data_bits(target);
2064
2065 head = target->entry.next;
2066 /* there can already be other linked works, inherit and set */
2067 linked = *bits & WORK_STRUCT_LINKED;
2068 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2069 }
2070
486 debug_work_activate(&barr->work); 2071 debug_work_activate(&barr->work);
487 insert_work(cwq, &barr->work, head); 2072 insert_work(cwq, &barr->work, head,
2073 work_color_to_flags(WORK_NO_COLOR) | linked);
488} 2074}
489 2075
490static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 2076/**
2077 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2078 * @wq: workqueue being flushed
2079 * @flush_color: new flush color, < 0 for no-op
2080 * @work_color: new work color, < 0 for no-op
2081 *
2082 * Prepare cwqs for workqueue flushing.
2083 *
2084 * If @flush_color is non-negative, flush_color on all cwqs should be
2085 * -1. If no cwq has in-flight commands at the specified color, all
2086 * cwq->flush_color's stay at -1 and %false is returned. If any cwq
2087 * has in flight commands, its cwq->flush_color is set to
2088 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2089 * wakeup logic is armed and %true is returned.
2090 *
2091 * The caller should have initialized @wq->first_flusher prior to
2092 * calling this function with non-negative @flush_color. If
2093 * @flush_color is negative, no flush color update is done and %false
2094 * is returned.
2095 *
2096 * If @work_color is non-negative, all cwqs should have the same
2097 * work_color which is previous to @work_color and all will be
2098 * advanced to @work_color.
2099 *
2100 * CONTEXT:
2101 * mutex_lock(wq->flush_mutex).
2102 *
2103 * RETURNS:
2104 * %true if @flush_color >= 0 and there's something to flush. %false
2105 * otherwise.
2106 */
2107static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2108 int flush_color, int work_color)
491{ 2109{
492 int active = 0; 2110 bool wait = false;
493 struct wq_barrier barr; 2111 unsigned int cpu;
494
495 WARN_ON(cwq->thread == current);
496 2112
497 spin_lock_irq(&cwq->lock); 2113 if (flush_color >= 0) {
498 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 2114 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
499 insert_wq_barrier(cwq, &barr, &cwq->worklist); 2115 atomic_set(&wq->nr_cwqs_to_flush, 1);
500 active = 1;
501 } 2116 }
502 spin_unlock_irq(&cwq->lock);
503 2117
504 if (active) { 2118 for_each_cwq_cpu(cpu, wq) {
505 wait_for_completion(&barr.done); 2119 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
506 destroy_work_on_stack(&barr.work); 2120 struct global_cwq *gcwq = cwq->gcwq;
2121
2122 spin_lock_irq(&gcwq->lock);
2123
2124 if (flush_color >= 0) {
2125 BUG_ON(cwq->flush_color != -1);
2126
2127 if (cwq->nr_in_flight[flush_color]) {
2128 cwq->flush_color = flush_color;
2129 atomic_inc(&wq->nr_cwqs_to_flush);
2130 wait = true;
2131 }
2132 }
2133
2134 if (work_color >= 0) {
2135 BUG_ON(work_color != work_next_color(cwq->work_color));
2136 cwq->work_color = work_color;
2137 }
2138
2139 spin_unlock_irq(&gcwq->lock);
507 } 2140 }
508 2141
509 return active; 2142 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2143 complete(&wq->first_flusher->done);
2144
2145 return wait;
510} 2146}
511 2147
512/** 2148/**
@@ -518,20 +2154,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
518 * 2154 *
519 * We sleep until all works which were queued on entry have been handled, 2155 * We sleep until all works which were queued on entry have been handled,
520 * but we are not livelocked by new incoming ones. 2156 * but we are not livelocked by new incoming ones.
521 *
522 * This function used to run the workqueues itself. Now we just wait for the
523 * helper threads to do it.
524 */ 2157 */
525void flush_workqueue(struct workqueue_struct *wq) 2158void flush_workqueue(struct workqueue_struct *wq)
526{ 2159{
527 const struct cpumask *cpu_map = wq_cpu_map(wq); 2160 struct wq_flusher this_flusher = {
528 int cpu; 2161 .list = LIST_HEAD_INIT(this_flusher.list),
2162 .flush_color = -1,
2163 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2164 };
2165 int next_color;
529 2166
530 might_sleep();
531 lock_map_acquire(&wq->lockdep_map); 2167 lock_map_acquire(&wq->lockdep_map);
532 lock_map_release(&wq->lockdep_map); 2168 lock_map_release(&wq->lockdep_map);
533 for_each_cpu(cpu, cpu_map) 2169
534 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 2170 mutex_lock(&wq->flush_mutex);
2171
2172 /*
2173 * Start-to-wait phase
2174 */
2175 next_color = work_next_color(wq->work_color);
2176
2177 if (next_color != wq->flush_color) {
2178 /*
2179 * Color space is not full. The current work_color
2180 * becomes our flush_color and work_color is advanced
2181 * by one.
2182 */
2183 BUG_ON(!list_empty(&wq->flusher_overflow));
2184 this_flusher.flush_color = wq->work_color;
2185 wq->work_color = next_color;
2186
2187 if (!wq->first_flusher) {
2188 /* no flush in progress, become the first flusher */
2189 BUG_ON(wq->flush_color != this_flusher.flush_color);
2190
2191 wq->first_flusher = &this_flusher;
2192
2193 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2194 wq->work_color)) {
2195 /* nothing to flush, done */
2196 wq->flush_color = next_color;
2197 wq->first_flusher = NULL;
2198 goto out_unlock;
2199 }
2200 } else {
2201 /* wait in queue */
2202 BUG_ON(wq->flush_color == this_flusher.flush_color);
2203 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2204 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2205 }
2206 } else {
2207 /*
2208 * Oops, color space is full, wait on overflow queue.
2209 * The next flush completion will assign us
2210 * flush_color and transfer to flusher_queue.
2211 */
2212 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2213 }
2214
2215 mutex_unlock(&wq->flush_mutex);
2216
2217 wait_for_completion(&this_flusher.done);
2218
2219 /*
2220 * Wake-up-and-cascade phase
2221 *
2222 * First flushers are responsible for cascading flushes and
2223 * handling overflow. Non-first flushers can simply return.
2224 */
2225 if (wq->first_flusher != &this_flusher)
2226 return;
2227
2228 mutex_lock(&wq->flush_mutex);
2229
2230 /* we might have raced, check again with mutex held */
2231 if (wq->first_flusher != &this_flusher)
2232 goto out_unlock;
2233
2234 wq->first_flusher = NULL;
2235
2236 BUG_ON(!list_empty(&this_flusher.list));
2237 BUG_ON(wq->flush_color != this_flusher.flush_color);
2238
2239 while (true) {
2240 struct wq_flusher *next, *tmp;
2241
2242 /* complete all the flushers sharing the current flush color */
2243 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2244 if (next->flush_color != wq->flush_color)
2245 break;
2246 list_del_init(&next->list);
2247 complete(&next->done);
2248 }
2249
2250 BUG_ON(!list_empty(&wq->flusher_overflow) &&
2251 wq->flush_color != work_next_color(wq->work_color));
2252
2253 /* this flush_color is finished, advance by one */
2254 wq->flush_color = work_next_color(wq->flush_color);
2255
2256 /* one color has been freed, handle overflow queue */
2257 if (!list_empty(&wq->flusher_overflow)) {
2258 /*
2259 * Assign the same color to all overflowed
2260 * flushers, advance work_color and append to
2261 * flusher_queue. This is the start-to-wait
2262 * phase for these overflowed flushers.
2263 */
2264 list_for_each_entry(tmp, &wq->flusher_overflow, list)
2265 tmp->flush_color = wq->work_color;
2266
2267 wq->work_color = work_next_color(wq->work_color);
2268
2269 list_splice_tail_init(&wq->flusher_overflow,
2270 &wq->flusher_queue);
2271 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2272 }
2273
2274 if (list_empty(&wq->flusher_queue)) {
2275 BUG_ON(wq->flush_color != wq->work_color);
2276 break;
2277 }
2278
2279 /*
2280 * Need to flush more colors. Make the next flusher
2281 * the new first flusher and arm cwqs.
2282 */
2283 BUG_ON(wq->flush_color == wq->work_color);
2284 BUG_ON(wq->flush_color != next->flush_color);
2285
2286 list_del_init(&next->list);
2287 wq->first_flusher = next;
2288
2289 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2290 break;
2291
2292 /*
2293 * Meh... this color is already done, clear first
2294 * flusher and repeat cascading.
2295 */
2296 wq->first_flusher = NULL;
2297 }
2298
2299out_unlock:
2300 mutex_unlock(&wq->flush_mutex);
535} 2301}
536EXPORT_SYMBOL_GPL(flush_workqueue); 2302EXPORT_SYMBOL_GPL(flush_workqueue);
537 2303
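From a caller's point of view flush_workqueue() keeps its old meaning; a minimal sketch of draining everything queued so far before teardown (function name illustrative):

#include <linux/workqueue.h>

static void drain_and_destroy(struct workqueue_struct *wq)
{
	flush_workqueue(wq);	/* sleeps until works queued on entry have finished */
	destroy_workqueue(wq);
}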
@@ -547,43 +2313,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
547 */ 2313 */
548int flush_work(struct work_struct *work) 2314int flush_work(struct work_struct *work)
549{ 2315{
2316 struct worker *worker = NULL;
2317 struct global_cwq *gcwq;
550 struct cpu_workqueue_struct *cwq; 2318 struct cpu_workqueue_struct *cwq;
551 struct list_head *prev;
552 struct wq_barrier barr; 2319 struct wq_barrier barr;
553 2320
554 might_sleep(); 2321 might_sleep();
555 cwq = get_wq_data(work); 2322 gcwq = get_work_gcwq(work);
556 if (!cwq) 2323 if (!gcwq)
557 return 0; 2324 return 0;
558 2325
559 lock_map_acquire(&cwq->wq->lockdep_map); 2326 spin_lock_irq(&gcwq->lock);
560 lock_map_release(&cwq->wq->lockdep_map);
561
562 prev = NULL;
563 spin_lock_irq(&cwq->lock);
564 if (!list_empty(&work->entry)) { 2327 if (!list_empty(&work->entry)) {
565 /* 2328 /*
566 * See the comment near try_to_grab_pending()->smp_rmb(). 2329 * See the comment near try_to_grab_pending()->smp_rmb().
567 * If it was re-queued under us we are not going to wait. 2330 * If it was re-queued to a different gcwq under us, we
2331 * are not going to wait.
568 */ 2332 */
569 smp_rmb(); 2333 smp_rmb();
570 if (unlikely(cwq != get_wq_data(work))) 2334 cwq = get_work_cwq(work);
571 goto out; 2335 if (unlikely(!cwq || gcwq != cwq->gcwq))
572 prev = &work->entry; 2336 goto already_gone;
573 } else { 2337 } else {
574 if (cwq->current_work != work) 2338 worker = find_worker_executing_work(gcwq, work);
575 goto out; 2339 if (!worker)
576 prev = &cwq->worklist; 2340 goto already_gone;
2341 cwq = worker->current_cwq;
577 } 2342 }
578 insert_wq_barrier(cwq, &barr, prev->next); 2343
579out: 2344 insert_wq_barrier(cwq, &barr, work, worker);
580 spin_unlock_irq(&cwq->lock); 2345 spin_unlock_irq(&gcwq->lock);
581 if (!prev) 2346
582 return 0; 2347 lock_map_acquire(&cwq->wq->lockdep_map);
2348 lock_map_release(&cwq->wq->lockdep_map);
583 2349
584 wait_for_completion(&barr.done); 2350 wait_for_completion(&barr.done);
585 destroy_work_on_stack(&barr.work); 2351 destroy_work_on_stack(&barr.work);
586 return 1; 2352 return 1;
2353already_gone:
2354 spin_unlock_irq(&gcwq->lock);
2355 return 0;
587} 2356}
588EXPORT_SYMBOL_GPL(flush_work); 2357EXPORT_SYMBOL_GPL(flush_work);
589 2358
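flush_work() still waits on a single work item rather than the whole queue; a hedged sketch of a caller relying on the 1/0 return shown above (the wrapper name is illustrative):

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void sync_one_item(struct work_struct *work)
{
	if (!flush_work(work))
		pr_debug("work was neither queued nor running\n");
}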
@@ -593,54 +2362,55 @@ EXPORT_SYMBOL_GPL(flush_work);
593 */ 2362 */
594static int try_to_grab_pending(struct work_struct *work) 2363static int try_to_grab_pending(struct work_struct *work)
595{ 2364{
596 struct cpu_workqueue_struct *cwq; 2365 struct global_cwq *gcwq;
597 int ret = -1; 2366 int ret = -1;
598 2367
599 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 2368 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
600 return 0; 2369 return 0;
601 2370
602 /* 2371 /*
603 * The queueing is in progress, or it is already queued. Try to 2372 * The queueing is in progress, or it is already queued. Try to
604 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 2373 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
605 */ 2374 */
606 2375 gcwq = get_work_gcwq(work);
607 cwq = get_wq_data(work); 2376 if (!gcwq)
608 if (!cwq)
609 return ret; 2377 return ret;
610 2378
611 spin_lock_irq(&cwq->lock); 2379 spin_lock_irq(&gcwq->lock);
612 if (!list_empty(&work->entry)) { 2380 if (!list_empty(&work->entry)) {
613 /* 2381 /*
614 * This work is queued, but perhaps we locked the wrong cwq. 2382 * This work is queued, but perhaps we locked the wrong gcwq.
615 * In that case we must see the new value after rmb(), see 2383 * In that case we must see the new value after rmb(), see
616 * insert_work()->wmb(). 2384 * insert_work()->wmb().
617 */ 2385 */
618 smp_rmb(); 2386 smp_rmb();
619 if (cwq == get_wq_data(work)) { 2387 if (gcwq == get_work_gcwq(work)) {
620 debug_work_deactivate(work); 2388 debug_work_deactivate(work);
621 list_del_init(&work->entry); 2389 list_del_init(&work->entry);
2390 cwq_dec_nr_in_flight(get_work_cwq(work),
2391 get_work_color(work));
622 ret = 1; 2392 ret = 1;
623 } 2393 }
624 } 2394 }
625 spin_unlock_irq(&cwq->lock); 2395 spin_unlock_irq(&gcwq->lock);
626 2396
627 return ret; 2397 return ret;
628} 2398}
629 2399
630static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, 2400static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
631 struct work_struct *work)
632{ 2401{
633 struct wq_barrier barr; 2402 struct wq_barrier barr;
634 int running = 0; 2403 struct worker *worker;
635 2404
636 spin_lock_irq(&cwq->lock); 2405 spin_lock_irq(&gcwq->lock);
637 if (unlikely(cwq->current_work == work)) {
638 insert_wq_barrier(cwq, &barr, cwq->worklist.next);
639 running = 1;
640 }
641 spin_unlock_irq(&cwq->lock);
642 2406
643 if (unlikely(running)) { 2407 worker = find_worker_executing_work(gcwq, work);
2408 if (unlikely(worker))
2409 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2410
2411 spin_unlock_irq(&gcwq->lock);
2412
2413 if (unlikely(worker)) {
644 wait_for_completion(&barr.done); 2414 wait_for_completion(&barr.done);
645 destroy_work_on_stack(&barr.work); 2415 destroy_work_on_stack(&barr.work);
646 } 2416 }
@@ -648,9 +2418,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
648 2418
649static void wait_on_work(struct work_struct *work) 2419static void wait_on_work(struct work_struct *work)
650{ 2420{
651 struct cpu_workqueue_struct *cwq;
652 struct workqueue_struct *wq;
653 const struct cpumask *cpu_map;
654 int cpu; 2421 int cpu;
655 2422
656 might_sleep(); 2423 might_sleep();
@@ -658,15 +2425,8 @@ static void wait_on_work(struct work_struct *work)
658 lock_map_acquire(&work->lockdep_map); 2425 lock_map_acquire(&work->lockdep_map);
659 lock_map_release(&work->lockdep_map); 2426 lock_map_release(&work->lockdep_map);
660 2427
661 cwq = get_wq_data(work); 2428 for_each_gcwq_cpu(cpu)
662 if (!cwq) 2429 wait_on_cpu_work(get_gcwq(cpu), work);
663 return;
664
665 wq = cwq->wq;
666 cpu_map = wq_cpu_map(wq);
667
668 for_each_cpu(cpu, cpu_map)
669 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
670} 2430}
671 2431
672static int __cancel_work_timer(struct work_struct *work, 2432static int __cancel_work_timer(struct work_struct *work,
@@ -681,7 +2441,7 @@ static int __cancel_work_timer(struct work_struct *work,
681 wait_on_work(work); 2441 wait_on_work(work);
682 } while (unlikely(ret < 0)); 2442 } while (unlikely(ret < 0));
683 2443
684 clear_wq_data(work); 2444 clear_work_data(work);
685 return ret; 2445 return ret;
686} 2446}
687 2447
@@ -727,8 +2487,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork)
727} 2487}
728EXPORT_SYMBOL(cancel_delayed_work_sync); 2488EXPORT_SYMBOL(cancel_delayed_work_sync);
729 2489
730static struct workqueue_struct *keventd_wq __read_mostly;
731
732/** 2490/**
733 * schedule_work - put work task in global workqueue 2491 * schedule_work - put work task in global workqueue
734 * @work: job to be done 2492 * @work: job to be done
@@ -742,7 +2500,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
742 */ 2500 */
743int schedule_work(struct work_struct *work) 2501int schedule_work(struct work_struct *work)
744{ 2502{
745 return queue_work(keventd_wq, work); 2503 return queue_work(system_wq, work);
746} 2504}
747EXPORT_SYMBOL(schedule_work); 2505EXPORT_SYMBOL(schedule_work);
748 2506
@@ -755,7 +2513,7 @@ EXPORT_SYMBOL(schedule_work);
755 */ 2513 */
756int schedule_work_on(int cpu, struct work_struct *work) 2514int schedule_work_on(int cpu, struct work_struct *work)
757{ 2515{
758 return queue_work_on(cpu, keventd_wq, work); 2516 return queue_work_on(cpu, system_wq, work);
759} 2517}
760EXPORT_SYMBOL(schedule_work_on); 2518EXPORT_SYMBOL(schedule_work_on);
761 2519
@@ -770,7 +2528,7 @@ EXPORT_SYMBOL(schedule_work_on);
770int schedule_delayed_work(struct delayed_work *dwork, 2528int schedule_delayed_work(struct delayed_work *dwork,
771 unsigned long delay) 2529 unsigned long delay)
772{ 2530{
773 return queue_delayed_work(keventd_wq, dwork, delay); 2531 return queue_delayed_work(system_wq, dwork, delay);
774} 2532}
775EXPORT_SYMBOL(schedule_delayed_work); 2533EXPORT_SYMBOL(schedule_delayed_work);
776 2534
@@ -783,9 +2541,8 @@ EXPORT_SYMBOL(schedule_delayed_work);
783void flush_delayed_work(struct delayed_work *dwork) 2541void flush_delayed_work(struct delayed_work *dwork)
784{ 2542{
785 if (del_timer_sync(&dwork->timer)) { 2543 if (del_timer_sync(&dwork->timer)) {
786 struct cpu_workqueue_struct *cwq; 2544 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); 2545 &dwork->work);
788 __queue_work(cwq, &dwork->work);
789 put_cpu(); 2546 put_cpu();
790 } 2547 }
791 flush_work(&dwork->work); 2548 flush_work(&dwork->work);
@@ -804,7 +2561,7 @@ EXPORT_SYMBOL(flush_delayed_work);
804int schedule_delayed_work_on(int cpu, 2561int schedule_delayed_work_on(int cpu,
805 struct delayed_work *dwork, unsigned long delay) 2562 struct delayed_work *dwork, unsigned long delay)
806{ 2563{
807 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 2564 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
808} 2565}
809EXPORT_SYMBOL(schedule_delayed_work_on); 2566EXPORT_SYMBOL(schedule_delayed_work_on);
810 2567
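The schedule_*() helpers now simply target system_wq. A self-rearming poller is the classic use; the names and interval below are illustrative:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void my_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_poll, my_poll_fn);

static void my_poll_fn(struct work_struct *work)
{
	/* ... poll hardware ... */
	schedule_delayed_work(&my_poll, msecs_to_jiffies(100));	/* re-arm on system_wq */
}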
@@ -820,8 +2577,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
820int schedule_on_each_cpu(work_func_t func) 2577int schedule_on_each_cpu(work_func_t func)
821{ 2578{
822 int cpu; 2579 int cpu;
823 int orig = -1; 2580 struct work_struct __percpu *works;
824 struct work_struct *works;
825 2581
826 works = alloc_percpu(struct work_struct); 2582 works = alloc_percpu(struct work_struct);
827 if (!works) 2583 if (!works)
@@ -829,23 +2585,12 @@ int schedule_on_each_cpu(work_func_t func)
829 2585
830 get_online_cpus(); 2586 get_online_cpus();
831 2587
832 /*
833 * When running in keventd don't schedule a work item on
834 * itself. Can just call directly because the work queue is
835 * already bound. This also is faster.
836 */
837 if (current_is_keventd())
838 orig = raw_smp_processor_id();
839
840 for_each_online_cpu(cpu) { 2588 for_each_online_cpu(cpu) {
841 struct work_struct *work = per_cpu_ptr(works, cpu); 2589 struct work_struct *work = per_cpu_ptr(works, cpu);
842 2590
843 INIT_WORK(work, func); 2591 INIT_WORK(work, func);
844 if (cpu != orig) 2592 schedule_work_on(cpu, work);
845 schedule_work_on(cpu, work);
846 } 2593 }
847 if (orig >= 0)
848 func(per_cpu_ptr(works, orig));
849 2594
850 for_each_online_cpu(cpu) 2595 for_each_online_cpu(cpu)
851 flush_work(per_cpu_ptr(works, cpu)); 2596 flush_work(per_cpu_ptr(works, cpu));
@@ -881,7 +2626,7 @@ int schedule_on_each_cpu(work_func_t func)
881 */ 2626 */
882void flush_scheduled_work(void) 2627void flush_scheduled_work(void)
883{ 2628{
884 flush_workqueue(keventd_wq); 2629 flush_workqueue(system_wq);
885} 2630}
886EXPORT_SYMBOL(flush_scheduled_work); 2631EXPORT_SYMBOL(flush_scheduled_work);
887 2632
@@ -913,170 +2658,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
913 2658
914int keventd_up(void) 2659int keventd_up(void)
915{ 2660{
916 return keventd_wq != NULL; 2661 return system_wq != NULL;
917} 2662}
918 2663
919int current_is_keventd(void) 2664static int alloc_cwqs(struct workqueue_struct *wq)
920{ 2665{
921 struct cpu_workqueue_struct *cwq; 2666 /*
922 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 2667 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
923 int ret = 0; 2668 * Make sure that the alignment isn't lower than that of
924 2669 * unsigned long long.
925 BUG_ON(!keventd_wq); 2670 */
2671 const size_t size = sizeof(struct cpu_workqueue_struct);
2672 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2673 __alignof__(unsigned long long));
2674#ifdef CONFIG_SMP
2675 bool percpu = !(wq->flags & WQ_UNBOUND);
2676#else
2677 bool percpu = false;
2678#endif
926 2679
927 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); 2680 if (percpu)
928 if (current == cwq->thread) 2681 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
929 ret = 1; 2682 else {
2683 void *ptr;
930 2684
931 return ret; 2685 /*
2686 * Allocate enough room to align cwq and put an extra
2687 * pointer at the end pointing back to the originally
2688 * allocated pointer which will be used for free.
2689 */
2690 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2691 if (ptr) {
2692 wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2693 *(void **)(wq->cpu_wq.single + 1) = ptr;
2694 }
2695 }
932 2696
2697 /* just in case, make sure it's actually aligned */
2698 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2699 return wq->cpu_wq.v ? 0 : -ENOMEM;
933} 2700}
934 2701
935static struct cpu_workqueue_struct * 2702static void free_cwqs(struct workqueue_struct *wq)
936init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
937{ 2703{
938 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 2704#ifdef CONFIG_SMP
939 2705 bool percpu = !(wq->flags & WQ_UNBOUND);
940 cwq->wq = wq; 2706#else
941 spin_lock_init(&cwq->lock); 2707 bool percpu = false;
942 INIT_LIST_HEAD(&cwq->worklist); 2708#endif
943 init_waitqueue_head(&cwq->more_work);
944 2709
945 return cwq; 2710 if (percpu)
2711 free_percpu(wq->cpu_wq.pcpu);
2712 else if (wq->cpu_wq.single) {
2713 /* the pointer to free is stored right after the cwq */
2714 kfree(*(void **)(wq->cpu_wq.single + 1));
2715 }
946} 2716}
947 2717
948static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2718static int wq_clamp_max_active(int max_active, unsigned int flags,
2719 const char *name)
949{ 2720{
950 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 2721 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
951 struct workqueue_struct *wq = cwq->wq;
952 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
953 struct task_struct *p;
954 2722
955 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 2723 if (max_active < 1 || max_active > lim)
956 /* 2724 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
957 * Nobody can add the work_struct to this cwq, 2725 "is out of range, clamping between %d and %d\n",
958 * if (caller is __create_workqueue) 2726 max_active, name, 1, lim);
959 * nobody should see this wq
960 * else // caller is CPU_UP_PREPARE
961 * cpu is not on cpu_online_map
962 * so we can abort safely.
963 */
964 if (IS_ERR(p))
965 return PTR_ERR(p);
966 if (cwq->wq->rt)
967 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
968 cwq->thread = p;
969
970 trace_workqueue_creation(cwq->thread, cpu);
971 2727
972 return 0; 2728 return clamp_val(max_active, 1, lim);
973} 2729}
974 2730
975static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2731struct workqueue_struct *__alloc_workqueue_key(const char *name,
2732 unsigned int flags,
2733 int max_active,
2734 struct lock_class_key *key,
2735 const char *lock_name)
976{ 2736{
977 struct task_struct *p = cwq->thread; 2737 struct workqueue_struct *wq;
2738 unsigned int cpu;
978 2739
979 if (p != NULL) { 2740 /*
980 if (cpu >= 0) 2741 * Unbound workqueues aren't concurrency managed and should be
981 kthread_bind(p, cpu); 2742 * dispatched to workers immediately.
982 wake_up_process(p); 2743 */
983 } 2744 if (flags & WQ_UNBOUND)
984} 2745 flags |= WQ_HIGHPRI;
985 2746
986struct workqueue_struct *__create_workqueue_key(const char *name, 2747 max_active = max_active ?: WQ_DFL_ACTIVE;
987 int singlethread, 2748 max_active = wq_clamp_max_active(max_active, flags, name);
988 int freezeable,
989 int rt,
990 struct lock_class_key *key,
991 const char *lock_name)
992{
993 struct workqueue_struct *wq;
994 struct cpu_workqueue_struct *cwq;
995 int err = 0, cpu;
996 2749
997 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 2750 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
998 if (!wq) 2751 if (!wq)
999 return NULL; 2752 goto err;
1000 2753
1001 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 2754 wq->flags = flags;
1002 if (!wq->cpu_wq) { 2755 wq->saved_max_active = max_active;
1003 kfree(wq); 2756 mutex_init(&wq->flush_mutex);
1004 return NULL; 2757 atomic_set(&wq->nr_cwqs_to_flush, 0);
1005 } 2758 INIT_LIST_HEAD(&wq->flusher_queue);
2759 INIT_LIST_HEAD(&wq->flusher_overflow);
1006 2760
1007 wq->name = name; 2761 wq->name = name;
1008 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2762 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
1009 wq->singlethread = singlethread;
1010 wq->freezeable = freezeable;
1011 wq->rt = rt;
1012 INIT_LIST_HEAD(&wq->list); 2763 INIT_LIST_HEAD(&wq->list);
1013 2764
1014 if (singlethread) { 2765 if (alloc_cwqs(wq) < 0)
1015 cwq = init_cpu_workqueue(wq, singlethread_cpu); 2766 goto err;
1016 err = create_workqueue_thread(cwq, singlethread_cpu); 2767
1017 start_workqueue_thread(cwq, -1); 2768 for_each_cwq_cpu(cpu, wq) {
1018 } else { 2769 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1019 cpu_maps_update_begin(); 2770 struct global_cwq *gcwq = get_gcwq(cpu);
1020 /* 2771
1021 * We must place this wq on list even if the code below fails. 2772 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
1022 * cpu_down(cpu) can remove cpu from cpu_populated_map before 2773 cwq->gcwq = gcwq;
1023 * destroy_workqueue() takes the lock, in that case we leak 2774 cwq->wq = wq;
1024 * cwq[cpu]->thread. 2775 cwq->flush_color = -1;
1025 */ 2776 cwq->max_active = max_active;
1026 spin_lock(&workqueue_lock); 2777 INIT_LIST_HEAD(&cwq->delayed_works);
1027 list_add(&wq->list, &workqueues);
1028 spin_unlock(&workqueue_lock);
1029 /*
1030 * We must initialize cwqs for each possible cpu even if we
1031 * are going to call destroy_workqueue() finally. Otherwise
1032 * cpu_up() can hit the uninitialized cwq once we drop the
1033 * lock.
1034 */
1035 for_each_possible_cpu(cpu) {
1036 cwq = init_cpu_workqueue(wq, cpu);
1037 if (err || !cpu_online(cpu))
1038 continue;
1039 err = create_workqueue_thread(cwq, cpu);
1040 start_workqueue_thread(cwq, cpu);
1041 }
1042 cpu_maps_update_done();
1043 } 2778 }
1044 2779
1045 if (err) { 2780 if (flags & WQ_RESCUER) {
1046 destroy_workqueue(wq); 2781 struct worker *rescuer;
1047 wq = NULL; 2782
2783 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2784 goto err;
2785
2786 wq->rescuer = rescuer = alloc_worker();
2787 if (!rescuer)
2788 goto err;
2789
2790 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2791 if (IS_ERR(rescuer->task))
2792 goto err;
2793
2794 wq->rescuer = rescuer;
2795 rescuer->task->flags |= PF_THREAD_BOUND;
2796 wake_up_process(rescuer->task);
1048 } 2797 }
1049 return wq;
1050}
1051EXPORT_SYMBOL_GPL(__create_workqueue_key);
1052 2798
1053static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1054{
1055 /* 2799 /*
1056 * Our caller is either destroy_workqueue() or CPU_POST_DEAD, 2800 * workqueue_lock protects global freeze state and workqueues
1057 * cpu_add_remove_lock protects cwq->thread. 2801 * list. Grab it, set max_active accordingly and add the new
2802 * workqueue to workqueues list.
1058 */ 2803 */
1059 if (cwq->thread == NULL) 2804 spin_lock(&workqueue_lock);
1060 return;
1061 2805
1062 lock_map_acquire(&cwq->wq->lockdep_map); 2806 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
1063 lock_map_release(&cwq->wq->lockdep_map); 2807 for_each_cwq_cpu(cpu, wq)
2808 get_cwq(cpu, wq)->max_active = 0;
1064 2809
1065 flush_cpu_workqueue(cwq); 2810 list_add(&wq->list, &workqueues);
1066 /* 2811
1067 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, 2812 spin_unlock(&workqueue_lock);
1068 * a concurrent flush_workqueue() can insert a barrier after us. 2813
1069 * However, in that case run_workqueue() won't return and check 2814 return wq;
1070 * kthread_should_stop() until it flushes all work_struct's. 2815err:
1071 * When ->worklist becomes empty it is safe to exit because no 2816 if (wq) {
1072 * more work_structs can be queued on this cwq: flush_workqueue 2817 free_cwqs(wq);
1073 * checks list_empty(), and a "normal" queue_work() can't use 2818 free_mayday_mask(wq->mayday_mask);
1074 * a dead CPU. 2819 kfree(wq->rescuer);
1075 */ 2820 kfree(wq);
1076 trace_workqueue_destruction(cwq->thread); 2821 }
1077 kthread_stop(cwq->thread); 2822 return NULL;
1078 cwq->thread = NULL;
1079} 2823}
2824EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
1080 2825
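Callers reach __alloc_workqueue_key() through the alloc_workqueue() wrapper in include/linux/workqueue.h; a minimal sketch, with the queue name, flags and max_active chosen only for illustration:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *crypt_wq;

static int crypt_wq_init(void)
{
	/* up to 16 in-flight work items per cpu; 0 would mean WQ_DFL_ACTIVE */
	crypt_wq = alloc_workqueue("crypt", WQ_FREEZEABLE, 16);
	if (!crypt_wq)
		return -ENOMEM;
	return 0;
}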
1081/** 2826/**
1082 * destroy_workqueue - safely terminate a workqueue 2827 * destroy_workqueue - safely terminate a workqueue
@@ -1086,72 +2831,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1086 */ 2831 */
1087void destroy_workqueue(struct workqueue_struct *wq) 2832void destroy_workqueue(struct workqueue_struct *wq)
1088{ 2833{
1089 const struct cpumask *cpu_map = wq_cpu_map(wq); 2834 unsigned int cpu;
1090 int cpu; 2835
2836 flush_workqueue(wq);
1091 2837
1092 cpu_maps_update_begin(); 2838 /*
2839 * wq list is used to freeze wq, remove from list after
2840 * flushing is complete in case freeze races us.
2841 */
1093 spin_lock(&workqueue_lock); 2842 spin_lock(&workqueue_lock);
1094 list_del(&wq->list); 2843 list_del(&wq->list);
1095 spin_unlock(&workqueue_lock); 2844 spin_unlock(&workqueue_lock);
1096 2845
1097 for_each_cpu(cpu, cpu_map) 2846 /* sanity check */
1098 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 2847 for_each_cwq_cpu(cpu, wq) {
1099 cpu_maps_update_done(); 2848 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2849 int i;
2850
2851 for (i = 0; i < WORK_NR_COLORS; i++)
2852 BUG_ON(cwq->nr_in_flight[i]);
2853 BUG_ON(cwq->nr_active);
2854 BUG_ON(!list_empty(&cwq->delayed_works));
2855 }
2856
2857 if (wq->flags & WQ_RESCUER) {
2858 kthread_stop(wq->rescuer->task);
2859 free_mayday_mask(wq->mayday_mask);
2860 }
1100 2861
1101 free_percpu(wq->cpu_wq); 2862 free_cwqs(wq);
1102 kfree(wq); 2863 kfree(wq);
1103} 2864}
1104EXPORT_SYMBOL_GPL(destroy_workqueue); 2865EXPORT_SYMBOL_GPL(destroy_workqueue);
1105 2866
2867/**
2868 * workqueue_set_max_active - adjust max_active of a workqueue
2869 * @wq: target workqueue
2870 * @max_active: new max_active value.
2871 *
2872 * Set max_active of @wq to @max_active.
2873 *
2874 * CONTEXT:
2875 * Don't call from IRQ context.
2876 */
2877void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2878{
2879 unsigned int cpu;
2880
2881 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
2882
2883 spin_lock(&workqueue_lock);
2884
2885 wq->saved_max_active = max_active;
2886
2887 for_each_cwq_cpu(cpu, wq) {
2888 struct global_cwq *gcwq = get_gcwq(cpu);
2889
2890 spin_lock_irq(&gcwq->lock);
2891
2892 if (!(wq->flags & WQ_FREEZEABLE) ||
2893 !(gcwq->flags & GCWQ_FREEZING))
2894 get_cwq(gcwq->cpu, wq)->max_active = max_active;
2895
2896 spin_unlock_irq(&gcwq->lock);
2897 }
2898
2899 spin_unlock(&workqueue_lock);
2900}
2901EXPORT_SYMBOL_GPL(workqueue_set_max_active);
2902
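A hedged example of tuning an existing queue at runtime with the helper above (the wrapper is illustrative):

#include <linux/workqueue.h>

static void throttle_queue(struct workqueue_struct *wq)
{
	/* value is clamped to [1, WQ_MAX_ACTIVE] by wq_clamp_max_active() */
	workqueue_set_max_active(wq, 1);
}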
2903/**
2904 * workqueue_congested - test whether a workqueue is congested
2905 * @cpu: CPU in question
2906 * @wq: target workqueue
2907 *
2908 * Test whether @wq's cpu workqueue for @cpu is congested. There is
2909 * no synchronization around this function and the test result is
2910 * unreliable and only useful as advisory hints or for debugging.
2911 *
2912 * RETURNS:
2913 * %true if congested, %false otherwise.
2914 */
2915bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
2916{
2917 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2918
2919 return !list_empty(&cwq->delayed_works);
2920}
2921EXPORT_SYMBOL_GPL(workqueue_congested);
2922
2923/**
2924 * work_cpu - return the last known associated cpu for @work
2925 * @work: the work of interest
2926 *
2927 * RETURNS:
2928 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
2929 */
2930unsigned int work_cpu(struct work_struct *work)
2931{
2932 struct global_cwq *gcwq = get_work_gcwq(work);
2933
2934 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
2935}
2936EXPORT_SYMBOL_GPL(work_cpu);
2937
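Both helpers above are advisory only; a sketch of using the congestion hint to back off (function name illustrative):

#include <linux/types.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static bool should_defer_submission(struct workqueue_struct *wq)
{
	bool congested = workqueue_congested(get_cpu(), wq);

	put_cpu();
	return congested;	/* unreliable hint, never a correctness guarantee */
}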
2938/**
2939 * work_busy - test whether a work is currently pending or running
2940 * @work: the work to be tested
2941 *
2942 * Test whether @work is currently pending or running. There is no
2943 * synchronization around this function and the test result is
2944 * unreliable and only useful as advisory hints or for debugging.
2945 * Especially for reentrant wqs, the pending state might hide the
2946 * running state.
2947 *
2948 * RETURNS:
2949 * OR'd bitmask of WORK_BUSY_* bits.
2950 */
2951unsigned int work_busy(struct work_struct *work)
2952{
2953 struct global_cwq *gcwq = get_work_gcwq(work);
2954 unsigned long flags;
2955 unsigned int ret = 0;
2956
2957 if (!gcwq)
2958 return false;
2959
2960 spin_lock_irqsave(&gcwq->lock, flags);
2961
2962 if (work_pending(work))
2963 ret |= WORK_BUSY_PENDING;
2964 if (find_worker_executing_work(gcwq, work))
2965 ret |= WORK_BUSY_RUNNING;
2966
2967 spin_unlock_irqrestore(&gcwq->lock, flags);
2968
2969 return ret;
2970}
2971EXPORT_SYMBOL_GPL(work_busy);
2972
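A sketch of interpreting the work_busy() bitmask for debugging (the reporting function is illustrative):

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void report_work_state(struct work_struct *work)
{
	unsigned int busy = work_busy(work);

	if (busy & WORK_BUSY_PENDING)
		pr_debug("work is queued but not yet running\n");
	if (busy & WORK_BUSY_RUNNING)
		pr_debug("work is currently executing\n");
}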
2973/*
2974 * CPU hotplug.
2975 *
2976 * There are two challenges in supporting CPU hotplug. Firstly, there
2977 * are a lot of assumptions on strong associations among work, cwq and
2978 * gcwq which make migrating pending and scheduled works very
2979 * difficult to implement without impacting hot paths. Secondly,
2980 * gcwqs serve mix of short, long and very long running works making
2981 * blocked draining impractical.
2982 *
2983 * This is solved by allowing a gcwq to be detached from CPU, running
2984 * it with unbound (rogue) workers and allowing it to be reattached
2985 * later if the cpu comes back online. A separate thread is created
2986 * to govern a gcwq in such state and is called the trustee of the
2987 * gcwq.
2988 *
2989 * Trustee states and their descriptions.
2990 *
2991 * START Command state used on startup. On CPU_DOWN_PREPARE, a
2992 * new trustee is started with this state.
2993 *
2994 * IN_CHARGE Once started, trustee will enter this state after
2995 * assuming the manager role and making all existing
2996 * workers rogue. DOWN_PREPARE waits for trustee to
2997 * enter this state. After reaching IN_CHARGE, trustee
2998 * tries to execute the pending worklist until it's empty
2999 * and the state is set to BUTCHER, or the state is set
3000 * to RELEASE.
3001 *
3002 * BUTCHER Command state which is set by the cpu callback after
 3003 * the cpu has gone down. Once this state is set, the trustee
3004 * knows that there will be no new works on the worklist
3005 * and once the worklist is empty it can proceed to
3006 * killing idle workers.
3007 *
3008 * RELEASE Command state which is set by the cpu callback if the
3009 * cpu down has been canceled or it has come online
3010 * again. After recognizing this state, trustee stops
3011 * trying to drain or butcher and clears ROGUE, rebinds
3012 * all remaining workers back to the cpu and releases
3013 * manager role.
3014 *
3015 * DONE Trustee will enter this state after BUTCHER or RELEASE
3016 * is complete.
3017 *
3018 * trustee CPU draining
3019 * took over down complete
3020 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3021 * | | ^
3022 * | CPU is back online v return workers |
3023 * ----------------> RELEASE --------------
3024 */
3025
3026/**
3027 * trustee_wait_event_timeout - timed event wait for trustee
3028 * @cond: condition to wait for
3029 * @timeout: timeout in jiffies
3030 *
3031 * wait_event_timeout() for trustee to use. Handles locking and
3032 * checks for RELEASE request.
3033 *
3034 * CONTEXT:
3035 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3036 * multiple times. To be used by trustee.
3037 *
3038 * RETURNS:
3039 * Positive indicating left time if @cond is satisfied, 0 if timed
3040 * out, -1 if canceled.
3041 */
3042#define trustee_wait_event_timeout(cond, timeout) ({ \
3043 long __ret = (timeout); \
3044 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3045 __ret) { \
3046 spin_unlock_irq(&gcwq->lock); \
3047 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3048 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3049 __ret); \
3050 spin_lock_irq(&gcwq->lock); \
3051 } \
3052 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3053})
3054
3055/**
3056 * trustee_wait_event - event wait for trustee
3057 * @cond: condition to wait for
3058 *
3059 * wait_event() for trustee to use. Automatically handles locking and
3060 * checks for RELEASE request.
3061 *
3062 * CONTEXT:
3063 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3064 * multiple times. To be used by trustee.
3065 *
3066 * RETURNS:
3067 * 0 if @cond is satisfied, -1 if canceled.
3068 */
3069#define trustee_wait_event(cond) ({ \
3070 long __ret1; \
3071 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3072 __ret1 < 0 ? -1 : 0; \
3073})
3074
3075static int __cpuinit trustee_thread(void *__gcwq)
3076{
3077 struct global_cwq *gcwq = __gcwq;
3078 struct worker *worker;
3079 struct work_struct *work;
3080 struct hlist_node *pos;
3081 long rc;
3082 int i;
3083
3084 BUG_ON(gcwq->cpu != smp_processor_id());
3085
3086 spin_lock_irq(&gcwq->lock);
3087 /*
3088 * Claim the manager position and make all workers rogue.
3089 * Trustee must be bound to the target cpu and can't be
3090 * cancelled.
3091 */
3092 BUG_ON(gcwq->cpu != smp_processor_id());
3093 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3094 BUG_ON(rc < 0);
3095
3096 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3097
3098 list_for_each_entry(worker, &gcwq->idle_list, entry)
3099 worker->flags |= WORKER_ROGUE;
3100
3101 for_each_busy_worker(worker, i, pos, gcwq)
3102 worker->flags |= WORKER_ROGUE;
3103
3104 /*
3105 * Call schedule() so that we cross rq->lock and thus can
3106 * guarantee sched callbacks see the rogue flag. This is
3107 * necessary as scheduler callbacks may be invoked from other
3108 * cpus.
3109 */
3110 spin_unlock_irq(&gcwq->lock);
3111 schedule();
3112 spin_lock_irq(&gcwq->lock);
3113
3114 /*
3115 * Sched callbacks are disabled now. Zap nr_running. After
3116 * this, nr_running stays zero and need_more_worker() and
3117 * keep_working() are always true as long as the worklist is
3118 * not empty.
3119 */
3120 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3121
3122 spin_unlock_irq(&gcwq->lock);
3123 del_timer_sync(&gcwq->idle_timer);
3124 spin_lock_irq(&gcwq->lock);
3125
3126 /*
3127 * We're now in charge. Notify and proceed to drain. We need
3128 * to keep the gcwq running during the whole CPU down
3129 * procedure as other cpu hotunplug callbacks may need to
3130 * flush currently running tasks.
3131 */
3132 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3133 wake_up_all(&gcwq->trustee_wait);
3134
3135 /*
3136 * The original cpu is in the process of dying and may go away
3137 * anytime now. When that happens, we and all workers would
3138 * be migrated to other cpus. Try to drain any remaining work. We
3139 * want to get it over with ASAP - spam rescuers, wake up as
3140 * many idlers as necessary and create new ones till the
3141 * worklist is empty. Note that if the gcwq is frozen, there
3142 * may be frozen works in freezeable cwqs. Don't declare
3143 * completion while frozen.
3144 */
3145 while (gcwq->nr_workers != gcwq->nr_idle ||
3146 gcwq->flags & GCWQ_FREEZING ||
3147 gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3148 int nr_works = 0;
3149
3150 list_for_each_entry(work, &gcwq->worklist, entry) {
3151 send_mayday(work);
3152 nr_works++;
3153 }
3154
3155 list_for_each_entry(worker, &gcwq->idle_list, entry) {
3156 if (!nr_works--)
3157 break;
3158 wake_up_process(worker->task);
3159 }
3160
3161 if (need_to_create_worker(gcwq)) {
3162 spin_unlock_irq(&gcwq->lock);
3163 worker = create_worker(gcwq, false);
3164 spin_lock_irq(&gcwq->lock);
3165 if (worker) {
3166 worker->flags |= WORKER_ROGUE;
3167 start_worker(worker);
3168 }
3169 }
3170
3171 /* give a breather */
3172 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3173 break;
3174 }
3175
3176 /*
3177 * Either all works have been scheduled and cpu is down, or
3178 * cpu down has already been canceled. Wait for and butcher
3179 * all workers till we're canceled.
3180 */
3181 do {
3182 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3183 while (!list_empty(&gcwq->idle_list))
3184 destroy_worker(list_first_entry(&gcwq->idle_list,
3185 struct worker, entry));
3186 } while (gcwq->nr_workers && rc >= 0);
3187
3188 /*
3189 * At this point, either draining has completed and no worker
3190 * is left, or cpu down has been canceled or the cpu is being
3191 * brought back up. There shouldn't be any idle workers left.
3192 * Tell the remaining busy ones to rebind once they finish their
3193 * currently scheduled works by scheduling the rebind_work.
3194 */
3195 WARN_ON(!list_empty(&gcwq->idle_list));
3196
3197 for_each_busy_worker(worker, i, pos, gcwq) {
3198 struct work_struct *rebind_work = &worker->rebind_work;
3199
3200 /*
3201 * Rebind_work may race with future cpu hotplug
3202 * operations. Use a separate flag to mark that
3203 * rebinding is scheduled.
3204 */
3205 worker->flags |= WORKER_REBIND;
3206 worker->flags &= ~WORKER_ROGUE;
3207
3208 /* queue rebind_work, wq doesn't matter, use the default one */
3209 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3210 work_data_bits(rebind_work)))
3211 continue;
3212
3213 debug_work_activate(rebind_work);
3214 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3215 worker->scheduled.next,
3216 work_color_to_flags(WORK_NO_COLOR));
3217 }
3218
3219 /* relinquish manager role */
3220 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3221
3222 /* notify completion */
3223 gcwq->trustee = NULL;
3224 gcwq->trustee_state = TRUSTEE_DONE;
3225 wake_up_all(&gcwq->trustee_wait);
3226 spin_unlock_irq(&gcwq->lock);
3227 return 0;
3228}
3229
3230/**
3231 * wait_trustee_state - wait for trustee to enter the specified state
3232 * @gcwq: gcwq the trustee of interest belongs to
3233 * @state: target state to wait for
3234 *
3235 * Wait for the trustee to reach @state. TRUSTEE_DONE also satisfies the wait.
3236 *
3237 * CONTEXT:
3238 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3239 * multiple times. To be used by cpu_callback.
3240 */
3241static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3242{
3243 if (!(gcwq->trustee_state == state ||
3244 gcwq->trustee_state == TRUSTEE_DONE)) {
3245 spin_unlock_irq(&gcwq->lock);
3246 __wait_event(gcwq->trustee_wait,
3247 gcwq->trustee_state == state ||
3248 gcwq->trustee_state == TRUSTEE_DONE);
3249 spin_lock_irq(&gcwq->lock);
3250 }
3251}
3252
1106 3253	static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1107 3254						    unsigned long action,
1108 3255						    void *hcpu)
1109 3256	{
1110 3257		unsigned int cpu = (unsigned long)hcpu;
1111		struct cpu_workqueue_struct *cwq;
1112		struct workqueue_struct *wq;
1113		int err = 0;
3258		struct global_cwq *gcwq = get_gcwq(cpu);
3259		struct task_struct *new_trustee = NULL;
3260		struct worker *uninitialized_var(new_worker);
3261		unsigned long flags;
1114 3262	
1115 3263		action &= ~CPU_TASKS_FROZEN;
1116 3264	
1117 3265		switch (action) {
3266		case CPU_DOWN_PREPARE:
3267			new_trustee = kthread_create(trustee_thread, gcwq,
3268						     "workqueue_trustee/%d\n", cpu);
3269			if (IS_ERR(new_trustee))
3270				return notifier_from_errno(PTR_ERR(new_trustee));
3271			kthread_bind(new_trustee, cpu);
3272			/* fall through */
1118 3273		case CPU_UP_PREPARE:
1119			cpumask_set_cpu(cpu, cpu_populated_map);
1120		}
1121	undo:
1122		list_for_each_entry(wq, &workqueues, list) {
1123			cwq = per_cpu_ptr(wq->cpu_wq, cpu);
1124	
1125			switch (action) {
1126			case CPU_UP_PREPARE:
1127				err = create_workqueue_thread(cwq, cpu);
1128				if (!err)
1129					break;
1130				printk(KERN_ERR "workqueue [%s] for %i failed\n",
1131					wq->name, cpu);
1132				action = CPU_UP_CANCELED;
1133				err = -ENOMEM;
1134				goto undo;
1135	
1136			case CPU_ONLINE:
1137				start_workqueue_thread(cwq, cpu);
1138				break;
1139	
1140			case CPU_UP_CANCELED:
1141				start_workqueue_thread(cwq, -1);
1142			case CPU_POST_DEAD:
1143				cleanup_workqueue_thread(cwq);
1144				break;
3274			BUG_ON(gcwq->first_idle);
3275			new_worker = create_worker(gcwq, false);
3276			if (!new_worker) {
3277				if (new_trustee)
3278					kthread_stop(new_trustee);
3279				return NOTIFY_BAD;
1145 3280		}
1146 3281		}
1147 3282	
3283		/* some are called w/ irq disabled, don't disturb irq status */
3284		spin_lock_irqsave(&gcwq->lock, flags);
3285	
1148 3286		switch (action) {
1149		case CPU_UP_CANCELED:
3287		case CPU_DOWN_PREPARE:
3288			/* initialize trustee and tell it to acquire the gcwq */
3289			BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3290			gcwq->trustee = new_trustee;
3291			gcwq->trustee_state = TRUSTEE_START;
3292			wake_up_process(gcwq->trustee);
3293			wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3294			/* fall through */
3295		case CPU_UP_PREPARE:
3296			BUG_ON(gcwq->first_idle);
3297			gcwq->first_idle = new_worker;
3298			break;
3299	
3300		case CPU_DYING:
3301			/*
3302			 * Before this, the trustee and all workers except for
3303			 * the ones which are still executing works from
3304			 * before the last CPU down must be on the cpu. After
3305			 * this, they'll all be diasporas.
3306			 */
3307			gcwq->flags |= GCWQ_DISASSOCIATED;
3308			break;
3309	
1150 3310		case CPU_POST_DEAD:
1151			cpumask_clear_cpu(cpu, cpu_populated_map);
3311			gcwq->trustee_state = TRUSTEE_BUTCHER;
3312			/* fall through */
3313		case CPU_UP_CANCELED:
3314			destroy_worker(gcwq->first_idle);
3315			gcwq->first_idle = NULL;
3316			break;
3317	
3318		case CPU_DOWN_FAILED:
3319		case CPU_ONLINE:
3320			gcwq->flags &= ~GCWQ_DISASSOCIATED;
3321			if (gcwq->trustee_state != TRUSTEE_DONE) {
3322				gcwq->trustee_state = TRUSTEE_RELEASE;
3323				wake_up_process(gcwq->trustee);
3324				wait_trustee_state(gcwq, TRUSTEE_DONE);
3325			}
3326	
3327			/*
3328			 * Trustee is done and there might be no worker left.
3329			 * Put the first_idle in and request a real manager to
3330			 * take a look.
3331			 */
3332			spin_unlock_irq(&gcwq->lock);
3333			kthread_bind(gcwq->first_idle->task, cpu);
3334			spin_lock_irq(&gcwq->lock);
3335			gcwq->flags |= GCWQ_MANAGE_WORKERS;
3336			start_worker(gcwq->first_idle);
3337			gcwq->first_idle = NULL;
3338			break;
1152 3339		}
1153 3340	
1154		return notifier_from_errno(err);
3341		spin_unlock_irqrestore(&gcwq->lock, flags);
3342	
3343		return notifier_from_errno(0);
1155 3344	}
1156 3345	
1157 3346	#ifdef CONFIG_SMP
@@ -1201,14 +3390,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1201 3390	EXPORT_SYMBOL_GPL(work_on_cpu);
1202 3391	#endif /* CONFIG_SMP */
1203 3392	
1204	void __init init_workqueues(void)
3393	#ifdef CONFIG_FREEZER
3394
3395/**
3396 * freeze_workqueues_begin - begin freezing workqueues
3397 *
3398 * Start freezing workqueues. After this function returns, all
3399 * freezeable workqueues will queue new works to their frozen_works
3400 * list instead of gcwq->worklist.
3401 *
3402 * CONTEXT:
3403 * Grabs and releases workqueue_lock and gcwq->lock's.
3404 */
3405void freeze_workqueues_begin(void)
1205 3406	{
1206		alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
3407		unsigned int cpu;
1207 3408	
1208		cpumask_copy(cpu_populated_map, cpu_online_mask);
1209		singlethread_cpu = cpumask_first(cpu_possible_mask);
1210		cpu_singlethread_map = cpumask_of(singlethread_cpu);
1211		hotcpu_notifier(workqueue_cpu_callback, 0);
1212		keventd_wq = create_workqueue("events");
1213		BUG_ON(!keventd_wq);
3409		spin_lock(&workqueue_lock);
3410	
3411		BUG_ON(workqueue_freezing);
3412		workqueue_freezing = true;
3413	
3414		for_each_gcwq_cpu(cpu) {
3415 struct global_cwq *gcwq = get_gcwq(cpu);
3416 struct workqueue_struct *wq;
3417
3418 spin_lock_irq(&gcwq->lock);
3419
3420 BUG_ON(gcwq->flags & GCWQ_FREEZING);
3421 gcwq->flags |= GCWQ_FREEZING;
3422
3423 list_for_each_entry(wq, &workqueues, list) {
3424 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3425
3426 if (cwq && wq->flags & WQ_FREEZEABLE)
3427 cwq->max_active = 0;
3428 }
3429
3430 spin_unlock_irq(&gcwq->lock);
3431 }
3432
3433 spin_unlock(&workqueue_lock);
3434}
3435
3436/**
3437 * freeze_workqueues_busy - are freezeable workqueues still busy?
3438 *
3439 * Check whether freezing is complete. This function must be called
3440 * between freeze_workqueues_begin() and thaw_workqueues().
3441 *
3442 * CONTEXT:
3443 * Grabs and releases workqueue_lock.
3444 *
3445 * RETURNS:
3446 * %true if some freezeable workqueues are still busy. %false if
3447 * freezing is complete.
3448 */
3449bool freeze_workqueues_busy(void)
3450{
3451 unsigned int cpu;
3452 bool busy = false;
3453
3454 spin_lock(&workqueue_lock);
3455
3456 BUG_ON(!workqueue_freezing);
3457
3458 for_each_gcwq_cpu(cpu) {
3459 struct workqueue_struct *wq;
3460 /*
3461 * nr_active is monotonically decreasing. It's safe
3462 * to peek without lock.
3463 */
3464 list_for_each_entry(wq, &workqueues, list) {
3465 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3466
3467 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3468 continue;
3469
3470 BUG_ON(cwq->nr_active < 0);
3471 if (cwq->nr_active) {
3472 busy = true;
3473 goto out_unlock;
3474 }
3475 }
3476 }
3477out_unlock:
3478 spin_unlock(&workqueue_lock);
3479 return busy;
3480}
3481
3482/**
3483 * thaw_workqueues - thaw workqueues
3484 *
3485 * Thaw workqueues. Normal queueing is restored and all collected
3486 * frozen works are transferred to their respective gcwq worklists.
3487 *
3488 * CONTEXT:
3489 * Grabs and releases workqueue_lock and gcwq->lock's.
3490 */
3491void thaw_workqueues(void)
3492{
3493 unsigned int cpu;
3494
3495 spin_lock(&workqueue_lock);
3496
3497 if (!workqueue_freezing)
3498 goto out_unlock;
3499
3500 for_each_gcwq_cpu(cpu) {
3501 struct global_cwq *gcwq = get_gcwq(cpu);
3502 struct workqueue_struct *wq;
3503
3504 spin_lock_irq(&gcwq->lock);
3505
3506 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3507 gcwq->flags &= ~GCWQ_FREEZING;
3508
3509 list_for_each_entry(wq, &workqueues, list) {
3510 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3511
3512 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3513 continue;
3514
3515 /* restore max_active and repopulate worklist */
3516 cwq->max_active = wq->saved_max_active;
3517
3518 while (!list_empty(&cwq->delayed_works) &&
3519 cwq->nr_active < cwq->max_active)
3520 cwq_activate_first_delayed(cwq);
3521 }
3522
3523 wake_up_worker(gcwq);
3524
3525 spin_unlock_irq(&gcwq->lock);
3526 }
3527
3528 workqueue_freezing = false;
3529out_unlock:
3530 spin_unlock(&workqueue_lock);
3531}
3532#endif /* CONFIG_FREEZER */
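The three entry points above are intended to be driven by the hibernation code; a minimal sketch of the expected calling sequence, with an illustrative retry loop (the real policy lives in kernel/power and is not shown here; msleep() needs linux/delay.h):

    /* illustrative: freeze freezeable workqueues, backing off on failure */
    static int example_freeze_workqueues(void)
    {
            int retries = 100;

            freeze_workqueues_begin();      /* hold back new works on freezeable wqs */

            while (freeze_workqueues_busy()) {
                    if (!--retries) {
                            thaw_workqueues();      /* give up, restore normal queueing */
                            return -EBUSY;
                    }
                    msleep(10);
            }
            return 0;       /* all freezeable cwqs have drained */
    }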
3533
3534static int __init init_workqueues(void)
3535{
3536 unsigned int cpu;
3537 int i;
3538
3539 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3540
3541 /* initialize gcwqs */
3542 for_each_gcwq_cpu(cpu) {
3543 struct global_cwq *gcwq = get_gcwq(cpu);
3544
3545 spin_lock_init(&gcwq->lock);
3546 INIT_LIST_HEAD(&gcwq->worklist);
3547 gcwq->cpu = cpu;
3548 if (cpu == WORK_CPU_UNBOUND)
3549 gcwq->flags |= GCWQ_DISASSOCIATED;
3550
3551 INIT_LIST_HEAD(&gcwq->idle_list);
3552 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3553 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3554
3555 init_timer_deferrable(&gcwq->idle_timer);
3556 gcwq->idle_timer.function = idle_worker_timeout;
3557 gcwq->idle_timer.data = (unsigned long)gcwq;
3558
3559 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3560 (unsigned long)gcwq);
3561
3562 ida_init(&gcwq->worker_ida);
3563
3564 gcwq->trustee_state = TRUSTEE_DONE;
3565 init_waitqueue_head(&gcwq->trustee_wait);
3566 }
3567
3568 /* create the initial worker */
3569 for_each_online_gcwq_cpu(cpu) {
3570 struct global_cwq *gcwq = get_gcwq(cpu);
3571 struct worker *worker;
3572
3573 worker = create_worker(gcwq, true);
3574 BUG_ON(!worker);
3575 spin_lock_irq(&gcwq->lock);
3576 start_worker(worker);
3577 spin_unlock_irq(&gcwq->lock);
3578 }
3579
3580 system_wq = alloc_workqueue("events", 0, 0);
3581 system_long_wq = alloc_workqueue("events_long", 0, 0);
3582 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3583 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3584 WQ_UNBOUND_MAX_ACTIVE);
3585 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
3586 return 0;
1214 3587	}
3588early_initcall(init_workqueues);
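init_workqueues() above also creates the system-wide workqueues (system_wq, system_long_wq, system_nrt_wq, system_unbound_wq). A hedged sketch of typical driver-side usage; example_work and example_handler are hypothetical:

    /* hypothetical user of the system workqueues created above */
    static void example_handler(struct work_struct *work)
    {
            /* short, non-blocking work is fine on the per-cpu system_wq */
    }
    static DECLARE_WORK(example_work, example_handler);

    static void example_queue(void)
    {
            /*
             * Use system_wq for ordinary short works; works that may
             * block for long periods are better directed at
             * system_long_wq or the WQ_UNBOUND system_unbound_wq.
             */
            queue_work(system_wq, &example_work);
    }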
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);
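These two hooks form the scheduler side of concurrency management: sched.c is expected to call wq_worker_sleeping() when a worker blocks and wq_worker_waking_up() when it becomes runnable again, so that the gcwq's nr_running count stays accurate. A rough, simplified sketch of the intended call site when a worker goes to sleep; the real scheduler change in this series wakes the returned task through an internal helper rather than wake_up_process():

    /* sketch: invoked from the scheduler when @prev is about to sleep */
    static void example_worker_sleep_hook(struct task_struct *prev, unsigned int cpu)
    {
            if (prev->flags & PF_WQ_WORKER) {
                    struct task_struct *to_wakeup;

                    /*
                     * Notify workqueue code; it may return another worker
                     * to wake so the cpu doesn't sit idle while the
                     * worklist is non-empty.
                     */
                    to_wakeup = wq_worker_sleeping(prev, cpu);
                    if (to_wakeup)
                            wake_up_process(to_wakeup);
            }
    }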