aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile9
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/async.c141
-rw-r--r--kernel/audit.c3
-rw-r--r--kernel/audit.h26
-rw-r--r--kernel/audit_tree.c237
-rw-r--r--kernel/audit_watch.c274
-rw-r--r--kernel/auditfilter.c39
-rw-r--r--kernel/auditsc.c10
-rw-r--r--kernel/cgroup.c36
-rw-r--r--kernel/compat.c17
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c23
-rw-r--r--kernel/debug/debug_core.c4
-rw-r--r--kernel/debug/gdbstub.c191
-rw-r--r--kernel/debug/kdb/kdb_main.c136
-rw-r--r--kernel/debug/kdb/kdb_private.h2
-rw-r--r--kernel/exec_domain.c22
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/hrtimer.c17
-rw-r--r--kernel/hw_breakpoint.c90
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/kthread.c164
-rw-r--r--kernel/lockdep.c2
-rw-r--r--kernel/module.c1088
-rw-r--r--kernel/padata.c755
-rw-r--r--kernel/perf_event.c460
-rw-r--r--kernel/pm_qos_params.c215
-rw-r--r--kernel/posix-cpu-timers.c44
-rw-r--r--kernel/posix-timers.c11
-rw-r--r--kernel/power/hibernate.c27
-rw-r--r--kernel/power/main.c55
-rw-r--r--kernel/power/process.c21
-rw-r--r--kernel/power/snapshot.c3
-rw-r--r--kernel/power/suspend.c13
-rw-r--r--kernel/power/swap.c12
-rw-r--r--kernel/printk.c43
-rw-r--r--kernel/range.c4
-rw-r--r--kernel/rcupdate.c160
-rw-r--r--kernel/rcutiny.c2
-rw-r--r--kernel/rcutorture.c3
-rw-r--r--kernel/rcutree.c2
-rw-r--r--kernel/sched.c397
-rw-r--r--kernel/sched_clock.c95
-rw-r--r--kernel/sched_cpupri.c8
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c532
-rw-r--r--kernel/sched_rt.c3
-rw-r--r--kernel/sched_stats.h27
-rw-r--r--kernel/signal.c9
-rw-r--r--kernel/slow-work-debugfs.c227
-rw-r--r--kernel/slow-work.c1068
-rw-r--r--kernel/slow-work.h72
-rw-r--r--kernel/softlockup.c293
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c202
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c76
-rw-r--r--kernel/time.c16
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/clocksource.c33
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-sched.c15
-rw-r--r--kernel/time/timekeeping.c79
-rw-r--r--kernel/timer.c53
-rw-r--r--kernel/trace/Kconfig83
-rw-r--r--kernel/trace/Makefile7
-rw-r--r--kernel/trace/ftrace.c5
-rw-r--r--kernel/trace/kmemtrace.c529
-rw-r--r--kernel/trace/ring_buffer.c40
-rw-r--r--kernel/trace/trace.c178
-rw-r--r--kernel/trace/trace.h107
-rw-r--r--kernel/trace/trace_boot.c185
-rw-r--r--kernel/trace/trace_clock.c7
-rw-r--r--kernel/trace/trace_entries.h94
-rw-r--r--kernel/trace/trace_event_perf.c27
-rw-r--r--kernel/trace/trace_events.c299
-rw-r--r--kernel/trace/trace_events_filter.c27
-rw-r--r--kernel/trace/trace_export.c8
-rw-r--r--kernel/trace/trace_functions.c6
-rw-r--r--kernel/trace/trace_functions_graph.c3
-rw-r--r--kernel/trace/trace_irqsoff.c3
-rw-r--r--kernel/trace/trace_kdb.c136
-rw-r--r--kernel/trace/trace_kprobe.c383
-rw-r--r--kernel/trace/trace_ksym.c508
-rw-r--r--kernel/trace/trace_output.c69
-rw-r--r--kernel/trace/trace_sched_wakeup.c7
-rw-r--r--kernel/trace/trace_selftest.c87
-rw-r--r--kernel/trace/trace_stack.c6
-rw-r--r--kernel/trace/trace_syscalls.c7
-rw-r--r--kernel/trace/trace_sysprof.c329
-rw-r--r--kernel/user_namespace.c44
-rw-r--r--kernel/watchdog.c567
-rw-r--r--kernel/workqueue.c3177
-rw-r--r--kernel/workqueue_sched.h9
96 files changed, 7905 insertions, 6627 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 057472fbc272..0b72d1a74be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_SMP) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
75obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
77obj-$(CONFIG_GCOV_KERNEL) += gcov/
77obj-$(CONFIG_KPROBES) += kprobes.o 78obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_KGDB) += debug/ 79obj-$(CONFIG_KGDB) += debug/
79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
82obj-$(CONFIG_SECCOMP) += seccomp.o 83obj-$(CONFIG_SECCOMP) += seccomp.o
83obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 100obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 101obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
102obj-$(CONFIG_SLOW_WORK) += slow-work.o
103obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
104obj-$(CONFIG_PERF_EVENTS) += perf_event.o 103obj-$(CONFIG_PERF_EVENTS) += perf_event.o
105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
122 spin_unlock(&acct_lock); 122 spin_unlock(&acct_lock);
123 123
124 /* May block */ 124 /* May block */
125 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(&file->f_path, &sbuf))
126 return res; 126 return res;
127 suspend = sbuf.f_blocks * SUSPEND; 127 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 128 resume = sbuf.f_blocks * RESUME;
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/bug.h>
53#include <linux/module.h> 52#include <linux/module.h>
54#include <linux/wait.h> 53#include <linux/wait.h>
55#include <linux/sched.h> 54#include <linux/sched.h>
56#include <linux/init.h>
57#include <linux/kthread.h>
58#include <linux/delay.h>
59#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/workqueue.h>
60#include <asm/atomic.h> 57#include <asm/atomic.h>
61 58
62static async_cookie_t next_cookie = 1; 59static async_cookie_t next_cookie = 1;
63 60
64#define MAX_THREADS 256
65#define MAX_WORK 32768 61#define MAX_WORK 32768
66 62
67static LIST_HEAD(async_pending); 63static LIST_HEAD(async_pending);
68static LIST_HEAD(async_running); 64static LIST_HEAD(async_running);
69static DEFINE_SPINLOCK(async_lock); 65static DEFINE_SPINLOCK(async_lock);
70 66
71static int async_enabled = 0;
72
73struct async_entry { 67struct async_entry {
74 struct list_head list; 68 struct list_head list;
75 async_cookie_t cookie; 69 struct work_struct work;
76 async_func_ptr *func; 70 async_cookie_t cookie;
77 void *data; 71 async_func_ptr *func;
78 struct list_head *running; 72 void *data;
73 struct list_head *running;
79}; 74};
80 75
81static DECLARE_WAIT_QUEUE_HEAD(async_done); 76static DECLARE_WAIT_QUEUE_HEAD(async_done);
82static DECLARE_WAIT_QUEUE_HEAD(async_new);
83 77
84static atomic_t entry_count; 78static atomic_t entry_count;
85static atomic_t thread_count;
86 79
87extern int initcall_debug; 80extern int initcall_debug;
88 81
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
117 spin_unlock_irqrestore(&async_lock, flags); 110 spin_unlock_irqrestore(&async_lock, flags);
118 return ret; 111 return ret;
119} 112}
113
120/* 114/*
121 * pick the first pending entry and run it 115 * pick the first pending entry and run it
122 */ 116 */
123static void run_one_entry(void) 117static void async_run_entry_fn(struct work_struct *work)
124{ 118{
119 struct async_entry *entry =
120 container_of(work, struct async_entry, work);
125 unsigned long flags; 121 unsigned long flags;
126 struct async_entry *entry;
127 ktime_t calltime, delta, rettime; 122 ktime_t calltime, delta, rettime;
128 123
129 /* 1) pick one task from the pending queue */ 124 /* 1) move self to the running queue */
130
131 spin_lock_irqsave(&async_lock, flags); 125 spin_lock_irqsave(&async_lock, flags);
132 if (list_empty(&async_pending))
133 goto out;
134 entry = list_first_entry(&async_pending, struct async_entry, list);
135
136 /* 2) move it to the running queue */
137 list_move_tail(&entry->list, entry->running); 126 list_move_tail(&entry->list, entry->running);
138 spin_unlock_irqrestore(&async_lock, flags); 127 spin_unlock_irqrestore(&async_lock, flags);
139 128
140 /* 3) run it (and print duration)*/ 129 /* 2) run (and print duration) */
141 if (initcall_debug && system_state == SYSTEM_BOOTING) { 130 if (initcall_debug && system_state == SYSTEM_BOOTING) {
142 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
143 entry->func, task_pid_nr(current)); 132 entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
153 (long long)ktime_to_ns(delta) >> 10); 142 (long long)ktime_to_ns(delta) >> 10);
154 } 143 }
155 144
156 /* 4) remove it from the running queue */ 145 /* 3) remove self from the running queue */
157 spin_lock_irqsave(&async_lock, flags); 146 spin_lock_irqsave(&async_lock, flags);
158 list_del(&entry->list); 147 list_del(&entry->list);
159 148
160 /* 5) free the entry */ 149 /* 4) free the entry */
161 kfree(entry); 150 kfree(entry);
162 atomic_dec(&entry_count); 151 atomic_dec(&entry_count);
163 152
164 spin_unlock_irqrestore(&async_lock, flags); 153 spin_unlock_irqrestore(&async_lock, flags);
165 154
166 /* 6) wake up any waiters. */ 155 /* 5) wake up any waiters */
167 wake_up(&async_done); 156 wake_up(&async_done);
168 return;
169
170out:
171 spin_unlock_irqrestore(&async_lock, flags);
172} 157}
173 158
174
175static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) 159static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
176{ 160{
177 struct async_entry *entry; 161 struct async_entry *entry;
178 unsigned long flags; 162 unsigned long flags;
179 async_cookie_t newcookie; 163 async_cookie_t newcookie;
180
181 164
182 /* allow irq-off callers */ 165 /* allow irq-off callers */
183 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); 166 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
186 * If we're out of memory or if there's too much work 169 * If we're out of memory or if there's too much work
187 * pending already, we execute synchronously. 170 * pending already, we execute synchronously.
188 */ 171 */
189 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { 172 if (!entry || atomic_read(&entry_count) > MAX_WORK) {
190 kfree(entry); 173 kfree(entry);
191 spin_lock_irqsave(&async_lock, flags); 174 spin_lock_irqsave(&async_lock, flags);
192 newcookie = next_cookie++; 175 newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
196 ptr(data, newcookie); 179 ptr(data, newcookie);
197 return newcookie; 180 return newcookie;
198 } 181 }
182 INIT_WORK(&entry->work, async_run_entry_fn);
199 entry->func = ptr; 183 entry->func = ptr;
200 entry->data = data; 184 entry->data = data;
201 entry->running = running; 185 entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
205 list_add_tail(&entry->list, &async_pending); 189 list_add_tail(&entry->list, &async_pending);
206 atomic_inc(&entry_count); 190 atomic_inc(&entry_count);
207 spin_unlock_irqrestore(&async_lock, flags); 191 spin_unlock_irqrestore(&async_lock, flags);
208 wake_up(&async_new); 192
193 /* schedule for execution */
194 queue_work(system_unbound_wq, &entry->work);
195
209 return newcookie; 196 return newcookie;
210} 197}
211 198
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
312 async_synchronize_cookie_domain(cookie, &async_running); 299 async_synchronize_cookie_domain(cookie, &async_running);
313} 300}
314EXPORT_SYMBOL_GPL(async_synchronize_cookie); 301EXPORT_SYMBOL_GPL(async_synchronize_cookie);
315
316
317static int async_thread(void *unused)
318{
319 DECLARE_WAITQUEUE(wq, current);
320 add_wait_queue(&async_new, &wq);
321
322 while (!kthread_should_stop()) {
323 int ret = HZ;
324 set_current_state(TASK_INTERRUPTIBLE);
325 /*
326 * check the list head without lock.. false positives
327 * are dealt with inside run_one_entry() while holding
328 * the lock.
329 */
330 rmb();
331 if (!list_empty(&async_pending))
332 run_one_entry();
333 else
334 ret = schedule_timeout(HZ);
335
336 if (ret == 0) {
337 /*
338 * we timed out, this means we as thread are redundant.
339 * we sign off and die, but we to avoid any races there
340 * is a last-straw check to see if work snuck in.
341 */
342 atomic_dec(&thread_count);
343 wmb(); /* manager must see our departure first */
344 if (list_empty(&async_pending))
345 break;
346 /*
347 * woops work came in between us timing out and us
348 * signing off; we need to stay alive and keep working.
349 */
350 atomic_inc(&thread_count);
351 }
352 }
353 remove_wait_queue(&async_new, &wq);
354
355 return 0;
356}
357
358static int async_manager_thread(void *unused)
359{
360 DECLARE_WAITQUEUE(wq, current);
361 add_wait_queue(&async_new, &wq);
362
363 while (!kthread_should_stop()) {
364 int tc, ec;
365
366 set_current_state(TASK_INTERRUPTIBLE);
367
368 tc = atomic_read(&thread_count);
369 rmb();
370 ec = atomic_read(&entry_count);
371
372 while (tc < ec && tc < MAX_THREADS) {
373 if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
374 tc))) {
375 msleep(100);
376 continue;
377 }
378 atomic_inc(&thread_count);
379 tc++;
380 }
381
382 schedule();
383 }
384 remove_wait_queue(&async_new, &wq);
385
386 return 0;
387}
388
389static int __init async_init(void)
390{
391 async_enabled =
392 !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
393
394 WARN_ON(!async_enabled);
395 return 0;
396}
397
398core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#include <linux/netlink.h> 58#include <linux/netlink.h>
59#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62 61
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
407 audit_hold_skb(skb); 406 audit_hold_skb(skb);
408 } else 407 } else
409 /* drop the extra reference if sent ok */ 408 /* drop the extra reference if sent ok */
410 kfree_skb(skb); 409 consume_skb(skb);
411} 410}
412 411
413static int kauditd_thread(void *dummy) 412static int kauditd_thread(void *dummy)
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
103extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
104extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
105 105
106extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
107
106/* audit watch functions */ 108/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch); 109#ifdef CONFIG_AUDIT_WATCH
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch); 110extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch); 111extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); 112extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule); 113extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
113extern void audit_remove_watch(struct audit_watch *watch); 114extern void audit_remove_watch_rule(struct audit_krule *krule);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch); 115extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch); 116extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
118 117#else
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, 118#define audit_put_watch(w) {}
120 struct audit_watch *watch); 119#define audit_get_watch(w) {}
120#define audit_to_watch(k, p, l, o) (-EINVAL)
121#define audit_add_watch(k, l) (-EINVAL)
122#define audit_remove_watch_rule(k) BUG()
123#define audit_watch_path(w) ""
124#define audit_watch_compare(w, i, d) 0
125
126#endif /* CONFIG_AUDIT_WATCH */
121 127
122#ifdef CONFIG_AUDIT_TREE 128#ifdef CONFIG_AUDIT_TREE
123extern struct audit_chunk *audit_tree_lookup(const struct inode *); 129extern struct audit_chunk *audit_tree_lookup(const struct inode *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
1#include "audit.h" 1#include "audit.h"
2#include <linux/inotify.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
22 22
23struct audit_chunk { 23struct audit_chunk {
24 struct list_head hash; 24 struct list_head hash;
25 struct inotify_watch watch; 25 struct fsnotify_mark mark;
26 struct list_head trees; /* with root here */ 26 struct list_head trees; /* with root here */
27 int dead; 27 int dead;
28 int count; 28 int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
59 * tree is refcounted; one reference for "some rules on rules_list refer to 59 * tree is refcounted; one reference for "some rules on rules_list refer to
60 * it", one for each chunk with pointer to it. 60 * it", one for each chunk with pointer to it.
61 * 61 *
62 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount 62 * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
63 * of watch contributes 1 to .refs). 63 * of watch contributes 1 to .refs).
64 * 64 *
65 * node.index allows to get from node.list to containing chunk. 65 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
68 * that makes a difference. Some. 68 * that makes a difference. Some.
69 */ 69 */
70 70
71static struct inotify_handle *rtree_ih; 71static struct fsnotify_group *audit_tree_group;
72 72
73static struct audit_tree *alloc_tree(const char *s) 73static struct audit_tree *alloc_tree(const char *s)
74{ 74{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
111 return tree->pathname; 111 return tree->pathname;
112} 112}
113 113
114static struct audit_chunk *alloc_chunk(int count)
115{
116 struct audit_chunk *chunk;
117 size_t size;
118 int i;
119
120 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
121 chunk = kzalloc(size, GFP_KERNEL);
122 if (!chunk)
123 return NULL;
124
125 INIT_LIST_HEAD(&chunk->hash);
126 INIT_LIST_HEAD(&chunk->trees);
127 chunk->count = count;
128 atomic_long_set(&chunk->refs, 1);
129 for (i = 0; i < count; i++) {
130 INIT_LIST_HEAD(&chunk->owners[i].list);
131 chunk->owners[i].index = i;
132 }
133 inotify_init_watch(&chunk->watch);
134 return chunk;
135}
136
137static void free_chunk(struct audit_chunk *chunk) 114static void free_chunk(struct audit_chunk *chunk)
138{ 115{
139 int i; 116 int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
157 audit_put_chunk(chunk); 134 audit_put_chunk(chunk);
158} 135}
159 136
137static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
138{
139 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
140 call_rcu(&chunk->head, __put_chunk);
141}
142
143static struct audit_chunk *alloc_chunk(int count)
144{
145 struct audit_chunk *chunk;
146 size_t size;
147 int i;
148
149 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
150 chunk = kzalloc(size, GFP_KERNEL);
151 if (!chunk)
152 return NULL;
153
154 INIT_LIST_HEAD(&chunk->hash);
155 INIT_LIST_HEAD(&chunk->trees);
156 chunk->count = count;
157 atomic_long_set(&chunk->refs, 1);
158 for (i = 0; i < count; i++) {
159 INIT_LIST_HEAD(&chunk->owners[i].list);
160 chunk->owners[i].index = i;
161 }
162 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
163 return chunk;
164}
165
160enum {HASH_SIZE = 128}; 166enum {HASH_SIZE = 128};
161static struct list_head chunk_hash_heads[HASH_SIZE]; 167static struct list_head chunk_hash_heads[HASH_SIZE];
162static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); 168static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
167 return chunk_hash_heads + n % HASH_SIZE; 173 return chunk_hash_heads + n % HASH_SIZE;
168} 174}
169 175
170/* hash_lock is held by caller */ 176/* hash_lock & entry->lock is held by caller */
171static void insert_hash(struct audit_chunk *chunk) 177static void insert_hash(struct audit_chunk *chunk)
172{ 178{
173 struct list_head *list = chunk_hash(chunk->watch.inode); 179 struct fsnotify_mark *entry = &chunk->mark;
180 struct list_head *list;
181
182 if (!entry->i.inode)
183 return;
184 list = chunk_hash(entry->i.inode);
174 list_add_rcu(&chunk->hash, list); 185 list_add_rcu(&chunk->hash, list);
175} 186}
176 187
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
181 struct audit_chunk *p; 192 struct audit_chunk *p;
182 193
183 list_for_each_entry_rcu(p, list, hash) { 194 list_for_each_entry_rcu(p, list, hash) {
184 if (p->watch.inode == inode) { 195 /* mark.inode may have gone NULL, but who cares? */
196 if (p->mark.i.inode == inode) {
185 atomic_long_inc(&p->refs); 197 atomic_long_inc(&p->refs);
186 return p; 198 return p;
187 } 199 }
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
210static void untag_chunk(struct node *p) 222static void untag_chunk(struct node *p)
211{ 223{
212 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark;
213 struct audit_chunk *new; 226 struct audit_chunk *new;
214 struct audit_tree *owner; 227 struct audit_tree *owner;
215 int size = chunk->count - 1; 228 int size = chunk->count - 1;
216 int i, j; 229 int i, j;
217 230
218 if (!pin_inotify_watch(&chunk->watch)) { 231 fsnotify_get_mark(entry);
219 /*
220 * Filesystem is shutting down; all watches are getting
221 * evicted, just take it off the node list for this
222 * tree and let the eviction logics take care of the
223 * rest.
224 */
225 owner = p->owner;
226 if (owner->root == chunk) {
227 list_del_init(&owner->same_root);
228 owner->root = NULL;
229 }
230 list_del_init(&p->list);
231 p->owner = NULL;
232 put_tree(owner);
233 return;
234 }
235 232
236 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
237 234
238 /* 235 spin_lock(&entry->lock);
239 * pin_inotify_watch() succeeded, so the watch won't go away 236 if (chunk->dead || !entry->i.inode) {
240 * from under us. 237 spin_unlock(&entry->lock);
241 */
242 mutex_lock(&chunk->watch.inode->inotify_mutex);
243 if (chunk->dead) {
244 mutex_unlock(&chunk->watch.inode->inotify_mutex);
245 goto out; 238 goto out;
246 } 239 }
247 240
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
256 list_del_init(&p->list); 249 list_del_init(&p->list);
257 list_del_rcu(&chunk->hash); 250 list_del_rcu(&chunk->hash);
258 spin_unlock(&hash_lock); 251 spin_unlock(&hash_lock);
259 inotify_evict_watch(&chunk->watch); 252 spin_unlock(&entry->lock);
260 mutex_unlock(&chunk->watch.inode->inotify_mutex); 253 fsnotify_destroy_mark(entry);
261 put_inotify_watch(&chunk->watch); 254 fsnotify_put_mark(entry);
262 goto out; 255 goto out;
263 } 256 }
264 257
265 new = alloc_chunk(size); 258 new = alloc_chunk(size);
266 if (!new) 259 if (!new)
267 goto Fallback; 260 goto Fallback;
268 if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { 261 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
269 free_chunk(new); 263 free_chunk(new);
270 goto Fallback; 264 goto Fallback;
271 } 265 }
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
298 list_for_each_entry(owner, &new->trees, same_root) 292 list_for_each_entry(owner, &new->trees, same_root)
299 owner->root = new; 293 owner->root = new;
300 spin_unlock(&hash_lock); 294 spin_unlock(&hash_lock);
301 inotify_evict_watch(&chunk->watch); 295 spin_unlock(&entry->lock);
302 mutex_unlock(&chunk->watch.inode->inotify_mutex); 296 fsnotify_destroy_mark(entry);
303 put_inotify_watch(&chunk->watch); 297 fsnotify_put_mark(entry);
304 goto out; 298 goto out;
305 299
306Fallback: 300Fallback:
@@ -314,31 +308,33 @@ Fallback:
314 p->owner = NULL; 308 p->owner = NULL;
315 put_tree(owner); 309 put_tree(owner);
316 spin_unlock(&hash_lock); 310 spin_unlock(&hash_lock);
317 mutex_unlock(&chunk->watch.inode->inotify_mutex); 311 spin_unlock(&entry->lock);
318out: 312out:
319 unpin_inotify_watch(&chunk->watch); 313 fsnotify_put_mark(entry);
320 spin_lock(&hash_lock); 314 spin_lock(&hash_lock);
321} 315}
322 316
323static int create_chunk(struct inode *inode, struct audit_tree *tree) 317static int create_chunk(struct inode *inode, struct audit_tree *tree)
324{ 318{
319 struct fsnotify_mark *entry;
325 struct audit_chunk *chunk = alloc_chunk(1); 320 struct audit_chunk *chunk = alloc_chunk(1);
326 if (!chunk) 321 if (!chunk)
327 return -ENOMEM; 322 return -ENOMEM;
328 323
329 if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { 324 entry = &chunk->mark;
325 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
330 free_chunk(chunk); 326 free_chunk(chunk);
331 return -ENOSPC; 327 return -ENOSPC;
332 } 328 }
333 329
334 mutex_lock(&inode->inotify_mutex); 330 spin_lock(&entry->lock);
335 spin_lock(&hash_lock); 331 spin_lock(&hash_lock);
336 if (tree->goner) { 332 if (tree->goner) {
337 spin_unlock(&hash_lock); 333 spin_unlock(&hash_lock);
338 chunk->dead = 1; 334 chunk->dead = 1;
339 inotify_evict_watch(&chunk->watch); 335 spin_unlock(&entry->lock);
340 mutex_unlock(&inode->inotify_mutex); 336 fsnotify_destroy_mark(entry);
341 put_inotify_watch(&chunk->watch); 337 fsnotify_put_mark(entry);
342 return 0; 338 return 0;
343 } 339 }
344 chunk->owners[0].index = (1U << 31); 340 chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
351 } 347 }
352 insert_hash(chunk); 348 insert_hash(chunk);
353 spin_unlock(&hash_lock); 349 spin_unlock(&hash_lock);
354 mutex_unlock(&inode->inotify_mutex); 350 spin_unlock(&entry->lock);
355 return 0; 351 return 0;
356} 352}
357 353
358/* the first tagged inode becomes root of tree */ 354/* the first tagged inode becomes root of tree */
359static int tag_chunk(struct inode *inode, struct audit_tree *tree) 355static int tag_chunk(struct inode *inode, struct audit_tree *tree)
360{ 356{
361 struct inotify_watch *watch; 357 struct fsnotify_mark *old_entry, *chunk_entry;
362 struct audit_tree *owner; 358 struct audit_tree *owner;
363 struct audit_chunk *chunk, *old; 359 struct audit_chunk *chunk, *old;
364 struct node *p; 360 struct node *p;
365 int n; 361 int n;
366 362
367 if (inotify_find_watch(rtree_ih, inode, &watch) < 0) 363 old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
364 if (!old_entry)
368 return create_chunk(inode, tree); 365 return create_chunk(inode, tree);
369 366
370 old = container_of(watch, struct audit_chunk, watch); 367 old = container_of(old_entry, struct audit_chunk, mark);
371 368
372 /* are we already there? */ 369 /* are we already there? */
373 spin_lock(&hash_lock); 370 spin_lock(&hash_lock);
374 for (n = 0; n < old->count; n++) { 371 for (n = 0; n < old->count; n++) {
375 if (old->owners[n].owner == tree) { 372 if (old->owners[n].owner == tree) {
376 spin_unlock(&hash_lock); 373 spin_unlock(&hash_lock);
377 put_inotify_watch(&old->watch); 374 fsnotify_put_mark(old_entry);
378 return 0; 375 return 0;
379 } 376 }
380 } 377 }
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
382 379
383 chunk = alloc_chunk(old->count + 1); 380 chunk = alloc_chunk(old->count + 1);
384 if (!chunk) { 381 if (!chunk) {
385 put_inotify_watch(&old->watch); 382 fsnotify_put_mark(old_entry);
386 return -ENOMEM; 383 return -ENOMEM;
387 } 384 }
388 385
389 mutex_lock(&inode->inotify_mutex); 386 chunk_entry = &chunk->mark;
390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 387
391 mutex_unlock(&inode->inotify_mutex); 388 spin_lock(&old_entry->lock);
392 put_inotify_watch(&old->watch); 389 if (!old_entry->i.inode) {
390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry);
393 free_chunk(chunk); 393 free_chunk(chunk);
394 return -ENOENT;
395 }
396
397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock);
400 free_chunk(chunk);
401 fsnotify_put_mark(old_entry);
394 return -ENOSPC; 402 return -ENOSPC;
395 } 403 }
404
405 /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
406 spin_lock(&chunk_entry->lock);
396 spin_lock(&hash_lock); 407 spin_lock(&hash_lock);
408
409 /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
397 if (tree->goner) { 410 if (tree->goner) {
398 spin_unlock(&hash_lock); 411 spin_unlock(&hash_lock);
399 chunk->dead = 1; 412 chunk->dead = 1;
400 inotify_evict_watch(&chunk->watch); 413 spin_unlock(&chunk_entry->lock);
401 mutex_unlock(&inode->inotify_mutex); 414 spin_unlock(&old_entry->lock);
402 put_inotify_watch(&old->watch); 415
403 put_inotify_watch(&chunk->watch); 416 fsnotify_destroy_mark(chunk_entry);
417
418 fsnotify_put_mark(chunk_entry);
419 fsnotify_put_mark(old_entry);
404 return 0; 420 return 0;
405 } 421 }
406 list_replace_init(&old->trees, &chunk->trees); 422 list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
426 list_add(&tree->same_root, &chunk->trees); 442 list_add(&tree->same_root, &chunk->trees);
427 } 443 }
428 spin_unlock(&hash_lock); 444 spin_unlock(&hash_lock);
429 inotify_evict_watch(&old->watch); 445 spin_unlock(&chunk_entry->lock);
430 mutex_unlock(&inode->inotify_mutex); 446 spin_unlock(&old_entry->lock);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 447 fsnotify_destroy_mark(old_entry);
432 put_inotify_watch(&old->watch); /* and kill it */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 fsnotify_put_mark(old_entry); /* and kill it */
433 return 0; 450 return 0;
434} 451}
435 452
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
584 601
585 spin_lock(&hash_lock); 602 spin_lock(&hash_lock);
586 list_for_each_entry(node, &tree->chunks, list) { 603 list_for_each_entry(node, &tree->chunks, list) {
587 struct inode *inode = find_chunk(node)->watch.inode; 604 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dying elsewhere... */
606 struct inode *inode = chunk->mark.i.inode;
588 node->index |= 1U<<31; 607 node->index |= 1U<<31;
589 if (iterate_mounts(compare_root, inode, root_mnt)) 608 if (iterate_mounts(compare_root, inode, root_mnt))
590 node->index &= ~(1U<<31); 609 node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
846 * Here comes the stuff asynchronous to auditctl operations 865 * Here comes the stuff asynchronous to auditctl operations
847 */ 866 */
848 867
849/* inode->inotify_mutex is locked */
850static void evict_chunk(struct audit_chunk *chunk) 868static void evict_chunk(struct audit_chunk *chunk)
851{ 869{
852 struct audit_tree *owner; 870 struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
885 mutex_unlock(&audit_filter_mutex); 903 mutex_unlock(&audit_filter_mutex);
886} 904}
887 905
888static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, 906static int audit_tree_handle_event(struct fsnotify_group *group,
889 u32 cookie, const char *dname, struct inode *inode) 907 struct fsnotify_mark *inode_mark,
908 struct fsnotify_mark *vfsmonut_mark,
909 struct fsnotify_event *event)
910{
911 BUG();
912 return -EOPNOTSUPP;
913}
914
915static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
890{ 916{
891 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 917 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
892 918
893 if (mask & IN_IGNORED) { 919 evict_chunk(chunk);
894 evict_chunk(chunk); 920 fsnotify_put_mark(entry);
895 put_inotify_watch(watch);
896 }
897} 921}
898 922
899static void destroy_watch(struct inotify_watch *watch) 923static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
924 struct fsnotify_mark *inode_mark,
925 struct fsnotify_mark *vfsmount_mark,
926 __u32 mask, void *data, int data_type)
900{ 927{
901 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 928 return false;
902 call_rcu(&chunk->head, __put_chunk);
903} 929}
904 930
905static const struct inotify_operations rtree_inotify_ops = { 931static const struct fsnotify_ops audit_tree_ops = {
906 .handle_event = handle_event, 932 .handle_event = audit_tree_handle_event,
907 .destroy_watch = destroy_watch, 933 .should_send_event = audit_tree_send_event,
934 .free_group_priv = NULL,
935 .free_event_priv = NULL,
936 .freeing_mark = audit_tree_freeing_mark,
908}; 937};
909 938
910static int __init audit_tree_init(void) 939static int __init audit_tree_init(void)
911{ 940{
912 int i; 941 int i;
913 942
914 rtree_ih = inotify_init(&rtree_inotify_ops); 943 audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
915 if (IS_ERR(rtree_ih)) 944 if (IS_ERR(audit_tree_group))
916 audit_panic("cannot initialize inotify handle for rectree watches"); 945 audit_panic("cannot initialize fsnotify group for rectree watches");
917 946
918 for (i = 0; i < HASH_SIZE; i++) 947 for (i = 0; i < HASH_SIZE; i++)
919 INIT_LIST_HEAD(&chunk_hash_heads[i]); 948 INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..6bf2306be7d6 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fsnotify_backend.h>
27#include <linux/namei.h> 28#include <linux/namei.h>
28#include <linux/netlink.h> 29#include <linux/netlink.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31#include <linux/inotify.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include "audit.h" 33#include "audit.h"
34 34
35/* 35/*
36 * Reference counting: 36 * Reference counting:
37 * 37 *
38 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED 38 * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
39 * event. Each audit_watch holds a reference to its associated parent. 39 * event. Each audit_watch holds a reference to its associated parent.
40 * 40 *
41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to 41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
51 unsigned long ino; /* associated inode number */ 51 unsigned long ino; /* associated inode number */
52 struct audit_parent *parent; /* associated parent */ 52 struct audit_parent *parent; /* associated parent */
53 struct list_head wlist; /* entry in parent->watches list */ 53 struct list_head wlist; /* entry in parent->watches list */
54 struct list_head rules; /* associated rules */ 54 struct list_head rules; /* anchor for krule->rlist */
55}; 55};
56 56
57struct audit_parent { 57struct audit_parent {
58 struct list_head ilist; /* entry in inotify registration list */ 58 struct list_head watches; /* anchor for audit_watch->wlist */
59 struct list_head watches; /* associated watches */ 59 struct fsnotify_mark mark; /* fsnotify mark on the inode */
60 struct inotify_watch wdata; /* inotify watch data */
61 unsigned flags; /* status flags */
62}; 60};
63 61
64/* Inotify handle. */ 62/* fsnotify handle. */
65struct inotify_handle *audit_ih; 63struct fsnotify_group *audit_watch_group;
66 64
67/* 65/* fsnotify events we care about. */
68 * audit_parent status flags: 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
69 * 67 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
70 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
71 * a filesystem event to ensure we're adding audit watches to a valid parent.
72 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
73 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
74 * we can receive while holding nameidata.
75 */
76#define AUDIT_PARENT_INVALID 0x001
77 68
78/* Inotify events we care about. */ 69static void audit_free_parent(struct audit_parent *parent)
79#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 70{
71 WARN_ON(!list_empty(&parent->watches));
72 kfree(parent);
73}
80 74
81static void audit_free_parent(struct inotify_watch *i_watch) 75static void audit_watch_free_mark(struct fsnotify_mark *entry)
82{ 76{
83 struct audit_parent *parent; 77 struct audit_parent *parent;
84 78
85 parent = container_of(i_watch, struct audit_parent, wdata); 79 parent = container_of(entry, struct audit_parent, mark);
86 WARN_ON(!list_empty(&parent->watches)); 80 audit_free_parent(parent);
87 kfree(parent); 81}
82
83static void audit_get_parent(struct audit_parent *parent)
84{
85 if (likely(parent))
86 fsnotify_get_mark(&parent->mark);
87}
88
89static void audit_put_parent(struct audit_parent *parent)
90{
91 if (likely(parent))
92 fsnotify_put_mark(&parent->mark);
93}
94
95/*
96 * Find and return the audit_parent on the given inode. If found a reference
97 * is taken on this parent.
98 */
99static inline struct audit_parent *audit_find_parent(struct inode *inode)
100{
101 struct audit_parent *parent = NULL;
102 struct fsnotify_mark *entry;
103
104 entry = fsnotify_find_inode_mark(audit_watch_group, inode);
105 if (entry)
106 parent = container_of(entry, struct audit_parent, mark);
107
108 return parent;
88} 109}
89 110
90void audit_get_watch(struct audit_watch *watch) 111void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
105void audit_remove_watch(struct audit_watch *watch) 126void audit_remove_watch(struct audit_watch *watch)
106{ 127{
107 list_del(&watch->wlist); 128 list_del(&watch->wlist);
108 put_inotify_watch(&watch->parent->wdata); 129 audit_put_parent(watch->parent);
109 watch->parent = NULL; 130 watch->parent = NULL;
110 audit_put_watch(watch); /* match initial get */ 131 audit_put_watch(watch); /* match initial get */
111} 132}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
115 return watch->path; 136 return watch->path;
116} 137}
117 138
118struct list_head *audit_watch_rules(struct audit_watch *watch) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
119{
120 return &watch->rules;
121}
122
123unsigned long audit_watch_inode(struct audit_watch *watch)
124{ 140{
125 return watch->ino; 141 return (watch->ino != (unsigned long)-1) &&
126} 142 (watch->ino == ino) &&
127 143 (watch->dev == dev);
128dev_t audit_watch_dev(struct audit_watch *watch)
129{
130 return watch->dev;
131} 144}
132 145
133/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
134static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct nameidata *ndp)
135{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode;
136 struct audit_parent *parent; 150 struct audit_parent *parent;
137 s32 wd; 151 int ret;
138 152
139 parent = kzalloc(sizeof(*parent), GFP_KERNEL); 153 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
140 if (unlikely(!parent)) 154 if (unlikely(!parent))
141 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
142 156
143 INIT_LIST_HEAD(&parent->watches); 157 INIT_LIST_HEAD(&parent->watches);
144 parent->flags = 0; 158
145 159 fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
146 inotify_init_watch(&parent->wdata); 160 parent->mark.mask = AUDIT_FS_WATCH;
147 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ 161 ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
148 get_inotify_watch(&parent->wdata); 162 if (ret < 0) {
149 wd = inotify_add_watch(audit_ih, &parent->wdata, 163 audit_free_parent(parent);
150 ndp->path.dentry->d_inode, AUDIT_IN_WATCH); 164 return ERR_PTR(ret);
151 if (wd < 0) {
152 audit_free_parent(&parent->wdata);
153 return ERR_PTR(wd);
154 } 165 }
155 166
156 return parent; 167 return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
179{ 190{
180 struct audit_watch *watch; 191 struct audit_watch *watch;
181 192
182 if (!audit_ih) 193 if (!audit_watch_group)
183 return -EOPNOTSUPP; 194 return -EOPNOTSUPP;
184 195
185 if (path[0] != '/' || path[len-1] == '/' || 196 if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
217 228
218 new->dev = old->dev; 229 new->dev = old->dev;
219 new->ino = old->ino; 230 new->ino = old->ino;
220 get_inotify_watch(&old->parent->wdata); 231 audit_get_parent(old->parent);
221 new->parent = old->parent; 232 new->parent = old->parent;
222 233
223out: 234out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
251 struct audit_entry *oentry, *nentry; 262 struct audit_entry *oentry, *nentry;
252 263
253 mutex_lock(&audit_filter_mutex); 264 mutex_lock(&audit_filter_mutex);
265 /* Run all of the watches on this parent looking for the one that
266 * matches the given dname */
254 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
255 if (audit_compare_dname_path(dname, owatch->path, NULL)) 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
256 continue; 269 continue;
257 270
258 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
259 * filtering now, so we don't omit records. */ 272 * filtering now, so we don't omit records. */
260 if (invalidating && current->audit_context) 273 if (invalidating && !audit_dummy_context())
261 audit_filter_inodes(current, current->audit_context); 274 audit_filter_inodes(current, current->audit_context);
262 275
276 /* updating ino will likely change which audit_hash_list we
277 * are on so we need a new watch for the new list */
263 nwatch = audit_dupe_watch(owatch); 278 nwatch = audit_dupe_watch(owatch);
264 if (IS_ERR(nwatch)) { 279 if (IS_ERR(nwatch)) {
265 mutex_unlock(&audit_filter_mutex); 280 mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
275 list_del(&oentry->rule.rlist); 290 list_del(&oentry->rule.rlist);
276 list_del_rcu(&oentry->list); 291 list_del_rcu(&oentry->list);
277 292
278 nentry = audit_dupe_rule(&oentry->rule, nwatch); 293 nentry = audit_dupe_rule(&oentry->rule);
279 if (IS_ERR(nentry)) { 294 if (IS_ERR(nentry)) {
280 list_del(&oentry->rule.list); 295 list_del(&oentry->rule.list);
281 audit_panic("error updating watch, removing"); 296 audit_panic("error updating watch, removing");
282 } else { 297 } else {
283 int h = audit_hash_ino((u32)ino); 298 int h = audit_hash_ino((u32)ino);
299
300 /*
301 * nentry->rule.watch == oentry->rule.watch so
302 * we must drop that reference and set it to our
303 * new watch.
304 */
305 audit_put_watch(nentry->rule.watch);
306 audit_get_watch(nwatch);
307 nentry->rule.watch = nwatch;
284 list_add(&nentry->rule.rlist, &nwatch->rules); 308 list_add(&nentry->rule.rlist, &nwatch->rules);
285 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 309 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
286 list_replace(&oentry->rule.list, 310 list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
312 struct audit_entry *e; 336 struct audit_entry *e;
313 337
314 mutex_lock(&audit_filter_mutex); 338 mutex_lock(&audit_filter_mutex);
315 parent->flags |= AUDIT_PARENT_INVALID;
316 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 339 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
317 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 340 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
318 e = container_of(r, struct audit_entry, rule); 341 e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
325 audit_remove_watch(w); 348 audit_remove_watch(w);
326 } 349 }
327 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
328}
329
330/* Unregister inotify watches for parents on in_list.
331 * Generates an IN_IGNORED event. */
332void audit_inotify_unregister(struct list_head *in_list)
333{
334 struct audit_parent *p, *n;
335 351
336 list_for_each_entry_safe(p, n, in_list, ilist) { 352 fsnotify_destroy_mark(&parent->mark);
337 list_del(&p->ilist);
338 inotify_rm_watch(audit_ih, &p->wdata);
339 /* the unpin matching the pin in audit_do_del_rule() */
340 unpin_inotify_watch(&p->wdata);
341 }
342} 353}
343 354
344/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
389 } 400 }
390} 401}
391 402
392/* Associate the given rule with an existing parent inotify_watch. 403/* Associate the given rule with an existing parent.
393 * Caller must hold audit_filter_mutex. */ 404 * Caller must hold audit_filter_mutex. */
394static void audit_add_to_parent(struct audit_krule *krule, 405static void audit_add_to_parent(struct audit_krule *krule,
395 struct audit_parent *parent) 406 struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
397 struct audit_watch *w, *watch = krule->watch; 408 struct audit_watch *w, *watch = krule->watch;
398 int watch_found = 0; 409 int watch_found = 0;
399 410
411 BUG_ON(!mutex_is_locked(&audit_filter_mutex));
412
400 list_for_each_entry(w, &parent->watches, wlist) { 413 list_for_each_entry(w, &parent->watches, wlist) {
401 if (strcmp(watch->path, w->path)) 414 if (strcmp(watch->path, w->path))
402 continue; 415 continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
413 } 426 }
414 427
415 if (!watch_found) { 428 if (!watch_found) {
416 get_inotify_watch(&parent->wdata); 429 audit_get_parent(parent);
417 watch->parent = parent; 430 watch->parent = parent;
418 431
419 list_add(&watch->wlist, &parent->watches); 432 list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
423 436
424/* Find a matching watch entry, or add this one. 437/* Find a matching watch entry, or add this one.
425 * Caller must hold audit_filter_mutex. */ 438 * Caller must hold audit_filter_mutex. */
426int audit_add_watch(struct audit_krule *krule) 439int audit_add_watch(struct audit_krule *krule, struct list_head **list)
427{ 440{
428 struct audit_watch *watch = krule->watch; 441 struct audit_watch *watch = krule->watch;
429 struct inotify_watch *i_watch;
430 struct audit_parent *parent; 442 struct audit_parent *parent;
431 struct nameidata *ndp = NULL, *ndw = NULL; 443 struct nameidata *ndp = NULL, *ndw = NULL;
432 int ret = 0; 444 int h, ret = 0;
433 445
434 mutex_unlock(&audit_filter_mutex); 446 mutex_unlock(&audit_filter_mutex);
435 447
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
441 goto error; 453 goto error;
442 } 454 }
443 455
456 mutex_lock(&audit_filter_mutex);
457
444 /* update watch filter fields */ 458 /* update watch filter fields */
445 if (ndw) { 459 if (ndw) {
446 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; 460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
447 watch->ino = ndw->path.dentry->d_inode->i_ino; 461 watch->ino = ndw->path.dentry->d_inode->i_ino;
448 } 462 }
449 463
450 /* The audit_filter_mutex must not be held during inotify calls because 464 /* either find an old parent or attach a new one */
451 * we hold it during inotify event callback processing. If an existing 465 parent = audit_find_parent(ndp->path.dentry->d_inode);
452 * inotify watch is found, inotify_find_watch() grabs a reference before 466 if (!parent) {
453 * returning.
454 */
455 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
456 &i_watch) < 0) {
457 parent = audit_init_parent(ndp); 467 parent = audit_init_parent(ndp);
458 if (IS_ERR(parent)) { 468 if (IS_ERR(parent)) {
459 /* caller expects mutex locked */
460 mutex_lock(&audit_filter_mutex);
461 ret = PTR_ERR(parent); 469 ret = PTR_ERR(parent);
462 goto error; 470 goto error;
463 } 471 }
464 } else 472 }
465 parent = container_of(i_watch, struct audit_parent, wdata);
466
467 mutex_lock(&audit_filter_mutex);
468 473
469 /* parent was moved before we took audit_filter_mutex */ 474 audit_add_to_parent(krule, parent);
470 if (parent->flags & AUDIT_PARENT_INVALID)
471 ret = -ENOENT;
472 else
473 audit_add_to_parent(krule, parent);
474 475
475 /* match get in audit_init_parent or inotify_find_watch */ 476 /* match get in audit_find_parent or audit_init_parent */
476 put_inotify_watch(&parent->wdata); 477 audit_put_parent(parent);
477 478
479 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h];
478error: 481error:
479 audit_put_nd(ndp, ndw); /* NULL args OK */ 482 audit_put_nd(ndp, ndw); /* NULL args OK */
480 return ret; 483 return ret;
481 484
482} 485}
483 486
484void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) 487void audit_remove_watch_rule(struct audit_krule *krule)
485{ 488{
486 struct audit_watch *watch = krule->watch; 489 struct audit_watch *watch = krule->watch;
487 struct audit_parent *parent = watch->parent; 490 struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
492 audit_remove_watch(watch); 495 audit_remove_watch(watch);
493 496
494 if (list_empty(&parent->watches)) { 497 if (list_empty(&parent->watches)) {
495 /* Put parent on the inotify un-registration 498 audit_get_parent(parent);
496 * list. Grab a reference before releasing 499 fsnotify_destroy_mark(&parent->mark);
497 * audit_filter_mutex, to be released in 500 audit_put_parent(parent);
498 * audit_inotify_unregister().
499 * If filesystem is going away, just leave
500 * the sucker alone, eviction will take
501 * care of it. */
502 if (pin_inotify_watch(&parent->wdata))
503 list_add(&parent->ilist, list);
504 } 501 }
505 } 502 }
506} 503}
507 504
508/* Update watch data in audit rules based on inotify events. */ 505static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
509static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, 506 struct fsnotify_mark *inode_mark,
510 u32 cookie, const char *dname, struct inode *inode) 507 struct fsnotify_mark *vfsmount_mark,
508 __u32 mask, void *data, int data_type)
509{
510 return true;
511}
512
513/* Update watch data in audit rules based on fsnotify events. */
514static int audit_watch_handle_event(struct fsnotify_group *group,
515 struct fsnotify_mark *inode_mark,
516 struct fsnotify_mark *vfsmount_mark,
517 struct fsnotify_event *event)
511{ 518{
519 struct inode *inode;
520 __u32 mask = event->mask;
521 const char *dname = event->file_name;
512 struct audit_parent *parent; 522 struct audit_parent *parent;
513 523
514 parent = container_of(i_watch, struct audit_parent, wdata); 524 parent = container_of(inode_mark, struct audit_parent, mark);
515 525
516 if (mask & (IN_CREATE|IN_MOVED_TO) && inode) 526 BUG_ON(group != audit_watch_group);
517 audit_update_watch(parent, dname, inode->i_sb->s_dev, 527
518 inode->i_ino, 0); 528 switch (event->data_type) {
519 else if (mask & (IN_DELETE|IN_MOVED_FROM)) 529 case (FSNOTIFY_EVENT_FILE):
530 inode = event->file->f_path.dentry->d_inode;
531 break;
532 case (FSNOTIFY_EVENT_INODE):
533 inode = event->inode;
534 break;
535 default:
536 BUG();
537 inode = NULL;
538 break;
539 };
540
541 if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
542 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
543 else if (mask & (FS_DELETE|FS_MOVED_FROM))
520 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); 544 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
521 /* inotify automatically removes the watch and sends IN_IGNORED */ 545 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
522 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
523 audit_remove_parent_watches(parent);
524 /* inotify does not remove the watch, so remove it manually */
525 else if(mask & IN_MOVE_SELF) {
526 audit_remove_parent_watches(parent); 546 audit_remove_parent_watches(parent);
527 inotify_remove_watch_locked(audit_ih, i_watch); 547
528 } else if (mask & IN_IGNORED) 548 return 0;
529 put_inotify_watch(i_watch);
530} 549}
531 550
532static const struct inotify_operations audit_inotify_ops = { 551static const struct fsnotify_ops audit_watch_fsnotify_ops = {
533 .handle_event = audit_handle_ievent, 552 .should_send_event = audit_watch_should_send_event,
534 .destroy_watch = audit_free_parent, 553 .handle_event = audit_watch_handle_event,
554 .free_group_priv = NULL,
555 .freeing_mark = NULL,
556 .free_event_priv = NULL,
535}; 557};
536 558
537static int __init audit_watch_init(void) 559static int __init audit_watch_init(void)
538{ 560{
539 audit_ih = inotify_init(&audit_inotify_ops); 561 audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
540 if (IS_ERR(audit_ih)) 562 if (IS_ERR(audit_watch_group)) {
541 audit_panic("cannot initialize inotify handle"); 563 audit_watch_group = NULL;
564 audit_panic("cannot create audit fsnotify group");
565 }
542 return 0; 566 return 0;
543} 567}
544subsys_initcall(audit_watch_init); 568device_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
71{ 71{
72 int i; 72 int i;
73 struct audit_krule *erule = &e->rule; 73 struct audit_krule *erule = &e->rule;
74
74 /* some rules don't have associated watches */ 75 /* some rules don't have associated watches */
75 if (erule->watch) 76 if (erule->watch)
76 audit_put_watch(erule->watch); 77 audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
746 * rule with the new rule in the filterlist, then free the old rule. 747 * rule with the new rule in the filterlist, then free the old rule.
747 * The rlist element is undefined; list manipulations are handled apart from 748 * The rlist element is undefined; list manipulations are handled apart from
748 * the initial copy. */ 749 * the initial copy. */
749struct audit_entry *audit_dupe_rule(struct audit_krule *old, 750struct audit_entry *audit_dupe_rule(struct audit_krule *old)
750 struct audit_watch *watch)
751{ 751{
752 u32 fcount = old->field_count; 752 u32 fcount = old->field_count;
753 struct audit_entry *entry; 753 struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
769 new->prio = old->prio; 769 new->prio = old->prio;
770 new->buflen = old->buflen; 770 new->buflen = old->buflen;
771 new->inode_f = old->inode_f; 771 new->inode_f = old->inode_f;
772 new->watch = NULL;
773 new->field_count = old->field_count; 772 new->field_count = old->field_count;
773
774 /* 774 /*
775 * note that we are OK with not refcounting here; audit_match_tree() 775 * note that we are OK with not refcounting here; audit_match_tree()
776 * never dereferences tree and we can't get false positives there 776 * never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
811 } 811 }
812 } 812 }
813 813
814 if (watch) { 814 if (old->watch) {
815 audit_get_watch(watch); 815 audit_get_watch(old->watch);
816 new->watch = watch; 816 new->watch = old->watch;
817 } 817 }
818 818
819 return entry; 819 return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
866 struct audit_watch *watch = entry->rule.watch; 866 struct audit_watch *watch = entry->rule.watch;
867 struct audit_tree *tree = entry->rule.tree; 867 struct audit_tree *tree = entry->rule.tree;
868 struct list_head *list; 868 struct list_head *list;
869 int h, err; 869 int err;
870#ifdef CONFIG_AUDITSYSCALL 870#ifdef CONFIG_AUDITSYSCALL
871 int dont_count = 0; 871 int dont_count = 0;
872 872
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
889 889
890 if (watch) { 890 if (watch) {
891 /* audit_filter_mutex is dropped and re-taken during this call */ 891 /* audit_filter_mutex is dropped and re-taken during this call */
892 err = audit_add_watch(&entry->rule); 892 err = audit_add_watch(&entry->rule, &list);
893 if (err) { 893 if (err) {
894 mutex_unlock(&audit_filter_mutex); 894 mutex_unlock(&audit_filter_mutex);
895 goto error; 895 goto error;
896 } 896 }
897 /* entry->rule.watch may have changed during audit_add_watch() */
898 watch = entry->rule.watch;
899 h = audit_hash_ino((u32)audit_watch_inode(watch));
900 list = &audit_inode_hash[h];
901 } 897 }
902 if (tree) { 898 if (tree) {
903 err = audit_add_tree_rule(&entry->rule); 899 err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
949 struct audit_watch *watch = entry->rule.watch; 945 struct audit_watch *watch = entry->rule.watch;
950 struct audit_tree *tree = entry->rule.tree; 946 struct audit_tree *tree = entry->rule.tree;
951 struct list_head *list; 947 struct list_head *list;
952 LIST_HEAD(inotify_list);
953 int ret = 0; 948 int ret = 0;
954#ifdef CONFIG_AUDITSYSCALL 949#ifdef CONFIG_AUDITSYSCALL
955 int dont_count = 0; 950 int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
969 } 964 }
970 965
971 if (e->rule.watch) 966 if (e->rule.watch)
972 audit_remove_watch_rule(&e->rule, &inotify_list); 967 audit_remove_watch_rule(&e->rule);
973 968
974 if (e->rule.tree) 969 if (e->rule.tree)
975 audit_remove_tree_rule(&e->rule); 970 audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
987#endif 982#endif
988 mutex_unlock(&audit_filter_mutex); 983 mutex_unlock(&audit_filter_mutex);
989 984
990 if (!list_empty(&inotify_list))
991 audit_inotify_unregister(&inotify_list);
992
993out: 985out:
994 if (watch) 986 if (watch)
995 audit_put_watch(watch); /* match initial get */ 987 audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
1323{ 1315{
1324 struct audit_entry *entry = container_of(r, struct audit_entry, rule); 1316 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1325 struct audit_entry *nentry; 1317 struct audit_entry *nentry;
1326 struct audit_watch *watch;
1327 struct audit_tree *tree;
1328 int err = 0; 1318 int err = 0;
1329 1319
1330 if (!security_audit_rule_known(r)) 1320 if (!security_audit_rule_known(r))
1331 return 0; 1321 return 0;
1332 1322
1333 watch = r->watch; 1323 nentry = audit_dupe_rule(r);
1334 tree = r->tree;
1335 nentry = audit_dupe_rule(r, watch);
1336 if (IS_ERR(nentry)) { 1324 if (IS_ERR(nentry)) {
1337 /* save the first error encountered for the 1325 /* save the first error encountered for the
1338 * return value */ 1326 * return value */
1339 err = PTR_ERR(nentry); 1327 err = PTR_ERR(nentry);
1340 audit_panic("error updating LSM filters"); 1328 audit_panic("error updating LSM filters");
1341 if (watch) 1329 if (r->watch)
1342 list_del(&r->rlist); 1330 list_del(&r->rlist);
1343 list_del_rcu(&entry->list); 1331 list_del_rcu(&entry->list);
1344 list_del(&r->list); 1332 list_del(&r->list);
1345 } else { 1333 } else {
1346 if (watch) { 1334 if (r->watch || r->tree)
1347 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1348 list_del(&r->rlist);
1349 } else if (tree)
1350 list_replace_init(&r->rlist, &nentry->rule.rlist); 1335 list_replace_init(&r->rlist, &nentry->rule.rlist);
1351 list_replace_rcu(&entry->list, &nentry->list); 1336 list_replace_rcu(&entry->list, &nentry->list);
1352 list_replace(&r->list, &nentry->rule.list); 1337 list_replace(&r->list, &nentry->rule.list);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..b87a63beb66c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
65#include <linux/binfmts.h> 65#include <linux/binfmts.h>
66#include <linux/highmem.h> 66#include <linux/highmem.h>
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/inotify.h>
69#include <linux/capability.h> 68#include <linux/capability.h>
70#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
71 70
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
549 } 548 }
550 break; 549 break;
551 case AUDIT_WATCH: 550 case AUDIT_WATCH:
552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) 551 if (name)
553 result = (name->dev == audit_watch_dev(rule->watch) && 552 result = audit_watch_compare(rule->watch, name->ino, name->dev);
554 name->ino == audit_watch_inode(rule->watch));
555 break; 553 break;
556 case AUDIT_DIR: 554 case AUDIT_DIR:
557 if (ctx) 555 if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
1726 struct audit_tree_refs *p; 1724 struct audit_tree_refs *p;
1727 struct audit_chunk *chunk; 1725 struct audit_chunk *chunk;
1728 int count; 1726 int count;
1729 if (likely(list_empty(&inode->inotify_watches))) 1727 if (likely(hlist_empty(&inode->i_fsnotify_marks)))
1730 return; 1728 return;
1731 context = current->audit_context; 1729 context = current->audit_context;
1732 p = context->trees; 1730 p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
1769 seq = read_seqbegin(&rename_lock); 1767 seq = read_seqbegin(&rename_lock);
1770 for(;;) { 1768 for(;;) {
1771 struct inode *inode = d->d_inode; 1769 struct inode *inode = d->d_inode;
1772 if (inode && unlikely(!list_empty(&inode->inotify_watches))) { 1770 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
1773 struct audit_chunk *chunk; 1771 struct audit_chunk *chunk;
1774 chunk = audit_tree_lookup(inode); 1772 chunk = audit_tree_lookup(inode);
1775 if (chunk) { 1773 if (chunk) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3ac6f5b0a64b..d83cab06da87 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = {
1623 .kill_sb = cgroup_kill_sb, 1623 .kill_sb = cgroup_kill_sb,
1624}; 1624};
1625 1625
1626static struct kobject *cgroup_kobj;
1627
1626static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1628static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1627{ 1629{
1628 return dentry->d_fsdata; 1630 return dentry->d_fsdata;
@@ -1788,6 +1790,29 @@ out:
1788 return retval; 1790 return retval;
1789} 1791}
1790 1792
1793/**
1794 * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
1795 * @tsk: the task to be attached
1796 */
1797int cgroup_attach_task_current_cg(struct task_struct *tsk)
1798{
1799 struct cgroupfs_root *root;
1800 struct cgroup *cur_cg;
1801 int retval = 0;
1802
1803 cgroup_lock();
1804 for_each_active_root(root) {
1805 cur_cg = task_cgroup_from_root(current, root);
1806 retval = cgroup_attach_task(cur_cg, tsk);
1807 if (retval)
1808 break;
1809 }
1810 cgroup_unlock();
1811
1812 return retval;
1813}
1814EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
1815
1791/* 1816/*
1792 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1817 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1793 * held. May take task_lock of task 1818 * held. May take task_lock of task
@@ -3871,9 +3896,18 @@ int __init cgroup_init(void)
3871 hhead = css_set_hash(init_css_set.subsys); 3896 hhead = css_set_hash(init_css_set.subsys);
3872 hlist_add_head(&init_css_set.hlist, hhead); 3897 hlist_add_head(&init_css_set.hlist, hhead);
3873 BUG_ON(!init_root_id(&rootnode)); 3898 BUG_ON(!init_root_id(&rootnode));
3899
3900 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
3901 if (!cgroup_kobj) {
3902 err = -ENOMEM;
3903 goto out;
3904 }
3905
3874 err = register_filesystem(&cgroup_fs_type); 3906 err = register_filesystem(&cgroup_fs_type);
3875 if (err < 0) 3907 if (err < 0) {
3908 kobject_put(cgroup_kobj);
3876 goto out; 3909 goto out;
3910 }
3877 3911
3878 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 3912 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
3879 3913
diff --git a/kernel/compat.c b/kernel/compat.c
index 5adab05a3172..e167efce8423 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 279 struct compat_rlimit __user *rlim)
280{ 280{
281 struct rlimit r; 281 struct rlimit r;
282 int ret;
283 mm_segment_t old_fs = get_fs ();
284
285 if (resource >= RLIM_NLIMITS)
286 return -EINVAL;
287 282
288 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || 283 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
289 __get_user(r.rlim_cur, &rlim->rlim_cur) || 284 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
294 r.rlim_cur = RLIM_INFINITY; 289 r.rlim_cur = RLIM_INFINITY;
295 if (r.rlim_max == COMPAT_RLIM_INFINITY) 290 if (r.rlim_max == COMPAT_RLIM_INFINITY)
296 r.rlim_max = RLIM_INFINITY; 291 r.rlim_max = RLIM_INFINITY;
297 set_fs(KERNEL_DS); 292 return do_prlimit(current, resource, &r, NULL);
298 ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
299 set_fs(old_fs);
300 return ret;
301} 293}
302 294
303#ifdef COMPAT_RLIM_OLD_INFINITY 295#ifdef COMPAT_RLIM_OLD_INFINITY
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
329 321
330#endif 322#endif
331 323
332asmlinkage long compat_sys_getrlimit (unsigned int resource, 324asmlinkage long compat_sys_getrlimit(unsigned int resource,
333 struct compat_rlimit __user *rlim) 325 struct compat_rlimit __user *rlim)
334{ 326{
335 struct rlimit r; 327 struct rlimit r;
336 int ret; 328 int ret;
337 mm_segment_t old_fs = get_fs();
338 329
339 set_fs(KERNEL_DS); 330 ret = do_prlimit(current, resource, NULL, &r);
340 ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
341 set_fs(old_fs);
342 if (!ret) { 331 if (!ret) {
343 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 332 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
344 r.rlim_cur = COMPAT_RLIM_INFINITY; 333 r.rlim_cur = COMPAT_RLIM_INFINITY;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
235 return -EINVAL; 235 return -EINVAL;
236 236
237 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
238 set_cpu_active(cpu, false);
239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
240 if (err) { 239 if (err) {
241 set_cpu_active(cpu, true);
242
243 nr_calls--; 240 nr_calls--;
244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 241 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
245 printk("%s: attempt to take down CPU %u failed\n", 242 printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 246
250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 247 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
251 if (err) { 248 if (err) {
252 set_cpu_active(cpu, true);
253 /* CPU didn't die: tell everyone. Can't complain. */ 249 /* CPU didn't die: tell everyone. Can't complain. */
254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 250 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
255 251
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 goto out_notify; 317 goto out_notify;
322 BUG_ON(!cpu_online(cpu)); 318 BUG_ON(!cpu_online(cpu));
323 319
324 set_cpu_active(cpu, true);
325
326 /* Now call notifier in preparation. */ 320 /* Now call notifier in preparation. */
327 cpu_notify(CPU_ONLINE | mod, hcpu); 321 cpu_notify(CPU_ONLINE | mod, hcpu);
328 322
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
105 /* for custom sched domain */ 105 /* for custom sched domain */
106 int relax_domain_level; 106 int relax_domain_level;
107 107
108 /* used for walking a cpuset heirarchy */ 108 /* used for walking a cpuset hierarchy */
109 struct list_head stack_list; 109 struct list_head stack_list;
110}; 110};
111 111
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2113 * but making no active use of cpusets. 2113 * but making no active use of cpusets.
2114 * 2114 *
2115 * This routine ensures that top_cpuset.cpus_allowed tracks 2115 * This routine ensures that top_cpuset.cpus_allowed tracks
2116 * cpu_online_map on each CPU hotplug (cpuhp) event. 2116 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2117 * 2117 *
2118 * Called within get_online_cpus(). Needs to call cgroup_lock() 2118 * Called within get_online_cpus(). Needs to call cgroup_lock()
2119 * before calling generate_sched_domains(). 2119 * before calling generate_sched_domains().
2120 */ 2120 */
2121static int cpuset_track_online_cpus(struct notifier_block *unused_nb, 2121void cpuset_update_active_cpus(void)
2122 unsigned long phase, void *unused_cpu)
2123{ 2122{
2124 struct sched_domain_attr *attr; 2123 struct sched_domain_attr *attr;
2125 cpumask_var_t *doms; 2124 cpumask_var_t *doms;
2126 int ndoms; 2125 int ndoms;
2127 2126
2128 switch (phase) {
2129 case CPU_ONLINE:
2130 case CPU_ONLINE_FROZEN:
2131 case CPU_DOWN_PREPARE:
2132 case CPU_DOWN_PREPARE_FROZEN:
2133 case CPU_DOWN_FAILED:
2134 case CPU_DOWN_FAILED_FROZEN:
2135 break;
2136
2137 default:
2138 return NOTIFY_DONE;
2139 }
2140
2141 cgroup_lock(); 2127 cgroup_lock();
2142 mutex_lock(&callback_mutex); 2128 mutex_lock(&callback_mutex);
2143 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2129 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2148 2134
2149 /* Have scheduler rebuild the domains */ 2135 /* Have scheduler rebuild the domains */
2150 partition_sched_domains(ndoms, doms, attr); 2136 partition_sched_domains(ndoms, doms, attr);
2151
2152 return NOTIFY_OK;
2153} 2137}
2154 2138
2155#ifdef CONFIG_MEMORY_HOTPLUG 2139#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
2203 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2187 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2204 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2188 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2205 2189
2206 hotcpu_notifier(cpuset_track_online_cpus, 0);
2207 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2190 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2208 2191
2209 cpuset_wq = create_singlethread_workqueue("cpuset"); 2192 cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 8bc5eeffec8a..3c2d4972d235 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -605,6 +605,8 @@ cpu_master_loop:
605 if (dbg_kdb_mode) { 605 if (dbg_kdb_mode) {
606 kgdb_connected = 1; 606 kgdb_connected = 1;
607 error = kdb_stub(ks); 607 error = kdb_stub(ks);
608 if (error == -1)
609 continue;
608 kgdb_connected = 0; 610 kgdb_connected = 0;
609 } else { 611 } else {
610 error = gdb_serial_stub(ks); 612 error = gdb_serial_stub(ks);
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e8fd6868682d..481a7bd2dfe7 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2000-2001 VERITAS Software Corporation. 6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation 7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> 8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> 9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> 10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. 11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc. 12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES +
52 * GDB remote protocol parser: 52 * GDB remote protocol parser:
53 */ 53 */
54 54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB 55#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void) 56static int gdbstub_read_wait(void)
68{ 57{
@@ -123,8 +112,8 @@ static void get_packet(char *buffer)
123 buffer[count] = 0; 112 buffer[count] = 0;
124 113
125 if (ch == '#') { 114 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4; 115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait()); 116 xmitcsum += hex_to_bin(gdbstub_read_wait());
128 117
129 if (checksum != xmitcsum) 118 if (checksum != xmitcsum)
130 /* failed checksum */ 119 /* failed checksum */
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
236 * buf. Return a pointer to the last char put in buf (null). May 225 * buf. Return a pointer to the last char put in buf (null). May
237 * return an error. 226 * return an error.
238 */ 227 */
239int kgdb_mem2hex(char *mem, char *buf, int count) 228char *kgdb_mem2hex(char *mem, char *buf, int count)
240{ 229{
241 char *tmp; 230 char *tmp;
242 int err; 231 int err;
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
248 tmp = buf + count; 237 tmp = buf + count;
249 238
250 err = probe_kernel_read(tmp, mem, count); 239 err = probe_kernel_read(tmp, mem, count);
251 if (!err) { 240 if (err)
252 while (count > 0) { 241 return NULL;
253 buf = pack_hex_byte(buf, *tmp); 242 while (count > 0) {
254 tmp++; 243 buf = pack_hex_byte(buf, *tmp);
255 count--; 244 tmp++;
256 } 245 count--;
257
258 *buf = 0;
259 } 246 }
247 *buf = 0;
260 248
261 return err; 249 return buf;
262} 250}
263 251
264/* 252/*
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
280 tmp_hex = tmp_raw - 1; 268 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) { 269 while (tmp_hex >= buf) {
282 tmp_raw--; 270 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--); 271 *tmp_raw = hex_to_bin(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4; 272 *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
285 } 273 }
286 274
287 return probe_kernel_write(mem, tmp_raw, count); 275 return probe_kernel_write(mem, tmp_raw, count);
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
304 (*ptr)++; 292 (*ptr)++;
305 } 293 }
306 while (**ptr) { 294 while (**ptr) {
307 hex_val = hex(**ptr); 295 hex_val = hex_to_bin(**ptr);
308 if (hex_val < 0) 296 if (hex_val < 0)
309 break; 297 break;
310 298
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
339 return probe_kernel_write(mem, c, size); 327 return probe_kernel_write(mem, c, size);
340} 328}
341 329
330#if DBG_MAX_REG_NUM > 0
331void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
332{
333 int i;
334 int idx = 0;
335 char *ptr = (char *)gdb_regs;
336
337 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
338 dbg_get_reg(i, ptr + idx, regs);
339 idx += dbg_reg_def[i].size;
340 }
341}
342
343void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
344{
345 int i;
346 int idx = 0;
347 char *ptr = (char *)gdb_regs;
348
349 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
350 dbg_set_reg(i, ptr + idx, regs);
351 idx += dbg_reg_def[i].size;
352 }
353}
354#endif /* DBG_MAX_REG_NUM > 0 */
355
342/* Write memory due to an 'M' or 'X' packet. */ 356/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary) 357static int write_mem_msg(int binary)
344{ 358{
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error)
378 * remapped to negative TIDs. 392 * remapped to negative TIDs.
379 */ 393 */
380 394
381#define BUF_THREAD_ID_SIZE 16 395#define BUF_THREAD_ID_SIZE 8
382 396
383static char *pack_threadid(char *pkt, unsigned char *id) 397static char *pack_threadid(char *pkt, unsigned char *id)
384{ 398{
385 char *limit; 399 unsigned char *limit;
400 int lzero = 1;
401
402 limit = id + (BUF_THREAD_ID_SIZE / 2);
403 while (id < limit) {
404 if (!lzero || *id != 0) {
405 pkt = pack_hex_byte(pkt, *id);
406 lzero = 0;
407 }
408 id++;
409 }
386 410
387 limit = pkt + BUF_THREAD_ID_SIZE; 411 if (lzero)
388 while (pkt < limit) 412 pkt = pack_hex_byte(pkt, 0);
389 pkt = pack_hex_byte(pkt, *id++);
390 413
391 return pkt; 414 return pkt;
392} 415}
393 416
394static void int_to_threadref(unsigned char *id, int value) 417static void int_to_threadref(unsigned char *id, int value)
395{ 418{
396 unsigned char *scan; 419 put_unaligned_be32(value, id);
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403} 420}
404 421
405static struct task_struct *getthread(struct pt_regs *regs, int tid) 422static struct task_struct *getthread(struct pt_regs *regs, int tid)
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 480 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464} 481}
465 482
466/* Handle the 'g' get registers request */ 483static void gdb_get_regs_helper(struct kgdb_state *ks)
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{ 484{
469 struct task_struct *thread; 485 struct task_struct *thread;
470 void *local_debuggerinfo; 486 void *local_debuggerinfo;
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
505 */ 521 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread); 522 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 } 523 }
524}
525
526/* Handle the 'g' get registers request */
527static void gdb_cmd_getregs(struct kgdb_state *ks)
528{
529 gdb_get_regs_helper(ks);
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); 530 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509} 531}
510 532
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
527 char *ptr = &remcom_in_buffer[1]; 549 char *ptr = &remcom_in_buffer[1];
528 unsigned long length; 550 unsigned long length;
529 unsigned long addr; 551 unsigned long addr;
530 int err; 552 char *err;
531 553
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && 554 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) { 555 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); 556 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err) 557 if (!err)
536 error_packet(remcom_out_buffer, err); 558 error_packet(remcom_out_buffer, -EINVAL);
537 } else { 559 } else {
538 error_packet(remcom_out_buffer, -EINVAL); 560 error_packet(remcom_out_buffer, -EINVAL);
539 } 561 }
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
550 strcpy(remcom_out_buffer, "OK"); 572 strcpy(remcom_out_buffer, "OK");
551} 573}
552 574
575#if DBG_MAX_REG_NUM > 0
576static char *gdb_hex_reg_helper(int regnum, char *out)
577{
578 int i;
579 int offset = 0;
580
581 for (i = 0; i < regnum; i++)
582 offset += dbg_reg_def[i].size;
583 return kgdb_mem2hex((char *)gdb_regs + offset, out,
584 dbg_reg_def[i].size);
585}
586
587/* Handle the 'p' individual regster get */
588static void gdb_cmd_reg_get(struct kgdb_state *ks)
589{
590 unsigned long regnum;
591 char *ptr = &remcom_in_buffer[1];
592
593 kgdb_hex2long(&ptr, &regnum);
594 if (regnum >= DBG_MAX_REG_NUM) {
595 error_packet(remcom_out_buffer, -EINVAL);
596 return;
597 }
598 gdb_get_regs_helper(ks);
599 gdb_hex_reg_helper(regnum, remcom_out_buffer);
600}
601
602/* Handle the 'P' individual regster set */
603static void gdb_cmd_reg_set(struct kgdb_state *ks)
604{
605 unsigned long regnum;
606 char *ptr = &remcom_in_buffer[1];
607 int i = 0;
608
609 kgdb_hex2long(&ptr, &regnum);
610 if (*ptr++ != '=' ||
611 !(!kgdb_usethread || kgdb_usethread == current) ||
612 !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
613 error_packet(remcom_out_buffer, -EINVAL);
614 return;
615 }
616 memset(gdb_regs, 0, sizeof(gdb_regs));
617 while (i < sizeof(gdb_regs) * 2)
618 if (hex_to_bin(ptr[i]) >= 0)
619 i++;
620 else
621 break;
622 i = i / 2;
623 kgdb_hex2mem(ptr, (char *)gdb_regs, i);
624 dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
625 strcpy(remcom_out_buffer, "OK");
626}
627#endif /* DBG_MAX_REG_NUM > 0 */
628
553/* Handle the 'X' memory binary write bytes */ 629/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks) 630static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{ 631{
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
612{ 688{
613 struct task_struct *g; 689 struct task_struct *g;
614 struct task_struct *p; 690 struct task_struct *p;
615 unsigned char thref[8]; 691 unsigned char thref[BUF_THREAD_ID_SIZE];
616 char *ptr; 692 char *ptr;
617 int i; 693 int i;
618 int cpu; 694 int cpu;
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
632 for_each_online_cpu(cpu) { 708 for_each_online_cpu(cpu) {
633 ks->thr_query = 0; 709 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2); 710 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref); 711 ptr = pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ','; 712 *(ptr++) = ',';
638 i++; 713 i++;
639 } 714 }
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
642 do_each_thread(g, p) { 717 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) { 718 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid); 719 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref); 720 ptr = pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ','; 721 *(ptr++) = ',';
648 ks->thr_query++; 722 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) 723 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
858 int error = 0; 932 int error = 0;
859 int tmp; 933 int tmp;
860 934
861 /* Clear the out buffer. */ 935 /* Initialize comm buffer and globals. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); 936 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
937 kgdb_usethread = kgdb_info[ks->cpu].task;
938 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
939 ks->pass_exception = 0;
863 940
864 if (kgdb_connected) { 941 if (kgdb_connected) {
865 unsigned char thref[8]; 942 unsigned char thref[BUF_THREAD_ID_SIZE];
866 char *ptr; 943 char *ptr;
867 944
868 /* Reply to host that an exception has occurred */ 945 /* Reply to host that an exception has occurred */
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
876 put_packet(remcom_out_buffer); 953 put_packet(remcom_out_buffer);
877 } 954 }
878 955
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) { 956 while (1) {
884 error = 0; 957 error = 0;
885 958
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 977 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks); 978 gdb_cmd_memwrite(ks);
906 break; 979 break;
980#if DBG_MAX_REG_NUM > 0
981 case 'p': /* pXX Return gdb register XX (in hex) */
982 gdb_cmd_reg_get(ks);
983 break;
984 case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
985 gdb_cmd_reg_set(ks);
986 break;
987#endif /* DBG_MAX_REG_NUM > 0 */
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 988 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks); 989 gdb_cmd_binwrite(ks);
909 break; 990 break;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..28b844118bbd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
312 312
313 if (endp == arg) { 313 if (endp == arg) {
314 /* 314 /*
315 * Try base 16, for us folks too lazy to type the 315 * Also try base 16, for us folks too lazy to type the
316 * leading 0x... 316 * leading 0x...
317 */ 317 */
318 val = simple_strtoul(arg, &endp, 16); 318 val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
325 return 0; 325 return 0;
326} 326}
327 327
328int kdbgetu64arg(const char *arg, u64 *value)
329{
330 char *endp;
331 u64 val;
332
333 val = simple_strtoull(arg, &endp, 0);
334
335 if (endp == arg) {
336
337 val = simple_strtoull(arg, &endp, 16);
338 if (endp == arg)
339 return KDB_BADINT;
340 }
341
342 *value = val;
343
344 return 0;
345}
346
328/* 347/*
329 * kdb_set - This function implements the 'set' command. Alter an 348 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one. 349 * existing environment variable or create a new one.
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
1770 */ 1789 */
1771static int kdb_rd(int argc, const char **argv) 1790static int kdb_rd(int argc, const char **argv)
1772{ 1791{
1773 int diag = kdb_check_regs(); 1792 int len = kdb_check_regs();
1774 if (diag) 1793#if DBG_MAX_REG_NUM > 0
1775 return diag; 1794 int i;
1795 char *rname;
1796 int rsize;
1797 u64 reg64;
1798 u32 reg32;
1799 u16 reg16;
1800 u8 reg8;
1801
1802 if (len)
1803 return len;
1804
1805 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1806 rsize = dbg_reg_def[i].size * 2;
1807 if (rsize > 16)
1808 rsize = 2;
1809 if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
1810 len = 0;
1811 kdb_printf("\n");
1812 }
1813 if (len)
1814 len += kdb_printf(" ");
1815 switch(dbg_reg_def[i].size * 8) {
1816 case 8:
1817 rname = dbg_get_reg(i, &reg8, kdb_current_regs);
1818 if (!rname)
1819 break;
1820 len += kdb_printf("%s: %02x", rname, reg8);
1821 break;
1822 case 16:
1823 rname = dbg_get_reg(i, &reg16, kdb_current_regs);
1824 if (!rname)
1825 break;
1826 len += kdb_printf("%s: %04x", rname, reg16);
1827 break;
1828 case 32:
1829 rname = dbg_get_reg(i, &reg32, kdb_current_regs);
1830 if (!rname)
1831 break;
1832 len += kdb_printf("%s: %08x", rname, reg32);
1833 break;
1834 case 64:
1835 rname = dbg_get_reg(i, &reg64, kdb_current_regs);
1836 if (!rname)
1837 break;
1838 len += kdb_printf("%s: %016llx", rname, reg64);
1839 break;
1840 default:
1841 len += kdb_printf("%s: ??", dbg_reg_def[i].name);
1842 }
1843 }
1844 kdb_printf("\n");
1845#else
1846 if (len)
1847 return len;
1776 1848
1777 kdb_dumpregs(kdb_current_regs); 1849 kdb_dumpregs(kdb_current_regs);
1850#endif
1778 return 0; 1851 return 0;
1779} 1852}
1780 1853
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
1782 * kdb_rm - This function implements the 'rm' (register modify) command. 1855 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents 1856 * rm register-name new-contents
1784 * Remarks: 1857 * Remarks:
1785 * Currently doesn't allow modification of control or 1858 * Allows register modification with the same restrictions as gdb
1786 * debug registers.
1787 */ 1859 */
1788static int kdb_rm(int argc, const char **argv) 1860static int kdb_rm(int argc, const char **argv)
1789{ 1861{
1862#if DBG_MAX_REG_NUM > 0
1790 int diag; 1863 int diag;
1791 int ind = 0; 1864 const char *rname;
1792 unsigned long contents; 1865 int i;
1866 u64 reg64;
1867 u32 reg32;
1868 u16 reg16;
1869 u8 reg8;
1793 1870
1794 if (argc != 2) 1871 if (argc != 2)
1795 return KDB_ARGCOUNT; 1872 return KDB_ARGCOUNT;
1796 /* 1873 /*
1797 * Allow presence or absence of leading '%' symbol. 1874 * Allow presence or absence of leading '%' symbol.
1798 */ 1875 */
1799 if (argv[1][0] == '%') 1876 rname = argv[1];
1800 ind = 1; 1877 if (*rname == '%')
1878 rname++;
1801 1879
1802 diag = kdbgetularg(argv[2], &contents); 1880 diag = kdbgetu64arg(argv[2], &reg64);
1803 if (diag) 1881 if (diag)
1804 return diag; 1882 return diag;
1805 1883
1806 diag = kdb_check_regs(); 1884 diag = kdb_check_regs();
1807 if (diag) 1885 if (diag)
1808 return diag; 1886 return diag;
1887
1888 diag = KDB_BADREG;
1889 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1890 if (strcmp(rname, dbg_reg_def[i].name) == 0) {
1891 diag = 0;
1892 break;
1893 }
1894 }
1895 if (!diag) {
1896 switch(dbg_reg_def[i].size * 8) {
1897 case 8:
1898 reg8 = reg64;
1899 dbg_set_reg(i, &reg8, kdb_current_regs);
1900 break;
1901 case 16:
1902 reg16 = reg64;
1903 dbg_set_reg(i, &reg16, kdb_current_regs);
1904 break;
1905 case 32:
1906 reg32 = reg64;
1907 dbg_set_reg(i, &reg32, kdb_current_regs);
1908 break;
1909 case 64:
1910 dbg_set_reg(i, &reg64, kdb_current_regs);
1911 break;
1912 }
1913 }
1914 return diag;
1915#else
1809 kdb_printf("ERROR: Register set currently not implemented\n"); 1916 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0; 1917 return 0;
1918#endif
1811} 1919}
1812 1920
1813#if defined(CONFIG_MAGIC_SYSRQ) 1921#if defined(CONFIG_MAGIC_SYSRQ)
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val)
2440 */ 2548 */
2441static int kdb_summary(int argc, const char **argv) 2549static int kdb_summary(int argc, const char **argv)
2442{ 2550{
2551 struct timespec now;
2443 struct kdb_tm tm; 2552 struct kdb_tm tm;
2444 struct sysinfo val; 2553 struct sysinfo val;
2445 2554
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv)
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname); 2563 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION)); 2564 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456 2565
2457 kdb_gmtime(&xtime, &tm); 2566 now = __current_kernel_time();
2567 kdb_gmtime(&now, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " 2568 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n", 2569 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, 2570 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..c438f545a321 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 144extern int kdb_putword(unsigned long, unsigned long, size_t);
145 145
146extern int kdbgetularg(const char *, unsigned long *); 146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *); 147extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **); 149 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *); 150extern int kdbgetsymval(const char *, kdb_symtab_t *);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index dd62f8e714ca..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -134,23 +134,14 @@ unregister:
134 return 0; 134 return 0;
135} 135}
136 136
137int 137int __set_personality(unsigned int personality)
138__set_personality(unsigned int personality)
139{ 138{
140 struct exec_domain *ep, *oep; 139 struct exec_domain *oep = current_thread_info()->exec_domain;
141
142 ep = lookup_exec_domain(personality);
143 if (ep == current_thread_info()->exec_domain) {
144 current->personality = personality;
145 module_put(ep->module);
146 return 0;
147 }
148 140
141 current_thread_info()->exec_domain = lookup_exec_domain(personality);
149 current->personality = personality; 142 current->personality = personality;
150 oep = current_thread_info()->exec_domain;
151 current_thread_info()->exec_domain = ep;
152
153 module_put(oep->module); 143 module_put(oep->module);
144
154 return 0; 145 return 0;
155} 146}
156 147
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 183{
193 unsigned int old = current->personality; 184 unsigned int old = current->personality;
194 185
195 if (personality != 0xffffffff) { 186 if (personality != 0xffffffff)
196 set_personality(personality); 187 set_personality(personality);
197 if (current->personality != personality)
198 return -EINVAL;
199 }
200 188
201 return old; 189 return old;
202} 190}
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..98b450876f93 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
899 tty_audit_fork(sig); 899 tty_audit_fork(sig);
900 900
901 sig->oom_adj = current->signal->oom_adj; 901 sig->oom_adj = current->signal->oom_adj;
902 sig->oom_score_adj = current->signal->oom_score_adj;
902 903
903 return 0; 904 return 0;
904} 905}
@@ -907,7 +908,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
907{ 908{
908 unsigned long new_flags = p->flags; 909 unsigned long new_flags = p->flags;
909 910
910 new_flags &= ~PF_SUPERPRIV; 911 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
911 new_flags |= PF_FORKNOEXEC; 912 new_flags |= PF_FORKNOEXEC;
912 new_flags |= PF_STARTING; 913 new_flags |= PF_STARTING;
913 p->flags = new_flags; 914 p->flags = new_flags;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..ce669174f355 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
96 xtim = timespec_to_ktime(xts); 96 xtim = timespec_to_ktime(xts);
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
144static int hrtimer_get_target(int this_cpu, int pinned) 144static int hrtimer_get_target(int this_cpu, int pinned)
145{ 145{
146#ifdef CONFIG_NO_HZ 146#ifdef CONFIG_NO_HZ
147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { 147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
148 int preferred_cpu = get_nohz_load_balancer(); 148 return get_nohz_timer_target();
149
150 if (preferred_cpu >= 0)
151 return preferred_cpu;
152 }
153#endif 149#endif
154 return this_cpu; 150 return this_cpu;
155} 151}
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
612static void retrigger_next_event(void *arg) 608static void retrigger_next_event(void *arg)
613{ 609{
614 struct hrtimer_cpu_base *base; 610 struct hrtimer_cpu_base *base;
615 struct timespec realtime_offset; 611 struct timespec realtime_offset, wtm;
616 unsigned long seq; 612 unsigned long seq;
617 613
618 if (!hrtimer_hres_active()) 614 if (!hrtimer_hres_active())
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg)
620 616
621 do { 617 do {
622 seq = read_seqbegin(&xtime_lock); 618 seq = read_seqbegin(&xtime_lock);
623 set_normalized_timespec(&realtime_offset, 619 wtm = __get_wall_to_monotonic();
624 -wall_to_monotonic.tv_sec,
625 -wall_to_monotonic.tv_nsec);
626 } while (read_seqretry(&xtime_lock, seq)); 620 } while (read_seqretry(&xtime_lock, seq));
621 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
627 622
628 base = &__get_cpu_var(hrtimer_bases); 623 base = &__get_cpu_var(hrtimer_bases);
629 624
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..d71a987fd2bf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/list.h>
44#include <linux/cpu.h> 45#include <linux/cpu.h>
45#include <linux/smp.h> 46#include <linux/smp.h>
46 47
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62 63
63static int nr_slots[TYPE_MAX]; 64static int nr_slots[TYPE_MAX];
64 65
66/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head);
68
65static int constraints_initialized; 69static int constraints_initialized;
66 70
67/* Gather the number of total pinned and un-pinned bp in a cpuset */ 71/* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
103 return 0; 107 return 0;
104} 108}
105 109
106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) 110/*
111 * Count the number of breakpoints of the same type and same task.
112 * The given event must not be on the list.
113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
107{ 115{
108 struct perf_event_context *ctx = tsk->perf_event_ctxp; 116 struct perf_event_context *ctx = bp->ctx;
109 struct list_head *list; 117 struct perf_event *iter;
110 struct perf_event *bp;
111 unsigned long flags;
112 int count = 0; 118 int count = 0;
113 119
114 if (WARN_ONCE(!ctx, "No perf context for this task")) 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
115 return 0; 121 if (iter->ctx == ctx && find_slot_idx(iter) == type)
116 122 count += hw_breakpoint_weight(iter);
117 list = &ctx->event_list;
118
119 raw_spin_lock_irqsave(&ctx->lock, flags);
120
121 /*
122 * The current breakpoint counter is not included in the list
123 * at the open() callback time
124 */
125 list_for_each_entry(bp, list, event_entry) {
126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
129 } 123 }
130 124
131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
132
133 return count; 125 return count;
134} 126}
135 127
@@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
149 if (!tsk) 141 if (!tsk)
150 slots->pinned += max_task_bp_pinned(cpu, type); 142 slots->pinned += max_task_bp_pinned(cpu, type);
151 else 143 else
152 slots->pinned += task_bp_pinned(tsk, type); 144 slots->pinned += task_bp_pinned(bp, type);
153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
154 146
155 return; 147 return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
162 if (!tsk) 154 if (!tsk)
163 nr += max_task_bp_pinned(cpu, type); 155 nr += max_task_bp_pinned(cpu, type);
164 else 156 else
165 nr += task_bp_pinned(tsk, type); 157 nr += task_bp_pinned(bp, type);
166 158
167 if (nr > slots->pinned) 159 if (nr > slots->pinned)
168 slots->pinned = nr; 160 slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
188/* 180/*
189 * Add a pinned breakpoint for the given task in our constraint table 181 * Add a pinned breakpoint for the given task in our constraint table
190 */ 182 */
191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, 183static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
192 enum bp_type_idx type, int weight) 184 enum bp_type_idx type, int weight)
193{ 185{
194 unsigned int *tsk_pinned; 186 unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
196 int old_idx = 0; 188 int old_idx = 0;
197 int idx = 0; 189 int idx = 0;
198 190
199 old_count = task_bp_pinned(tsk, type); 191 old_count = task_bp_pinned(bp, type);
200 old_idx = old_count - 1; 192 old_idx = old_count - 1;
201 idx = old_idx + weight; 193 idx = old_idx + weight;
202 194
195 /* tsk_pinned[n] is the number of tasks having n breakpoints */
203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); 196 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
204 if (enable) { 197 if (enable) {
205 tsk_pinned[idx]++; 198 tsk_pinned[idx]++;
@@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
222 int cpu = bp->cpu; 215 int cpu = bp->cpu;
223 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->ctx->task;
224 217
218 /* Pinned counter cpu profiling */
219 if (!tsk) {
220
221 if (enable)
222 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
223 else
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
225 return;
226 }
227
225 /* Pinned counter task profiling */ 228 /* Pinned counter task profiling */
226 if (tsk) {
227 if (cpu >= 0) {
228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
229 return;
230 }
231 229
230 if (!enable)
231 list_del(&bp->hw.bp_list);
232
233 if (cpu >= 0) {
234 toggle_bp_task_slot(bp, cpu, enable, type, weight);
235 } else {
232 for_each_online_cpu(cpu) 236 for_each_online_cpu(cpu)
233 toggle_bp_task_slot(tsk, cpu, enable, type, weight); 237 toggle_bp_task_slot(bp, cpu, enable, type, weight);
234 return;
235 } 238 }
236 239
237 /* Pinned counter cpu profiling */
238 if (enable) 240 if (enable)
239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 241 list_add_tail(&bp->hw.bp_list, &bp_task_head);
240 else 242}
241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; 243
244/*
245 * Function to perform processor-specific cleanup during unregistration
246 */
247__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
248{
249 /*
250 * A weak stub function here for those archs that don't define
251 * it inside arch/.../kernel/hw_breakpoint.c
252 */
242} 253}
243 254
244/* 255/*
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
301 weight = hw_breakpoint_weight(bp); 312 weight = hw_breakpoint_weight(bp);
302 313
303 fetch_bp_busy_slots(&slots, bp, type); 314 fetch_bp_busy_slots(&slots, bp, type);
315 /*
316 * Simulate the addition of this breakpoint to the constraints
317 * and see the result.
318 */
304 fetch_this_slot(&slots, weight); 319 fetch_this_slot(&slots, weight);
305 320
306 /* Flexible counters need to keep at least one slot */ 321 /* Flexible counters need to keep at least one slot */
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
339{ 354{
340 mutex_lock(&nr_bp_mutex); 355 mutex_lock(&nr_bp_mutex);
341 356
357 arch_unregister_hw_breakpoint(bp);
342 __release_bp_slot(bp); 358 __release_bp_slot(bp);
343 359
344 mutex_unlock(&nr_bp_mutex); 360 mutex_unlock(&nr_bp_mutex);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..c3003e9d91a3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
217{ 217{
218 if (suspend) { 218 if (suspend) {
219 if (!desc->action || (desc->action->flags & IRQF_TIMER)) 219 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
220 return; 220 return;
221 desc->status |= IRQ_SUSPENDED; 221 desc->status |= IRQ_SUSPENDED;
222 } 222 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
18#include <linux/freezer.h>
17#include <trace/events/sched.h> 19#include <trace/events/sched.h>
18 20
19static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
35 37
36struct kthread { 38struct kthread {
37 int should_stop; 39 int should_stop;
40 void *data;
38 struct completion exited; 41 struct completion exited;
39}; 42};
40 43
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
54} 57}
55EXPORT_SYMBOL(kthread_should_stop); 58EXPORT_SYMBOL(kthread_should_stop);
56 59
60/**
61 * kthread_data - return data value specified on kthread creation
62 * @task: kthread task in question
63 *
64 * Return the data value specified when kthread @task was created.
65 * The caller is responsible for ensuring the validity of @task when
66 * calling this function.
67 */
68void *kthread_data(struct task_struct *task)
69{
70 return to_kthread(task)->data;
71}
72
57static int kthread(void *_create) 73static int kthread(void *_create)
58{ 74{
59 /* Copy data: it's on kthread's stack */ 75 /* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
64 int ret; 80 int ret;
65 81
66 self.should_stop = 0; 82 self.should_stop = 0;
83 self.data = data;
67 init_completion(&self.exited); 84 init_completion(&self.exited);
68 current->vfork_done = &self.exited; 85 current->vfork_done = &self.exited;
69 86
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
247 264
248 return 0; 265 return 0;
249} 266}
267
268/**
269 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker
271 *
272 * This function can be used as @threadfn to kthread_create() or
273 * kthread_run() with @worker_ptr argument pointing to an initialized
274 * kthread_worker. The started kthread will process work_list until
275 * it is stopped with kthread_stop(). A kthread can also call
276 * this function directly after extra initialization.
277 *
278 * Different kthreads can be used for the same kthread_worker as long
279 * as there's only one kthread attached to it at any given time. A
280 * kthread_worker without an attached kthread simply collects queued
281 * kthread_works.
282 */
283int kthread_worker_fn(void *worker_ptr)
284{
285 struct kthread_worker *worker = worker_ptr;
286 struct kthread_work *work;
287
288 WARN_ON(worker->task);
289 worker->task = current;
290repeat:
291 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
292
293 if (kthread_should_stop()) {
294 __set_current_state(TASK_RUNNING);
295 spin_lock_irq(&worker->lock);
296 worker->task = NULL;
297 spin_unlock_irq(&worker->lock);
298 return 0;
299 }
300
301 work = NULL;
302 spin_lock_irq(&worker->lock);
303 if (!list_empty(&worker->work_list)) {
304 work = list_first_entry(&worker->work_list,
305 struct kthread_work, node);
306 list_del_init(&work->node);
307 }
308 spin_unlock_irq(&worker->lock);
309
310 if (work) {
311 __set_current_state(TASK_RUNNING);
312 work->func(work);
313 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
314 work->done_seq = work->queue_seq;
315 smp_mb(); /* mb worker-b1 paired with flush-b0 */
316 if (atomic_read(&work->flushing))
317 wake_up_all(&work->done);
318 } else if (!freezing(current))
319 schedule();
320
321 try_to_freeze();
322 goto repeat;
323}
324EXPORT_SYMBOL_GPL(kthread_worker_fn);
325
326/**
327 * queue_kthread_work - queue a kthread_work
328 * @worker: target kthread_worker
329 * @work: kthread_work to queue
330 *
331 * Queue @work to work processor @worker for async execution. @worker
332 * must have been created with kthread_worker_create(). Returns %true
333 * if @work was successfully queued, %false if it was already pending.
334 */
335bool queue_kthread_work(struct kthread_worker *worker,
336 struct kthread_work *work)
337{
338 bool ret = false;
339 unsigned long flags;
340
341 spin_lock_irqsave(&worker->lock, flags);
342 if (list_empty(&work->node)) {
343 list_add_tail(&work->node, &worker->work_list);
344 work->queue_seq++;
345 if (likely(worker->task))
346 wake_up_process(worker->task);
347 ret = true;
348 }
349 spin_unlock_irqrestore(&worker->lock, flags);
350 return ret;
351}
352EXPORT_SYMBOL_GPL(queue_kthread_work);
353
354/**
355 * flush_kthread_work - flush a kthread_work
356 * @work: work to flush
357 *
358 * If @work is queued or executing, wait for it to finish execution.
359 */
360void flush_kthread_work(struct kthread_work *work)
361{
362 int seq = work->queue_seq;
363
364 atomic_inc(&work->flushing);
365
366 /*
367 * mb flush-b0 paired with worker-b1, to make sure either
368 * worker sees the above increment or we see done_seq update.
369 */
370 smp_mb__after_atomic_inc();
371
372 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
373 wait_event(work->done, seq - work->done_seq <= 0);
374 atomic_dec(&work->flushing);
375
376 /*
377 * rmb flush-b1 paired with worker-b0, to make sure our caller
378 * sees every change made by work->func().
379 */
380 smp_mb__after_atomic_dec();
381}
382EXPORT_SYMBOL_GPL(flush_kthread_work);
383
384struct kthread_flush_work {
385 struct kthread_work work;
386 struct completion done;
387};
388
389static void kthread_flush_work_fn(struct kthread_work *work)
390{
391 struct kthread_flush_work *fwork =
392 container_of(work, struct kthread_flush_work, work);
393 complete(&fwork->done);
394}
395
396/**
397 * flush_kthread_worker - flush all current works on a kthread_worker
398 * @worker: worker to flush
399 *
400 * Wait until all currently executing or pending works on @worker are
401 * finished.
402 */
403void flush_kthread_worker(struct kthread_worker *worker)
404{
405 struct kthread_flush_work fwork = {
406 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
407 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
408 };
409
410 queue_kthread_work(worker, &fwork.work);
411 wait_for_completion(&fwork.done);
412}
413EXPORT_SYMBOL_GPL(flush_kthread_worker);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
146 146
147static inline u64 lockstat_clock(void) 147static inline u64 lockstat_clock(void)
148{ 148{
149 return cpu_clock(smp_processor_id()); 149 return local_clock();
150} 150}
151 151
152static int lock_point(unsigned long points[], unsigned long ip) 152static int lock_point(unsigned long points[], unsigned long ip)
diff --git a/kernel/module.c b/kernel/module.c
index 6c562828c85c..d0b5f8db11b4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
1/* 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
@@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb)
110} 110}
111EXPORT_SYMBOL(unregister_module_notifier); 111EXPORT_SYMBOL(unregister_module_notifier);
112 112
113struct load_info {
114 Elf_Ehdr *hdr;
115 unsigned long len;
116 Elf_Shdr *sechdrs;
117 char *secstrings, *strtab;
118 unsigned long *strmap;
119 unsigned long symoffs, stroffs;
120 struct _ddebug *debug;
121 unsigned int num_debug;
122 struct {
123 unsigned int sym, str, mod, vers, info, pcpu;
124 } index;
125};
126
113/* We require a truly strong try_module_get(): 0 means failure due to 127/* We require a truly strong try_module_get(): 0 means failure due to
114 ongoing or failed initialization etc. */ 128 ongoing or failed initialization etc. */
115static inline int strong_try_module_get(struct module *mod) 129static inline int strong_try_module_get(struct module *mod)
@@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code)
140EXPORT_SYMBOL(__module_put_and_exit); 154EXPORT_SYMBOL(__module_put_and_exit);
141 155
142/* Find a module section: 0 means not found. */ 156/* Find a module section: 0 means not found. */
143static unsigned int find_sec(Elf_Ehdr *hdr, 157static unsigned int find_sec(const struct load_info *info, const char *name)
144 Elf_Shdr *sechdrs,
145 const char *secstrings,
146 const char *name)
147{ 158{
148 unsigned int i; 159 unsigned int i;
149 160
150 for (i = 1; i < hdr->e_shnum; i++) 161 for (i = 1; i < info->hdr->e_shnum; i++) {
162 Elf_Shdr *shdr = &info->sechdrs[i];
151 /* Alloc bit cleared means "ignore it." */ 163 /* Alloc bit cleared means "ignore it." */
152 if ((sechdrs[i].sh_flags & SHF_ALLOC) 164 if ((shdr->sh_flags & SHF_ALLOC)
153 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 165 && strcmp(info->secstrings + shdr->sh_name, name) == 0)
154 return i; 166 return i;
167 }
155 return 0; 168 return 0;
156} 169}
157 170
158/* Find a module section, or NULL. */ 171/* Find a module section, or NULL. */
159static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, 172static void *section_addr(const struct load_info *info, const char *name)
160 const char *secstrings, const char *name)
161{ 173{
162 /* Section 0 has sh_addr 0. */ 174 /* Section 0 has sh_addr 0. */
163 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; 175 return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
164} 176}
165 177
166/* Find a module section, or NULL. Fill in number of "objects" in section. */ 178/* Find a module section, or NULL. Fill in number of "objects" in section. */
167static void *section_objs(Elf_Ehdr *hdr, 179static void *section_objs(const struct load_info *info,
168 Elf_Shdr *sechdrs,
169 const char *secstrings,
170 const char *name, 180 const char *name,
171 size_t object_size, 181 size_t object_size,
172 unsigned int *num) 182 unsigned int *num)
173{ 183{
174 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); 184 unsigned int sec = find_sec(info, name);
175 185
176 /* Section 0 has sh_addr 0 and sh_size 0. */ 186 /* Section 0 has sh_addr 0 and sh_size 0. */
177 *num = sechdrs[sec].sh_size / object_size; 187 *num = info->sechdrs[sec].sh_size / object_size;
178 return (void *)sechdrs[sec].sh_addr; 188 return (void *)info->sechdrs[sec].sh_addr;
179} 189}
180 190
181/* Provided by the linker */ 191/* Provided by the linker */
@@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
227 unsigned int symnum, void *data), void *data) 237 unsigned int symnum, void *data), void *data)
228{ 238{
229 struct module *mod; 239 struct module *mod;
230 const struct symsearch arr[] = { 240 static const struct symsearch arr[] = {
231 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 241 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
232 NOT_GPL_ONLY, false }, 242 NOT_GPL_ONLY, false },
233 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 243 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod,
392 mod->percpu = __alloc_reserved_percpu(size, align); 402 mod->percpu = __alloc_reserved_percpu(size, align);
393 if (!mod->percpu) { 403 if (!mod->percpu) {
394 printk(KERN_WARNING 404 printk(KERN_WARNING
395 "Could not allocate %lu bytes percpu data\n", size); 405 "%s: Could not allocate %lu bytes percpu data\n",
406 mod->name, size);
396 return -ENOMEM; 407 return -ENOMEM;
397 } 408 }
398 mod->percpu_size = size; 409 mod->percpu_size = size;
@@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod)
404 free_percpu(mod->percpu); 415 free_percpu(mod->percpu);
405} 416}
406 417
407static unsigned int find_pcpusec(Elf_Ehdr *hdr, 418static unsigned int find_pcpusec(struct load_info *info)
408 Elf_Shdr *sechdrs,
409 const char *secstrings)
410{ 419{
411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); 420 return find_sec(info, ".data..percpu");
412} 421}
413 422
414static void percpu_modcopy(struct module *mod, 423static void percpu_modcopy(struct module *mod,
@@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod,
468static inline void percpu_modfree(struct module *mod) 477static inline void percpu_modfree(struct module *mod)
469{ 478{
470} 479}
471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 480static unsigned int find_pcpusec(struct load_info *info)
472 Elf_Shdr *sechdrs,
473 const char *secstrings)
474{ 481{
475 return 0; 482 return 0;
476} 483}
@@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
524EXPORT_TRACEPOINT_SYMBOL(module_get); 531EXPORT_TRACEPOINT_SYMBOL(module_get);
525 532
526/* Init the unload section of the module. */ 533/* Init the unload section of the module. */
527static void module_unload_init(struct module *mod) 534static int module_unload_init(struct module *mod)
528{ 535{
529 int cpu; 536 mod->refptr = alloc_percpu(struct module_ref);
537 if (!mod->refptr)
538 return -ENOMEM;
530 539
531 INIT_LIST_HEAD(&mod->source_list); 540 INIT_LIST_HEAD(&mod->source_list);
532 INIT_LIST_HEAD(&mod->target_list); 541 INIT_LIST_HEAD(&mod->target_list);
533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
537 542
538 /* Hold reference count during initialization. */ 543 /* Hold reference count during initialization. */
539 __this_cpu_write(mod->refptr->incs, 1); 544 __this_cpu_write(mod->refptr->incs, 1);
540 /* Backwards compatibility macros put refcount during init. */ 545 /* Backwards compatibility macros put refcount during init. */
541 mod->waiter = current; 546 mod->waiter = current;
547
548 return 0;
542} 549}
543 550
544/* Does a already use b? */ 551/* Does a already use b? */
@@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod)
618 kfree(use); 625 kfree(use);
619 } 626 }
620 mutex_unlock(&module_mutex); 627 mutex_unlock(&module_mutex);
628
629 free_percpu(mod->refptr);
621} 630}
622 631
623#ifdef CONFIG_MODULE_FORCE_UNLOAD 632#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -891,8 +900,9 @@ int ref_module(struct module *a, struct module *b)
891} 900}
892EXPORT_SYMBOL_GPL(ref_module); 901EXPORT_SYMBOL_GPL(ref_module);
893 902
894static inline void module_unload_init(struct module *mod) 903static inline int module_unload_init(struct module *mod)
895{ 904{
905 return 0;
896} 906}
897#endif /* CONFIG_MODULE_UNLOAD */ 907#endif /* CONFIG_MODULE_UNLOAD */
898 908
@@ -1051,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1051#endif /* CONFIG_MODVERSIONS */ 1061#endif /* CONFIG_MODVERSIONS */
1052 1062
1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */ 1063/* Resolve a symbol for this module. I.e. if we find one, record usage. */
1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1064static const struct kernel_symbol *resolve_symbol(struct module *mod,
1055 unsigned int versindex, 1065 const struct load_info *info,
1056 const char *name, 1066 const char *name,
1057 struct module *mod,
1058 char ownername[]) 1067 char ownername[])
1059{ 1068{
1060 struct module *owner; 1069 struct module *owner;
@@ -1068,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1068 if (!sym) 1077 if (!sym)
1069 goto unlock; 1078 goto unlock;
1070 1079
1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { 1080 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
1081 owner)) {
1072 sym = ERR_PTR(-EINVAL); 1082 sym = ERR_PTR(-EINVAL);
1073 goto getname; 1083 goto getname;
1074 } 1084 }
@@ -1087,21 +1097,20 @@ unlock:
1087 return sym; 1097 return sym;
1088} 1098}
1089 1099
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, 1100static const struct kernel_symbol *
1091 unsigned int versindex, 1101resolve_symbol_wait(struct module *mod,
1092 const char *name, 1102 const struct load_info *info,
1093 struct module *mod) 1103 const char *name)
1094{ 1104{
1095 const struct kernel_symbol *ksym; 1105 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN]; 1106 char owner[MODULE_NAME_LEN];
1097 1107
1098 if (wait_event_interruptible_timeout(module_wq, 1108 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, 1109 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1100 mod, ownername)) || 1110 || PTR_ERR(ksym) != -EBUSY,
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) { 1111 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1112 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername); 1113 mod->name, owner);
1105 } 1114 }
1106 return ksym; 1115 return ksym;
1107} 1116}
@@ -1110,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1110 * /sys/module/foo/sections stuff 1119 * /sys/module/foo/sections stuff
1111 * J. Corbet <corbet@lwn.net> 1120 * J. Corbet <corbet@lwn.net>
1112 */ 1121 */
1113#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1122#ifdef CONFIG_SYSFS
1114 1123
1124#ifdef CONFIG_KALLSYMS
1115static inline bool sect_empty(const Elf_Shdr *sect) 1125static inline bool sect_empty(const Elf_Shdr *sect)
1116{ 1126{
1117 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1127 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1148,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1148 kfree(sect_attrs); 1158 kfree(sect_attrs);
1149} 1159}
1150 1160
1151static void add_sect_attrs(struct module *mod, unsigned int nsect, 1161static void add_sect_attrs(struct module *mod, const struct load_info *info)
1152 char *secstrings, Elf_Shdr *sechdrs)
1153{ 1162{
1154 unsigned int nloaded = 0, i, size[2]; 1163 unsigned int nloaded = 0, i, size[2];
1155 struct module_sect_attrs *sect_attrs; 1164 struct module_sect_attrs *sect_attrs;
@@ -1157,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1157 struct attribute **gattr; 1166 struct attribute **gattr;
1158 1167
1159 /* Count loaded sections and allocate structures */ 1168 /* Count loaded sections and allocate structures */
1160 for (i = 0; i < nsect; i++) 1169 for (i = 0; i < info->hdr->e_shnum; i++)
1161 if (!sect_empty(&sechdrs[i])) 1170 if (!sect_empty(&info->sechdrs[i]))
1162 nloaded++; 1171 nloaded++;
1163 size[0] = ALIGN(sizeof(*sect_attrs) 1172 size[0] = ALIGN(sizeof(*sect_attrs)
1164 + nloaded * sizeof(sect_attrs->attrs[0]), 1173 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1175,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1175 sect_attrs->nsections = 0; 1184 sect_attrs->nsections = 0;
1176 sattr = &sect_attrs->attrs[0]; 1185 sattr = &sect_attrs->attrs[0];
1177 gattr = &sect_attrs->grp.attrs[0]; 1186 gattr = &sect_attrs->grp.attrs[0];
1178 for (i = 0; i < nsect; i++) { 1187 for (i = 0; i < info->hdr->e_shnum; i++) {
1179 if (sect_empty(&sechdrs[i])) 1188 Elf_Shdr *sec = &info->sechdrs[i];
1189 if (sect_empty(sec))
1180 continue; 1190 continue;
1181 sattr->address = sechdrs[i].sh_addr; 1191 sattr->address = sec->sh_addr;
1182 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1192 sattr->name = kstrdup(info->secstrings + sec->sh_name,
1183 GFP_KERNEL); 1193 GFP_KERNEL);
1184 if (sattr->name == NULL) 1194 if (sattr->name == NULL)
1185 goto out; 1195 goto out;
@@ -1247,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1247 kfree(notes_attrs); 1257 kfree(notes_attrs);
1248} 1258}
1249 1259
1250static void add_notes_attrs(struct module *mod, unsigned int nsect, 1260static void add_notes_attrs(struct module *mod, const struct load_info *info)
1251 char *secstrings, Elf_Shdr *sechdrs)
1252{ 1261{
1253 unsigned int notes, loaded, i; 1262 unsigned int notes, loaded, i;
1254 struct module_notes_attrs *notes_attrs; 1263 struct module_notes_attrs *notes_attrs;
@@ -1260,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1260 1269
1261 /* Count notes sections and allocate structures. */ 1270 /* Count notes sections and allocate structures. */
1262 notes = 0; 1271 notes = 0;
1263 for (i = 0; i < nsect; i++) 1272 for (i = 0; i < info->hdr->e_shnum; i++)
1264 if (!sect_empty(&sechdrs[i]) && 1273 if (!sect_empty(&info->sechdrs[i]) &&
1265 (sechdrs[i].sh_type == SHT_NOTE)) 1274 (info->sechdrs[i].sh_type == SHT_NOTE))
1266 ++notes; 1275 ++notes;
1267 1276
1268 if (notes == 0) 1277 if (notes == 0)
@@ -1276,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1276 1285
1277 notes_attrs->notes = notes; 1286 notes_attrs->notes = notes;
1278 nattr = &notes_attrs->attrs[0]; 1287 nattr = &notes_attrs->attrs[0];
1279 for (loaded = i = 0; i < nsect; ++i) { 1288 for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
1280 if (sect_empty(&sechdrs[i])) 1289 if (sect_empty(&info->sechdrs[i]))
1281 continue; 1290 continue;
1282 if (sechdrs[i].sh_type == SHT_NOTE) { 1291 if (info->sechdrs[i].sh_type == SHT_NOTE) {
1283 sysfs_bin_attr_init(nattr); 1292 sysfs_bin_attr_init(nattr);
1284 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1293 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1285 nattr->attr.mode = S_IRUGO; 1294 nattr->attr.mode = S_IRUGO;
1286 nattr->size = sechdrs[i].sh_size; 1295 nattr->size = info->sechdrs[i].sh_size;
1287 nattr->private = (void *) sechdrs[i].sh_addr; 1296 nattr->private = (void *) info->sechdrs[i].sh_addr;
1288 nattr->read = module_notes_read; 1297 nattr->read = module_notes_read;
1289 ++nattr; 1298 ++nattr;
1290 } 1299 }
@@ -1315,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod)
1315 1324
1316#else 1325#else
1317 1326
1318static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1327static inline void add_sect_attrs(struct module *mod,
1319 char *sectstrings, Elf_Shdr *sechdrs) 1328 const struct load_info *info)
1320{ 1329{
1321} 1330}
1322 1331
@@ -1324,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod)
1324{ 1333{
1325} 1334}
1326 1335
1327static inline void add_notes_attrs(struct module *mod, unsigned int nsect, 1336static inline void add_notes_attrs(struct module *mod,
1328 char *sectstrings, Elf_Shdr *sechdrs) 1337 const struct load_info *info)
1329{ 1338{
1330} 1339}
1331 1340
1332static inline void remove_notes_attrs(struct module *mod) 1341static inline void remove_notes_attrs(struct module *mod)
1333{ 1342{
1334} 1343}
1335#endif 1344#endif /* CONFIG_KALLSYMS */
1336 1345
1337#ifdef CONFIG_SYSFS
1338static void add_usage_links(struct module *mod) 1346static void add_usage_links(struct module *mod)
1339{ 1347{
1340#ifdef CONFIG_MODULE_UNLOAD 1348#ifdef CONFIG_MODULE_UNLOAD
@@ -1439,6 +1447,7 @@ out:
1439} 1447}
1440 1448
1441static int mod_sysfs_setup(struct module *mod, 1449static int mod_sysfs_setup(struct module *mod,
1450 const struct load_info *info,
1442 struct kernel_param *kparam, 1451 struct kernel_param *kparam,
1443 unsigned int num_params) 1452 unsigned int num_params)
1444{ 1453{
@@ -1463,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod,
1463 goto out_unreg_param; 1472 goto out_unreg_param;
1464 1473
1465 add_usage_links(mod); 1474 add_usage_links(mod);
1475 add_sect_attrs(mod, info);
1476 add_notes_attrs(mod, info);
1466 1477
1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1478 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1468 return 0; 1479 return 0;
@@ -1479,33 +1490,26 @@ out:
1479 1490
1480static void mod_sysfs_fini(struct module *mod) 1491static void mod_sysfs_fini(struct module *mod)
1481{ 1492{
1493 remove_notes_attrs(mod);
1494 remove_sect_attrs(mod);
1482 kobject_put(&mod->mkobj.kobj); 1495 kobject_put(&mod->mkobj.kobj);
1483} 1496}
1484 1497
1485#else /* CONFIG_SYSFS */ 1498#else /* !CONFIG_SYSFS */
1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491 1499
1492static inline int mod_sysfs_setup(struct module *mod, 1500static int mod_sysfs_setup(struct module *mod,
1501 const struct load_info *info,
1493 struct kernel_param *kparam, 1502 struct kernel_param *kparam,
1494 unsigned int num_params) 1503 unsigned int num_params)
1495{ 1504{
1496 return 0; 1505 return 0;
1497} 1506}
1498 1507
1499static inline int module_add_modinfo_attrs(struct module *mod) 1508static void mod_sysfs_fini(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{ 1509{
1506} 1510}
1507 1511
1508static void mod_sysfs_fini(struct module *mod) 1512static void module_remove_modinfo_attrs(struct module *mod)
1509{ 1513{
1510} 1514}
1511 1515
@@ -1515,7 +1519,7 @@ static void del_usage_links(struct module *mod)
1515 1519
1516#endif /* CONFIG_SYSFS */ 1520#endif /* CONFIG_SYSFS */
1517 1521
1518static void mod_kobject_remove(struct module *mod) 1522static void mod_sysfs_teardown(struct module *mod)
1519{ 1523{
1520 del_usage_links(mod); 1524 del_usage_links(mod);
1521 module_remove_modinfo_attrs(mod); 1525 module_remove_modinfo_attrs(mod);
@@ -1545,9 +1549,7 @@ static void free_module(struct module *mod)
1545 mutex_lock(&module_mutex); 1549 mutex_lock(&module_mutex);
1546 stop_machine(__unlink_module, mod, NULL); 1550 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex); 1551 mutex_unlock(&module_mutex);
1548 remove_notes_attrs(mod); 1552 mod_sysfs_teardown(mod);
1549 remove_sect_attrs(mod);
1550 mod_kobject_remove(mod);
1551 1553
1552 /* Remove dynamic debug info */ 1554 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name); 1555 ddebug_remove_module(mod->name);
@@ -1565,10 +1567,7 @@ static void free_module(struct module *mod)
1565 module_free(mod, mod->module_init); 1567 module_free(mod, mod->module_init);
1566 kfree(mod->args); 1568 kfree(mod->args);
1567 percpu_modfree(mod); 1569 percpu_modfree(mod);
1568#if defined(CONFIG_MODULE_UNLOAD) 1570
1569 if (mod->refptr)
1570 free_percpu(mod->refptr);
1571#endif
1572 /* Free lock-classes: */ 1571 /* Free lock-classes: */
1573 lockdep_free_key_range(mod->module_core, mod->core_size); 1572 lockdep_free_key_range(mod->module_core, mod->core_size);
1574 1573
@@ -1634,25 +1633,23 @@ static int verify_export_symbols(struct module *mod)
1634} 1633}
1635 1634
1636/* Change all symbols so that st_value encodes the pointer directly. */ 1635/* Change all symbols so that st_value encodes the pointer directly. */
1637static int simplify_symbols(Elf_Shdr *sechdrs, 1636static int simplify_symbols(struct module *mod, const struct load_info *info)
1638 unsigned int symindex, 1637{
1639 const char *strtab, 1638 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1640 unsigned int versindex, 1639 Elf_Sym *sym = (void *)symsec->sh_addr;
1641 unsigned int pcpuindex,
1642 struct module *mod)
1643{
1644 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1645 unsigned long secbase; 1640 unsigned long secbase;
1646 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1641 unsigned int i;
1647 int ret = 0; 1642 int ret = 0;
1648 const struct kernel_symbol *ksym; 1643 const struct kernel_symbol *ksym;
1649 1644
1650 for (i = 1; i < n; i++) { 1645 for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
1646 const char *name = info->strtab + sym[i].st_name;
1647
1651 switch (sym[i].st_shndx) { 1648 switch (sym[i].st_shndx) {
1652 case SHN_COMMON: 1649 case SHN_COMMON:
1653 /* We compiled with -fno-common. These are not 1650 /* We compiled with -fno-common. These are not
1654 supposed to happen. */ 1651 supposed to happen. */
1655 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1652 DEBUGP("Common symbol: %s\n", name);
1656 printk("%s: please compile with -fno-common\n", 1653 printk("%s: please compile with -fno-common\n",
1657 mod->name); 1654 mod->name);
1658 ret = -ENOEXEC; 1655 ret = -ENOEXEC;
@@ -1665,9 +1662,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1665 break; 1662 break;
1666 1663
1667 case SHN_UNDEF: 1664 case SHN_UNDEF:
1668 ksym = resolve_symbol_wait(sechdrs, versindex, 1665 ksym = resolve_symbol_wait(mod, info, name);
1669 strtab + sym[i].st_name,
1670 mod);
1671 /* Ok if resolved. */ 1666 /* Ok if resolved. */
1672 if (ksym && !IS_ERR(ksym)) { 1667 if (ksym && !IS_ERR(ksym)) {
1673 sym[i].st_value = ksym->value; 1668 sym[i].st_value = ksym->value;
@@ -1679,17 +1674,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1679 break; 1674 break;
1680 1675
1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1676 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1682 mod->name, strtab + sym[i].st_name, 1677 mod->name, name, PTR_ERR(ksym));
1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT; 1678 ret = PTR_ERR(ksym) ?: -ENOENT;
1685 break; 1679 break;
1686 1680
1687 default: 1681 default:
1688 /* Divert to percpu allocation if a percpu var. */ 1682 /* Divert to percpu allocation if a percpu var. */
1689 if (sym[i].st_shndx == pcpuindex) 1683 if (sym[i].st_shndx == info->index.pcpu)
1690 secbase = (unsigned long)mod_percpu(mod); 1684 secbase = (unsigned long)mod_percpu(mod);
1691 else 1685 else
1692 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1686 secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
1693 sym[i].st_value += secbase; 1687 sym[i].st_value += secbase;
1694 break; 1688 break;
1695 } 1689 }
@@ -1698,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1698 return ret; 1692 return ret;
1699} 1693}
1700 1694
1695static int apply_relocations(struct module *mod, const struct load_info *info)
1696{
1697 unsigned int i;
1698 int err = 0;
1699
1700 /* Now do relocations. */
1701 for (i = 1; i < info->hdr->e_shnum; i++) {
1702 unsigned int infosec = info->sechdrs[i].sh_info;
1703
1704 /* Not a valid relocation section? */
1705 if (infosec >= info->hdr->e_shnum)
1706 continue;
1707
1708 /* Don't bother with non-allocated sections */
1709 if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
1710 continue;
1711
1712 if (info->sechdrs[i].sh_type == SHT_REL)
1713 err = apply_relocate(info->sechdrs, info->strtab,
1714 info->index.sym, i, mod);
1715 else if (info->sechdrs[i].sh_type == SHT_RELA)
1716 err = apply_relocate_add(info->sechdrs, info->strtab,
1717 info->index.sym, i, mod);
1718 if (err < 0)
1719 break;
1720 }
1721 return err;
1722}
1723
1701/* Additional bytes needed by arch in front of individual sections */ 1724/* Additional bytes needed by arch in front of individual sections */
1702unsigned int __weak arch_mod_section_prepend(struct module *mod, 1725unsigned int __weak arch_mod_section_prepend(struct module *mod,
1703 unsigned int section) 1726 unsigned int section)
@@ -1722,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size,
1722 might -- code, read-only data, read-write data, small data. Tally 1745 might -- code, read-only data, read-write data, small data. Tally
1723 sizes, and place the offsets into sh_entsize fields: high bit means it 1746 sizes, and place the offsets into sh_entsize fields: high bit means it
1724 belongs in init. */ 1747 belongs in init. */
1725static void layout_sections(struct module *mod, 1748static void layout_sections(struct module *mod, struct load_info *info)
1726 const Elf_Ehdr *hdr,
1727 Elf_Shdr *sechdrs,
1728 const char *secstrings)
1729{ 1749{
1730 static unsigned long const masks[][2] = { 1750 static unsigned long const masks[][2] = {
1731 /* NOTE: all executable code must be the first section 1751 /* NOTE: all executable code must be the first section
@@ -1738,21 +1758,22 @@ static void layout_sections(struct module *mod,
1738 }; 1758 };
1739 unsigned int m, i; 1759 unsigned int m, i;
1740 1760
1741 for (i = 0; i < hdr->e_shnum; i++) 1761 for (i = 0; i < info->hdr->e_shnum; i++)
1742 sechdrs[i].sh_entsize = ~0UL; 1762 info->sechdrs[i].sh_entsize = ~0UL;
1743 1763
1744 DEBUGP("Core section allocation order:\n"); 1764 DEBUGP("Core section allocation order:\n");
1745 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1765 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1746 for (i = 0; i < hdr->e_shnum; ++i) { 1766 for (i = 0; i < info->hdr->e_shnum; ++i) {
1747 Elf_Shdr *s = &sechdrs[i]; 1767 Elf_Shdr *s = &info->sechdrs[i];
1768 const char *sname = info->secstrings + s->sh_name;
1748 1769
1749 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1770 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1750 || (s->sh_flags & masks[m][1]) 1771 || (s->sh_flags & masks[m][1])
1751 || s->sh_entsize != ~0UL 1772 || s->sh_entsize != ~0UL
1752 || strstarts(secstrings + s->sh_name, ".init")) 1773 || strstarts(sname, ".init"))
1753 continue; 1774 continue;
1754 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1775 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1755 DEBUGP("\t%s\n", secstrings + s->sh_name); 1776 DEBUGP("\t%s\n", sname);
1756 } 1777 }
1757 if (m == 0) 1778 if (m == 0)
1758 mod->core_text_size = mod->core_size; 1779 mod->core_text_size = mod->core_size;
@@ -1760,17 +1781,18 @@ static void layout_sections(struct module *mod,
1760 1781
1761 DEBUGP("Init section allocation order:\n"); 1782 DEBUGP("Init section allocation order:\n");
1762 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1783 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1763 for (i = 0; i < hdr->e_shnum; ++i) { 1784 for (i = 0; i < info->hdr->e_shnum; ++i) {
1764 Elf_Shdr *s = &sechdrs[i]; 1785 Elf_Shdr *s = &info->sechdrs[i];
1786 const char *sname = info->secstrings + s->sh_name;
1765 1787
1766 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1788 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1767 || (s->sh_flags & masks[m][1]) 1789 || (s->sh_flags & masks[m][1])
1768 || s->sh_entsize != ~0UL 1790 || s->sh_entsize != ~0UL
1769 || !strstarts(secstrings + s->sh_name, ".init")) 1791 || !strstarts(sname, ".init"))
1770 continue; 1792 continue;
1771 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1793 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1772 | INIT_OFFSET_MASK); 1794 | INIT_OFFSET_MASK);
1773 DEBUGP("\t%s\n", secstrings + s->sh_name); 1795 DEBUGP("\t%s\n", sname);
1774 } 1796 }
1775 if (m == 0) 1797 if (m == 0)
1776 mod->init_text_size = mod->init_size; 1798 mod->init_text_size = mod->init_size;
@@ -1809,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize)
1809 return string; 1831 return string;
1810} 1832}
1811 1833
1812static char *get_modinfo(Elf_Shdr *sechdrs, 1834static char *get_modinfo(struct load_info *info, const char *tag)
1813 unsigned int info,
1814 const char *tag)
1815{ 1835{
1816 char *p; 1836 char *p;
1817 unsigned int taglen = strlen(tag); 1837 unsigned int taglen = strlen(tag);
1818 unsigned long size = sechdrs[info].sh_size; 1838 Elf_Shdr *infosec = &info->sechdrs[info->index.info];
1839 unsigned long size = infosec->sh_size;
1819 1840
1820 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1841 for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
1821 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1842 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1822 return p + taglen + 1; 1843 return p + taglen + 1;
1823 } 1844 }
1824 return NULL; 1845 return NULL;
1825} 1846}
1826 1847
1827static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1848static void setup_modinfo(struct module *mod, struct load_info *info)
1828 unsigned int infoindex)
1829{ 1849{
1830 struct module_attribute *attr; 1850 struct module_attribute *attr;
1831 int i; 1851 int i;
1832 1852
1833 for (i = 0; (attr = modinfo_attrs[i]); i++) { 1853 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1834 if (attr->setup) 1854 if (attr->setup)
1835 attr->setup(mod, 1855 attr->setup(mod, get_modinfo(info, attr->attr.name));
1836 get_modinfo(sechdrs,
1837 infoindex,
1838 attr->attr.name));
1839 } 1856 }
1840} 1857}
1841 1858
@@ -1876,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value,
1876} 1893}
1877 1894
1878/* As per nm */ 1895/* As per nm */
1879static char elf_type(const Elf_Sym *sym, 1896static char elf_type(const Elf_Sym *sym, const struct load_info *info)
1880 Elf_Shdr *sechdrs,
1881 const char *secstrings,
1882 struct module *mod)
1883{ 1897{
1898 const Elf_Shdr *sechdrs = info->sechdrs;
1899
1884 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1900 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1885 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1901 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1886 return 'v'; 1902 return 'v';
@@ -1910,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym,
1910 else 1926 else
1911 return 'b'; 1927 return 'b';
1912 } 1928 }
1913 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) 1929 if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
1930 ".debug")) {
1914 return 'n'; 1931 return 'n';
1932 }
1915 return '?'; 1933 return '?';
1916} 1934}
1917 1935
@@ -1936,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1936 return true; 1954 return true;
1937} 1955}
1938 1956
1939static unsigned long layout_symtab(struct module *mod, 1957static void layout_symtab(struct module *mod, struct load_info *info)
1940 Elf_Shdr *sechdrs,
1941 unsigned int symindex,
1942 unsigned int strindex,
1943 const Elf_Ehdr *hdr,
1944 const char *secstrings,
1945 unsigned long *pstroffs,
1946 unsigned long *strmap)
1947{ 1958{
1948 unsigned long symoffs; 1959 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
1949 Elf_Shdr *symsect = sechdrs + symindex; 1960 Elf_Shdr *strsect = info->sechdrs + info->index.str;
1950 Elf_Shdr *strsect = sechdrs + strindex;
1951 const Elf_Sym *src; 1961 const Elf_Sym *src;
1952 const char *strtab;
1953 unsigned int i, nsrc, ndst; 1962 unsigned int i, nsrc, ndst;
1954 1963
1955 /* Put symbol section at end of init part of module. */ 1964 /* Put symbol section at end of init part of module. */
1956 symsect->sh_flags |= SHF_ALLOC; 1965 symsect->sh_flags |= SHF_ALLOC;
1957 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 1966 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1958 symindex) | INIT_OFFSET_MASK; 1967 info->index.sym) | INIT_OFFSET_MASK;
1959 DEBUGP("\t%s\n", secstrings + symsect->sh_name); 1968 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
1960 1969
1961 src = (void *)hdr + symsect->sh_offset; 1970 src = (void *)info->hdr + symsect->sh_offset;
1962 nsrc = symsect->sh_size / sizeof(*src); 1971 nsrc = symsect->sh_size / sizeof(*src);
1963 strtab = (void *)hdr + strsect->sh_offset;
1964 for (ndst = i = 1; i < nsrc; ++i, ++src) 1972 for (ndst = i = 1; i < nsrc; ++i, ++src)
1965 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { 1973 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
1966 unsigned int j = src->st_name; 1974 unsigned int j = src->st_name;
1967 1975
1968 while(!__test_and_set_bit(j, strmap) && strtab[j]) 1976 while (!__test_and_set_bit(j, info->strmap)
1977 && info->strtab[j])
1969 ++j; 1978 ++j;
1970 ++ndst; 1979 ++ndst;
1971 } 1980 }
1972 1981
1973 /* Append room for core symbols at end of core part. */ 1982 /* Append room for core symbols at end of core part. */
1974 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 1983 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1975 mod->core_size = symoffs + ndst * sizeof(Elf_Sym); 1984 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
1976 1985
1977 /* Put string table section at end of init part of module. */ 1986 /* Put string table section at end of init part of module. */
1978 strsect->sh_flags |= SHF_ALLOC; 1987 strsect->sh_flags |= SHF_ALLOC;
1979 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 1988 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1980 strindex) | INIT_OFFSET_MASK; 1989 info->index.str) | INIT_OFFSET_MASK;
1981 DEBUGP("\t%s\n", secstrings + strsect->sh_name); 1990 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
1982 1991
1983 /* Append room for core symbols' strings at end of core part. */ 1992 /* Append room for core symbols' strings at end of core part. */
1984 *pstroffs = mod->core_size; 1993 info->stroffs = mod->core_size;
1985 __set_bit(0, strmap); 1994 __set_bit(0, info->strmap);
1986 mod->core_size += bitmap_weight(strmap, strsect->sh_size); 1995 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
1987
1988 return symoffs;
1989} 1996}
1990 1997
1991static void add_kallsyms(struct module *mod, 1998static void add_kallsyms(struct module *mod, const struct load_info *info)
1992 Elf_Shdr *sechdrs,
1993 unsigned int shnum,
1994 unsigned int symindex,
1995 unsigned int strindex,
1996 unsigned long symoffs,
1997 unsigned long stroffs,
1998 const char *secstrings,
1999 unsigned long *strmap)
2000{ 1999{
2001 unsigned int i, ndst; 2000 unsigned int i, ndst;
2002 const Elf_Sym *src; 2001 const Elf_Sym *src;
2003 Elf_Sym *dst; 2002 Elf_Sym *dst;
2004 char *s; 2003 char *s;
2004 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
2005 2005
2006 mod->symtab = (void *)sechdrs[symindex].sh_addr; 2006 mod->symtab = (void *)symsec->sh_addr;
2007 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 2007 mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2008 mod->strtab = (void *)sechdrs[strindex].sh_addr; 2008 /* Make sure we get permanent strtab: don't use info->strtab. */
2009 mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2009 2010
2010 /* Set types up while we still have access to sections. */ 2011 /* Set types up while we still have access to sections. */
2011 for (i = 0; i < mod->num_symtab; i++) 2012 for (i = 0; i < mod->num_symtab; i++)
2012 mod->symtab[i].st_info 2013 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2013 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
2014 2014
2015 mod->core_symtab = dst = mod->module_core + symoffs; 2015 mod->core_symtab = dst = mod->module_core + info->symoffs;
2016 src = mod->symtab; 2016 src = mod->symtab;
2017 *dst = *src; 2017 *dst = *src;
2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2019 if (!is_core_symbol(src, sechdrs, shnum)) 2019 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2020 continue; 2020 continue;
2021 dst[ndst] = *src; 2021 dst[ndst] = *src;
2022 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); 2022 dst[ndst].st_name = bitmap_weight(info->strmap,
2023 dst[ndst].st_name);
2023 ++ndst; 2024 ++ndst;
2024 } 2025 }
2025 mod->core_num_syms = ndst; 2026 mod->core_num_syms = ndst;
2026 2027
2027 mod->core_strtab = s = mod->module_core + stroffs; 2028 mod->core_strtab = s = mod->module_core + info->stroffs;
2028 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) 2029 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2029 if (test_bit(i, strmap)) 2030 if (test_bit(i, info->strmap))
2030 *++s = mod->strtab[i]; 2031 *++s = mod->strtab[i];
2031} 2032}
2032#else 2033#else
2033static inline unsigned long layout_symtab(struct module *mod, 2034static inline void layout_symtab(struct module *mod, struct load_info *info)
2034 Elf_Shdr *sechdrs,
2035 unsigned int symindex,
2036 unsigned int strindex,
2037 const Elf_Ehdr *hdr,
2038 const char *secstrings,
2039 unsigned long *pstroffs,
2040 unsigned long *strmap)
2041{ 2035{
2042 return 0;
2043} 2036}
2044 2037
2045static inline void add_kallsyms(struct module *mod, 2038static void add_kallsyms(struct module *mod, struct load_info *info)
2046 Elf_Shdr *sechdrs,
2047 unsigned int shnum,
2048 unsigned int symindex,
2049 unsigned int strindex,
2050 unsigned long symoffs,
2051 unsigned long stroffs,
2052 const char *secstrings,
2053 const unsigned long *strmap)
2054{ 2039{
2055} 2040}
2056#endif /* CONFIG_KALLSYMS */ 2041#endif /* CONFIG_KALLSYMS */
2057 2042
2058static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) 2043static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2059{ 2044{
2045 if (!debug)
2046 return;
2060#ifdef CONFIG_DYNAMIC_DEBUG 2047#ifdef CONFIG_DYNAMIC_DEBUG
2061 if (ddebug_add_module(debug, num, debug->modname)) 2048 if (ddebug_add_module(debug, num, debug->modname))
2062 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2049 printk(KERN_ERR "dynamic debug error adding module: %s\n",
@@ -2087,65 +2074,47 @@ static void *module_alloc_update_bounds(unsigned long size)
2087} 2074}
2088 2075
2089#ifdef CONFIG_DEBUG_KMEMLEAK 2076#ifdef CONFIG_DEBUG_KMEMLEAK
2090static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2077static void kmemleak_load_module(const struct module *mod,
2091 Elf_Shdr *sechdrs, char *secstrings) 2078 const struct load_info *info)
2092{ 2079{
2093 unsigned int i; 2080 unsigned int i;
2094 2081
2095 /* only scan the sections containing data */ 2082 /* only scan the sections containing data */
2096 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2083 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2097 2084
2098 for (i = 1; i < hdr->e_shnum; i++) { 2085 for (i = 1; i < info->hdr->e_shnum; i++) {
2099 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2086 const char *name = info->secstrings + info->sechdrs[i].sh_name;
2087 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
2100 continue; 2088 continue;
2101 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 2089 if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
2102 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2103 continue; 2090 continue;
2104 2091
2105 kmemleak_scan_area((void *)sechdrs[i].sh_addr, 2092 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
2106 sechdrs[i].sh_size, GFP_KERNEL); 2093 info->sechdrs[i].sh_size, GFP_KERNEL);
2107 } 2094 }
2108} 2095}
2109#else 2096#else
2110static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2097static inline void kmemleak_load_module(const struct module *mod,
2111 Elf_Shdr *sechdrs, char *secstrings) 2098 const struct load_info *info)
2112{ 2099{
2113} 2100}
2114#endif 2101#endif
2115 2102
2116/* Allocate and load the module: note that size of section 0 is always 2103/* Sets info->hdr and info->len. */
2117 zero, and we rely on this for optional sections. */ 2104static int copy_and_check(struct load_info *info,
2118static noinline struct module *load_module(void __user *umod, 2105 const void __user *umod, unsigned long len,
2119 unsigned long len, 2106 const char __user *uargs)
2120 const char __user *uargs)
2121{ 2107{
2108 int err;
2122 Elf_Ehdr *hdr; 2109 Elf_Ehdr *hdr;
2123 Elf_Shdr *sechdrs;
2124 char *secstrings, *args, *modmagic, *strtab = NULL;
2125 char *staging;
2126 unsigned int i;
2127 unsigned int symindex = 0;
2128 unsigned int strindex = 0;
2129 unsigned int modindex, versindex, infoindex, pcpuindex;
2130 struct module *mod;
2131 long err = 0;
2132 void *ptr = NULL; /* Stops spurious gcc warning */
2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
2137 2110
2138 mm_segment_t old_fs;
2139
2140 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2141 umod, len, uargs);
2142 if (len < sizeof(*hdr)) 2111 if (len < sizeof(*hdr))
2143 return ERR_PTR(-ENOEXEC); 2112 return -ENOEXEC;
2144 2113
2145 /* Suck in entire file: we'll want most of it. */ 2114 /* Suck in entire file: we'll want most of it. */
2146 /* vmalloc barfs on "unusual" numbers. Check here */ 2115 /* vmalloc barfs on "unusual" numbers. Check here */
2147 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 2116 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2148 return ERR_PTR(-ENOMEM); 2117 return -ENOMEM;
2149 2118
2150 if (copy_from_user(hdr, umod, len) != 0) { 2119 if (copy_from_user(hdr, umod, len) != 0) {
2151 err = -EFAULT; 2120 err = -EFAULT;
@@ -2153,135 +2122,225 @@ static noinline struct module *load_module(void __user *umod,
2153 } 2122 }
2154 2123
2155 /* Sanity checks against insmoding binaries or wrong arch, 2124 /* Sanity checks against insmoding binaries or wrong arch,
2156 weird elf version */ 2125 weird elf version */
2157 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2126 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2158 || hdr->e_type != ET_REL 2127 || hdr->e_type != ET_REL
2159 || !elf_check_arch(hdr) 2128 || !elf_check_arch(hdr)
2160 || hdr->e_shentsize != sizeof(*sechdrs)) { 2129 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2161 err = -ENOEXEC; 2130 err = -ENOEXEC;
2162 goto free_hdr; 2131 goto free_hdr;
2163 } 2132 }
2164 2133
2165 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 2134 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2166 goto truncated; 2135 err = -ENOEXEC;
2136 goto free_hdr;
2137 }
2167 2138
2168 /* Convenience variables */ 2139 info->hdr = hdr;
2169 sechdrs = (void *)hdr + hdr->e_shoff; 2140 info->len = len;
2170 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 2141 return 0;
2171 sechdrs[0].sh_addr = 0;
2172 2142
2173 for (i = 1; i < hdr->e_shnum; i++) { 2143free_hdr:
2174 if (sechdrs[i].sh_type != SHT_NOBITS 2144 vfree(hdr);
2175 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 2145 return err;
2176 goto truncated; 2146}
2147
2148static void free_copy(struct load_info *info)
2149{
2150 vfree(info->hdr);
2151}
2152
2153static int rewrite_section_headers(struct load_info *info)
2154{
2155 unsigned int i;
2156
2157 /* This should always be true, but let's be sure. */
2158 info->sechdrs[0].sh_addr = 0;
2159
2160 for (i = 1; i < info->hdr->e_shnum; i++) {
2161 Elf_Shdr *shdr = &info->sechdrs[i];
2162 if (shdr->sh_type != SHT_NOBITS
2163 && info->len < shdr->sh_offset + shdr->sh_size) {
2164 printk(KERN_ERR "Module len %lu truncated\n",
2165 info->len);
2166 return -ENOEXEC;
2167 }
2177 2168
2178 /* Mark all sections sh_addr with their address in the 2169 /* Mark all sections sh_addr with their address in the
2179 temporary image. */ 2170 temporary image. */
2180 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 2171 shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
2181 2172
2182 /* Internal symbols and strings. */
2183 if (sechdrs[i].sh_type == SHT_SYMTAB) {
2184 symindex = i;
2185 strindex = sechdrs[i].sh_link;
2186 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
2187 }
2188#ifndef CONFIG_MODULE_UNLOAD 2173#ifndef CONFIG_MODULE_UNLOAD
2189 /* Don't load .exit sections */ 2174 /* Don't load .exit sections */
2190 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 2175 if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
2191 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 2176 shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
2192#endif 2177#endif
2193 } 2178 }
2194 2179
2195 modindex = find_sec(hdr, sechdrs, secstrings, 2180 /* Track but don't keep modinfo and version sections. */
2196 ".gnu.linkonce.this_module"); 2181 info->index.vers = find_sec(info, "__versions");
2197 if (!modindex) { 2182 info->index.info = find_sec(info, ".modinfo");
2183 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2184 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2185 return 0;
2186}
2187
2188/*
2189 * Set up our basic convenience variables (pointers to section headers,
2190 * search for module section index etc), and do some basic section
2191 * verification.
2192 *
2193 * Return the temporary module pointer (we'll replace it with the final
2194 * one when we move the module sections around).
2195 */
2196static struct module *setup_load_info(struct load_info *info)
2197{
2198 unsigned int i;
2199 int err;
2200 struct module *mod;
2201
2202 /* Set up the convenience variables */
2203 info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
2204 info->secstrings = (void *)info->hdr
2205 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2206
2207 err = rewrite_section_headers(info);
2208 if (err)
2209 return ERR_PTR(err);
2210
2211 /* Find internal symbols and strings. */
2212 for (i = 1; i < info->hdr->e_shnum; i++) {
2213 if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
2214 info->index.sym = i;
2215 info->index.str = info->sechdrs[i].sh_link;
2216 info->strtab = (char *)info->hdr
2217 + info->sechdrs[info->index.str].sh_offset;
2218 break;
2219 }
2220 }
2221
2222 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2223 if (!info->index.mod) {
2198 printk(KERN_WARNING "No module found in object\n"); 2224 printk(KERN_WARNING "No module found in object\n");
2199 err = -ENOEXEC; 2225 return ERR_PTR(-ENOEXEC);
2200 goto free_hdr;
2201 } 2226 }
2202 /* This is temporary: point mod into copy of data. */ 2227 /* This is temporary: point mod into copy of data. */
2203 mod = (void *)sechdrs[modindex].sh_addr; 2228 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2204 2229
2205 if (symindex == 0) { 2230 if (info->index.sym == 0) {
2206 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2231 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
2207 mod->name); 2232 mod->name);
2208 err = -ENOEXEC; 2233 return ERR_PTR(-ENOEXEC);
2209 goto free_hdr;
2210 } 2234 }
2211 2235
2212 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 2236 info->index.pcpu = find_pcpusec(info);
2213 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
2214 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
2215
2216 /* Don't keep modinfo and version sections. */
2217 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2218 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2219 2237
2220 /* Check module struct version now, before we try to use module. */ 2238 /* Check module struct version now, before we try to use module. */
2221 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2239 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
2222 err = -ENOEXEC; 2240 return ERR_PTR(-ENOEXEC);
2223 goto free_hdr; 2241
2224 } 2242 return mod;
2243}
2244
2245static int check_modinfo(struct module *mod, struct load_info *info)
2246{
2247 const char *modmagic = get_modinfo(info, "vermagic");
2248 int err;
2225 2249
2226 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2227 /* This is allowed: modprobe --force will invalidate it. */ 2250 /* This is allowed: modprobe --force will invalidate it. */
2228 if (!modmagic) { 2251 if (!modmagic) {
2229 err = try_to_force_load(mod, "bad vermagic"); 2252 err = try_to_force_load(mod, "bad vermagic");
2230 if (err) 2253 if (err)
2231 goto free_hdr; 2254 return err;
2232 } else if (!same_magic(modmagic, vermagic, versindex)) { 2255 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2233 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2256 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
2234 mod->name, modmagic, vermagic); 2257 mod->name, modmagic, vermagic);
2235 err = -ENOEXEC; 2258 return -ENOEXEC;
2236 goto free_hdr;
2237 } 2259 }
2238 2260
2239 staging = get_modinfo(sechdrs, infoindex, "staging"); 2261 if (get_modinfo(info, "staging")) {
2240 if (staging) {
2241 add_taint_module(mod, TAINT_CRAP); 2262 add_taint_module(mod, TAINT_CRAP);
2242 printk(KERN_WARNING "%s: module is from the staging directory," 2263 printk(KERN_WARNING "%s: module is from the staging directory,"
2243 " the quality is unknown, you have been warned.\n", 2264 " the quality is unknown, you have been warned.\n",
2244 mod->name); 2265 mod->name);
2245 } 2266 }
2246 2267
2247 /* Now copy in args */ 2268 /* Set up license info based on the info section */
2248 args = strndup_user(uargs, ~0UL >> 1); 2269 set_license(mod, get_modinfo(info, "license"));
2249 if (IS_ERR(args)) {
2250 err = PTR_ERR(args);
2251 goto free_hdr;
2252 }
2253 2270
2254 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) 2271 return 0;
2255 * sizeof(long), GFP_KERNEL); 2272}
2256 if (!strmap) {
2257 err = -ENOMEM;
2258 goto free_mod;
2259 }
2260 2273
2261 mod->state = MODULE_STATE_COMING; 2274static void find_module_sections(struct module *mod, struct load_info *info)
2275{
2276 mod->kp = section_objs(info, "__param",
2277 sizeof(*mod->kp), &mod->num_kp);
2278 mod->syms = section_objs(info, "__ksymtab",
2279 sizeof(*mod->syms), &mod->num_syms);
2280 mod->crcs = section_addr(info, "__kcrctab");
2281 mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
2282 sizeof(*mod->gpl_syms),
2283 &mod->num_gpl_syms);
2284 mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
2285 mod->gpl_future_syms = section_objs(info,
2286 "__ksymtab_gpl_future",
2287 sizeof(*mod->gpl_future_syms),
2288 &mod->num_gpl_future_syms);
2289 mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
2262 2290
2263 /* Allow arches to frob section contents and sizes. */ 2291#ifdef CONFIG_UNUSED_SYMBOLS
2264 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 2292 mod->unused_syms = section_objs(info, "__ksymtab_unused",
2265 if (err < 0) 2293 sizeof(*mod->unused_syms),
2266 goto free_mod; 2294 &mod->num_unused_syms);
2295 mod->unused_crcs = section_addr(info, "__kcrctab_unused");
2296 mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
2297 sizeof(*mod->unused_gpl_syms),
2298 &mod->num_unused_gpl_syms);
2299 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2300#endif
2301#ifdef CONFIG_CONSTRUCTORS
2302 mod->ctors = section_objs(info, ".ctors",
2303 sizeof(*mod->ctors), &mod->num_ctors);
2304#endif
2267 2305
2268 if (pcpuindex) { 2306#ifdef CONFIG_TRACEPOINTS
2269 /* We have a special allocation for this section. */ 2307 mod->tracepoints = section_objs(info, "__tracepoints",
2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, 2308 sizeof(*mod->tracepoints),
2271 sechdrs[pcpuindex].sh_addralign); 2309 &mod->num_tracepoints);
2272 if (err) 2310#endif
2273 goto free_mod; 2311#ifdef CONFIG_EVENT_TRACING
2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2312 mod->trace_events = section_objs(info, "_ftrace_events",
2275 } 2313 sizeof(*mod->trace_events),
2276 /* Keep this around for failure path. */ 2314 &mod->num_trace_events);
2277 percpu = mod_percpu(mod); 2315 /*
2316 * This section contains pointers to allocated objects in the trace
2317 * code and not scanning it leads to false positives.
2318 */
2319 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2320 mod->num_trace_events, GFP_KERNEL);
2321#endif
2322#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2323 /* sechdrs[0].sh_size is always zero */
2324 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
2325 sizeof(*mod->ftrace_callsites),
2326 &mod->num_ftrace_callsites);
2327#endif
2278 2328
2279 /* Determine total sizes, and put offsets in sh_entsize. For now 2329 mod->extable = section_objs(info, "__ex_table",
2280 this is done generically; there doesn't appear to be any 2330 sizeof(*mod->extable), &mod->num_exentries);
2281 special cases for the architectures. */ 2331
2282 layout_sections(mod, hdr, sechdrs, secstrings); 2332 if (section_addr(info, "__obsparm"))
2283 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, 2333 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2284 secstrings, &stroffs, strmap); 2334 mod->name);
2335
2336 info->debug = section_objs(info, "__verbose",
2337 sizeof(*info->debug), &info->num_debug);
2338}
2339
2340static int move_module(struct module *mod, struct load_info *info)
2341{
2342 int i;
2343 void *ptr;
2285 2344
2286 /* Do the allocs. */ 2345 /* Do the allocs. */
2287 ptr = module_alloc_update_bounds(mod->core_size); 2346 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2291,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod,
2291 * leak. 2350 * leak.
2292 */ 2351 */
2293 kmemleak_not_leak(ptr); 2352 kmemleak_not_leak(ptr);
2294 if (!ptr) { 2353 if (!ptr)
2295 err = -ENOMEM; 2354 return -ENOMEM;
2296 goto free_percpu; 2355
2297 }
2298 memset(ptr, 0, mod->core_size); 2356 memset(ptr, 0, mod->core_size);
2299 mod->module_core = ptr; 2357 mod->module_core = ptr;
2300 2358
@@ -2307,50 +2365,40 @@ static noinline struct module *load_module(void __user *umod,
2307 */ 2365 */
2308 kmemleak_ignore(ptr); 2366 kmemleak_ignore(ptr);
2309 if (!ptr && mod->init_size) { 2367 if (!ptr && mod->init_size) {
2310 err = -ENOMEM; 2368 module_free(mod, mod->module_core);
2311 goto free_core; 2369 return -ENOMEM;
2312 } 2370 }
2313 memset(ptr, 0, mod->init_size); 2371 memset(ptr, 0, mod->init_size);
2314 mod->module_init = ptr; 2372 mod->module_init = ptr;
2315 2373
2316 /* Transfer each section which specifies SHF_ALLOC */ 2374 /* Transfer each section which specifies SHF_ALLOC */
2317 DEBUGP("final section addresses:\n"); 2375 DEBUGP("final section addresses:\n");
2318 for (i = 0; i < hdr->e_shnum; i++) { 2376 for (i = 0; i < info->hdr->e_shnum; i++) {
2319 void *dest; 2377 void *dest;
2378 Elf_Shdr *shdr = &info->sechdrs[i];
2320 2379
2321 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2380 if (!(shdr->sh_flags & SHF_ALLOC))
2322 continue; 2381 continue;
2323 2382
2324 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 2383 if (shdr->sh_entsize & INIT_OFFSET_MASK)
2325 dest = mod->module_init 2384 dest = mod->module_init
2326 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 2385 + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
2327 else 2386 else
2328 dest = mod->module_core + sechdrs[i].sh_entsize; 2387 dest = mod->module_core + shdr->sh_entsize;
2329 2388
2330 if (sechdrs[i].sh_type != SHT_NOBITS) 2389 if (shdr->sh_type != SHT_NOBITS)
2331 memcpy(dest, (void *)sechdrs[i].sh_addr, 2390 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2332 sechdrs[i].sh_size);
2333 /* Update sh_addr to point to copy in image. */ 2391 /* Update sh_addr to point to copy in image. */
2334 sechdrs[i].sh_addr = (unsigned long)dest; 2392 shdr->sh_addr = (unsigned long)dest;
2335 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 2393 DEBUGP("\t0x%lx %s\n",
2336 } 2394 shdr->sh_addr, info->secstrings + shdr->sh_name);
2337 /* Module has been moved. */
2338 mod = (void *)sechdrs[modindex].sh_addr;
2339 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2340
2341#if defined(CONFIG_MODULE_UNLOAD)
2342 mod->refptr = alloc_percpu(struct module_ref);
2343 if (!mod->refptr) {
2344 err = -ENOMEM;
2345 goto free_init;
2346 } 2395 }
2347#endif
2348 /* Now we've moved module, initialize linked lists, etc. */
2349 module_unload_init(mod);
2350 2396
2351 /* Set up license info based on the info section */ 2397 return 0;
2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2398}
2353 2399
2400static int check_module_license_and_versions(struct module *mod)
2401{
2354 /* 2402 /*
2355 * ndiswrapper is under GPL by itself, but loads proprietary modules. 2403 * ndiswrapper is under GPL by itself, but loads proprietary modules.
2356 * Don't use add_taint_module(), as it would prevent ndiswrapper from 2404 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2363,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod,
2363 if (strcmp(mod->name, "driverloader") == 0) 2411 if (strcmp(mod->name, "driverloader") == 0)
2364 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2412 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2365 2413
2366 /* Set up MODINFO_ATTR fields */
2367 setup_modinfo(mod, sechdrs, infoindex);
2368
2369 /* Fix up syms, so that st_value is a pointer to location. */
2370 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
2371 mod);
2372 if (err < 0)
2373 goto cleanup;
2374
2375 /* Now we've got everything in the final locations, we can
2376 * find optional sections. */
2377 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2378 sizeof(*mod->kp), &mod->num_kp);
2379 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2380 sizeof(*mod->syms), &mod->num_syms);
2381 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2382 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2383 sizeof(*mod->gpl_syms),
2384 &mod->num_gpl_syms);
2385 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2386 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2387 "__ksymtab_gpl_future",
2388 sizeof(*mod->gpl_future_syms),
2389 &mod->num_gpl_future_syms);
2390 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2391 "__kcrctab_gpl_future");
2392
2393#ifdef CONFIG_UNUSED_SYMBOLS
2394 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2395 "__ksymtab_unused",
2396 sizeof(*mod->unused_syms),
2397 &mod->num_unused_syms);
2398 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2399 "__kcrctab_unused");
2400 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2401 "__ksymtab_unused_gpl",
2402 sizeof(*mod->unused_gpl_syms),
2403 &mod->num_unused_gpl_syms);
2404 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2405 "__kcrctab_unused_gpl");
2406#endif
2407#ifdef CONFIG_CONSTRUCTORS
2408 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2409 sizeof(*mod->ctors), &mod->num_ctors);
2410#endif
2411
2412#ifdef CONFIG_TRACEPOINTS
2413 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2414 "__tracepoints",
2415 sizeof(*mod->tracepoints),
2416 &mod->num_tracepoints);
2417#endif
2418#ifdef CONFIG_EVENT_TRACING
2419 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2420 "_ftrace_events",
2421 sizeof(*mod->trace_events),
2422 &mod->num_trace_events);
2423 /*
2424 * This section contains pointers to allocated objects in the trace
2425 * code and not scanning it leads to false positives.
2426 */
2427 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2428 mod->num_trace_events, GFP_KERNEL);
2429#endif
2430#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2431 /* sechdrs[0].sh_size is always zero */
2432 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2433 "__mcount_loc",
2434 sizeof(*mod->ftrace_callsites),
2435 &mod->num_ftrace_callsites);
2436#endif
2437#ifdef CONFIG_MODVERSIONS 2414#ifdef CONFIG_MODVERSIONS
2438 if ((mod->num_syms && !mod->crcs) 2415 if ((mod->num_syms && !mod->crcs)
2439 || (mod->num_gpl_syms && !mod->gpl_crcs) 2416 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2443,56 +2420,16 @@ static noinline struct module *load_module(void __user *umod,
2443 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2420 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2444#endif 2421#endif
2445 ) { 2422 ) {
2446 err = try_to_force_load(mod, 2423 return try_to_force_load(mod,
2447 "no versions for exported symbols"); 2424 "no versions for exported symbols");
2448 if (err)
2449 goto cleanup;
2450 } 2425 }
2451#endif 2426#endif
2427 return 0;
2428}
2452 2429
2453 /* Now do relocations. */ 2430static void flush_module_icache(const struct module *mod)
2454 for (i = 1; i < hdr->e_shnum; i++) { 2431{
2455 const char *strtab = (char *)sechdrs[strindex].sh_addr; 2432 mm_segment_t old_fs;
2456 unsigned int info = sechdrs[i].sh_info;
2457
2458 /* Not a valid relocation section? */
2459 if (info >= hdr->e_shnum)
2460 continue;
2461
2462 /* Don't bother with non-allocated sections */
2463 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
2464 continue;
2465
2466 if (sechdrs[i].sh_type == SHT_REL)
2467 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
2468 else if (sechdrs[i].sh_type == SHT_RELA)
2469 err = apply_relocate_add(sechdrs, strtab, symindex, i,
2470 mod);
2471 if (err < 0)
2472 goto cleanup;
2473 }
2474
2475 /* Set up and sort exception table */
2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2477 sizeof(*mod->extable), &mod->num_exentries);
2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2479
2480 /* Finally, copy percpu area over. */
2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2482 sechdrs[pcpuindex].sh_size);
2483
2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2485 symoffs, stroffs, secstrings, strmap);
2486 kfree(strmap);
2487 strmap = NULL;
2488
2489 if (!mod->taints)
2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2491 sizeof(*debug), &num_debug);
2492
2493 err = module_finalize(hdr, sechdrs, mod);
2494 if (err < 0)
2495 goto cleanup;
2496 2433
2497 /* flush the icache in correct context */ 2434 /* flush the icache in correct context */
2498 old_fs = get_fs(); 2435 old_fs = get_fs();
@@ -2511,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod,
2511 (unsigned long)mod->module_core + mod->core_size); 2448 (unsigned long)mod->module_core + mod->core_size);
2512 2449
2513 set_fs(old_fs); 2450 set_fs(old_fs);
2451}
2514 2452
2515 mod->args = args; 2453static struct module *layout_and_allocate(struct load_info *info)
2516 if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) 2454{
2517 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2455 /* Module within temporary copy. */
2518 mod->name); 2456 struct module *mod;
2457 Elf_Shdr *pcpusec;
2458 int err;
2459
2460 mod = setup_load_info(info);
2461 if (IS_ERR(mod))
2462 return mod;
2463
2464 err = check_modinfo(mod, info);
2465 if (err)
2466 return ERR_PTR(err);
2467
2468 /* Allow arches to frob section contents and sizes. */
2469 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2470 info->secstrings, mod);
2471 if (err < 0)
2472 goto out;
2473
2474 pcpusec = &info->sechdrs[info->index.pcpu];
2475 if (pcpusec->sh_size) {
2476 /* We have a special allocation for this section. */
2477 err = percpu_modalloc(mod,
2478 pcpusec->sh_size, pcpusec->sh_addralign);
2479 if (err)
2480 goto out;
2481 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2482 }
2483
2484 /* Determine total sizes, and put offsets in sh_entsize. For now
2485 this is done generically; there doesn't appear to be any
2486 special cases for the architectures. */
2487 layout_sections(mod, info);
2488
2489 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2490 * sizeof(long), GFP_KERNEL);
2491 if (!info->strmap) {
2492 err = -ENOMEM;
2493 goto free_percpu;
2494 }
2495 layout_symtab(mod, info);
2496
2497 /* Allocate and move to the final place */
2498 err = move_module(mod, info);
2499 if (err)
2500 goto free_strmap;
2501
2502 /* Module has been copied to its final place now: return it. */
2503 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2504 kmemleak_load_module(mod, info);
2505 return mod;
2506
2507free_strmap:
2508 kfree(info->strmap);
2509free_percpu:
2510 percpu_modfree(mod);
2511out:
2512 return ERR_PTR(err);
2513}
2514
2515/* mod is no longer valid after this! */
2516static void module_deallocate(struct module *mod, struct load_info *info)
2517{
2518 kfree(info->strmap);
2519 percpu_modfree(mod);
2520 module_free(mod, mod->module_init);
2521 module_free(mod, mod->module_core);
2522}
2523
2524static int post_relocation(struct module *mod, const struct load_info *info)
2525{
2526 /* Sort exception table now relocations are done. */
2527 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2528
2529 /* Copy relocated percpu area over. */
2530 percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
2531 info->sechdrs[info->index.pcpu].sh_size);
2532
2533 /* Setup kallsyms-specific fields. */
2534 add_kallsyms(mod, info);
2535
2536 /* Arch-specific module finalizing. */
2537 return module_finalize(info->hdr, info->sechdrs, mod);
2538}
2539
2540/* Allocate and load the module: note that size of section 0 is always
2541 zero, and we rely on this for optional sections. */
2542static struct module *load_module(void __user *umod,
2543 unsigned long len,
2544 const char __user *uargs)
2545{
2546 struct load_info info = { NULL, };
2547 struct module *mod;
2548 long err;
2549
2550 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2551 umod, len, uargs);
2552
2553 /* Copy in the blobs from userspace, check they are vaguely sane. */
2554 err = copy_and_check(&info, umod, len, uargs);
2555 if (err)
2556 return ERR_PTR(err);
2557
2558 /* Figure out module layout, and allocate all the memory. */
2559 mod = layout_and_allocate(&info);
2560 if (IS_ERR(mod)) {
2561 err = PTR_ERR(mod);
2562 goto free_copy;
2563 }
2564
2565 /* Now module is in final location, initialize linked lists, etc. */
2566 err = module_unload_init(mod);
2567 if (err)
2568 goto free_module;
2569
2570 /* Now we've got everything in the final locations, we can
2571 * find optional sections. */
2572 find_module_sections(mod, &info);
2573
2574 err = check_module_license_and_versions(mod);
2575 if (err)
2576 goto free_unload;
2577
2578 /* Set up MODINFO_ATTR fields */
2579 setup_modinfo(mod, &info);
2580
2581 /* Fix up syms, so that st_value is a pointer to location. */
2582 err = simplify_symbols(mod, &info);
2583 if (err < 0)
2584 goto free_modinfo;
2585
2586 err = apply_relocations(mod, &info);
2587 if (err < 0)
2588 goto free_modinfo;
2589
2590 err = post_relocation(mod, &info);
2591 if (err < 0)
2592 goto free_modinfo;
2593
2594 flush_module_icache(mod);
2595
2596 /* Now copy in args */
2597 mod->args = strndup_user(uargs, ~0UL >> 1);
2598 if (IS_ERR(mod->args)) {
2599 err = PTR_ERR(mod->args);
2600 goto free_arch_cleanup;
2601 }
2602
2603 /* Mark state as coming so strong_try_module_get() ignores us. */
2604 mod->state = MODULE_STATE_COMING;
2519 2605
2520 /* Now sew it into the lists so we can get lockdep and oops 2606 /* Now sew it into the lists so we can get lockdep and oops
2521 * info during argument parsing. Noone should access us, since 2607 * info during argument parsing. Noone should access us, since
@@ -2530,8 +2616,9 @@ static noinline struct module *load_module(void __user *umod,
2530 goto unlock; 2616 goto unlock;
2531 } 2617 }
2532 2618
2533 if (debug) 2619 /* This has to be done once we're sure module name is unique. */
2534 dynamic_debug_setup(debug, num_debug); 2620 if (!mod->taints)
2621 dynamic_debug_setup(info.debug, info.num_debug);
2535 2622
2536 /* Find duplicate symbols */ 2623 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod); 2624 err = verify_export_symbols(mod);
@@ -2541,23 +2628,22 @@ static noinline struct module *load_module(void __user *umod,
2541 list_add_rcu(&mod->list, &modules); 2628 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex); 2629 mutex_unlock(&module_mutex);
2543 2630
2631 /* Module is ready to execute: parsing args may do that. */
2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2632 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2545 if (err < 0) 2633 if (err < 0)
2546 goto unlink; 2634 goto unlink;
2547 2635
2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2636 /* Link in to syfs. */
2637 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
2549 if (err < 0) 2638 if (err < 0)
2550 goto unlink; 2639 goto unlink;
2551 2640
2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2641 /* Get rid of temporary copy and strmap. */
2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2642 kfree(info.strmap);
2554 2643 free_copy(&info);
2555 /* Get rid of temporary copy */
2556 vfree(hdr);
2557
2558 trace_module_load(mod);
2559 2644
2560 /* Done! */ 2645 /* Done! */
2646 trace_module_load(mod);
2561 return mod; 2647 return mod;
2562 2648
2563 unlink: 2649 unlink:
@@ -2565,35 +2651,23 @@ static noinline struct module *load_module(void __user *umod,
2565 /* Unlink carefully: kallsyms could be walking list. */ 2651 /* Unlink carefully: kallsyms could be walking list. */
2566 list_del_rcu(&mod->list); 2652 list_del_rcu(&mod->list);
2567 ddebug: 2653 ddebug:
2568 dynamic_debug_remove(debug); 2654 if (!mod->taints)
2655 dynamic_debug_remove(info.debug);
2569 unlock: 2656 unlock:
2570 mutex_unlock(&module_mutex); 2657 mutex_unlock(&module_mutex);
2571 synchronize_sched(); 2658 synchronize_sched();
2659 kfree(mod->args);
2660 free_arch_cleanup:
2572 module_arch_cleanup(mod); 2661 module_arch_cleanup(mod);
2573 cleanup: 2662 free_modinfo:
2574 free_modinfo(mod); 2663 free_modinfo(mod);
2664 free_unload:
2575 module_unload_free(mod); 2665 module_unload_free(mod);
2576#if defined(CONFIG_MODULE_UNLOAD) 2666 free_module:
2577 free_percpu(mod->refptr); 2667 module_deallocate(mod, &info);
2578 free_init: 2668 free_copy:
2579#endif 2669 free_copy(&info);
2580 module_free(mod, mod->module_init);
2581 free_core:
2582 module_free(mod, mod->module_core);
2583 /* mod will be freed with core. Don't access it beyond this line! */
2584 free_percpu:
2585 free_percpu(percpu);
2586 free_mod:
2587 kfree(args);
2588 kfree(strmap);
2589 free_hdr:
2590 vfree(hdr);
2591 return ERR_PTR(err); 2670 return ERR_PTR(err);
2592
2593 truncated:
2594 printk(KERN_ERR "Module len %lu truncated\n", len);
2595 err = -ENOEXEC;
2596 goto free_hdr;
2597} 2671}
2598 2672
2599/* Call module constructors. */ 2673/* Call module constructors. */
diff --git a/kernel/padata.c b/kernel/padata.c
index fdd8ae609ce3..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sysfs.h>
29#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
30 31
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
32#define MAX_OBJ_NUM 1000 33#define MAX_OBJ_NUM 1000
33 34
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 36{
36 int cpu, target_cpu; 37 int cpu, target_cpu;
37 38
38 target_cpu = cpumask_first(pd->cpumask); 39 target_cpu = cpumask_first(pd->cpumask.pcpu);
39 for (cpu = 0; cpu < cpu_index; cpu++) 40 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask); 41 target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
41 42
42 return target_cpu; 43 return target_cpu;
43} 44}
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
53 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
55 */ 56 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
57 58
58 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
59} 60}
60 61
61static void padata_parallel_worker(struct work_struct *work) 62static void padata_parallel_worker(struct work_struct *parallel_work)
62{ 63{
63 struct padata_queue *queue; 64 struct padata_parallel_queue *pqueue;
64 struct parallel_data *pd; 65 struct parallel_data *pd;
65 struct padata_instance *pinst; 66 struct padata_instance *pinst;
66 LIST_HEAD(local_list); 67 LIST_HEAD(local_list);
67 68
68 local_bh_disable(); 69 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork); 70 pqueue = container_of(parallel_work,
70 pd = queue->pd; 71 struct padata_parallel_queue, work);
72 pd = pqueue->pd;
71 pinst = pd->pinst; 73 pinst = pd->pinst;
72 74
73 spin_lock(&queue->parallel.lock); 75 spin_lock(&pqueue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list); 76 list_replace_init(&pqueue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock); 77 spin_unlock(&pqueue->parallel.lock);
76 78
77 while (!list_empty(&local_list)) { 79 while (!list_empty(&local_list)) {
78 struct padata_priv *padata; 80 struct padata_priv *padata;
@@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work)
94 * @pinst: padata instance 96 * @pinst: padata instance
95 * @padata: object to be parallelized 97 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on, 98 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata. 99 * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
98 * 100 *
99 * The parallelization callback function will run with BHs off. 101 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel 102 * Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu) 106 struct padata_priv *padata, int cb_cpu)
105{ 107{
106 int target_cpu, err; 108 int target_cpu, err;
107 struct padata_queue *queue; 109 struct padata_parallel_queue *queue;
108 struct parallel_data *pd; 110 struct parallel_data *pd;
109 111
110 rcu_read_lock_bh(); 112 rcu_read_lock_bh();
111 113
112 pd = rcu_dereference(pinst->pd); 114 pd = rcu_dereference(pinst->pd);
113 115
114 err = 0; 116 err = -EINVAL;
115 if (!(pinst->flags & PADATA_INIT)) 117 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
118 goto out;
119
120 if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
116 goto out; 121 goto out;
117 122
118 err = -EBUSY; 123 err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) 127 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out; 128 goto out;
124 129
125 err = -EINVAL; 130 err = 0;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt); 131 atomic_inc(&pd->refcnt);
131 padata->pd = pd; 132 padata->pd = pd;
132 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr); 138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138 139
139 target_cpu = padata_cpu_hash(padata); 140 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
141 142
142 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list); 144 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock); 145 spin_unlock(&queue->parallel.lock);
145 146
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork); 147 queue_work_on(target_cpu, pinst->wq, &queue->work);
147 148
148out: 149out:
149 rcu_read_unlock_bh(); 150 rcu_read_unlock_bh();
@@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel);
171 */ 172 */
172static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
173{ 174{
174 int cpu, num_cpus, empty, calc_seq_nr; 175 int cpu, num_cpus;
175 int seq_nr, next_nr, overrun, next_overrun; 176 int next_nr, next_index;
176 struct padata_queue *queue, *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
177 struct padata_priv *padata; 178 struct padata_priv *padata;
178 struct padata_list *reorder; 179 struct padata_list *reorder;
179 180
180 empty = 0; 181 num_cpus = cpumask_weight(pd->cpumask.pcpu);
181 next_nr = -1;
182 next_overrun = 0;
183 next_queue = NULL;
184
185 num_cpus = cpumask_weight(pd->cpumask);
186
187 for_each_cpu(cpu, pd->cpumask) {
188 queue = per_cpu_ptr(pd->queue, cpu);
189 reorder = &queue->reorder;
190
191 /*
192 * Calculate the seq_nr of the object that should be
193 * next in this reorder queue.
194 */
195 overrun = 0;
196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
197 + queue->cpu_index;
198 182
199 if (unlikely(calc_seq_nr > pd->max_seq_nr)) { 183 /*
200 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; 184 * Calculate the percpu reorder queue and the sequence
201 overrun = 1; 185 * number of the next object.
202 } 186 */
203 187 next_nr = pd->processed;
204 if (!list_empty(&reorder->list)) { 188 next_index = next_nr % num_cpus;
205 padata = list_entry(reorder->list.next, 189 cpu = padata_index_to_cpu(pd, next_index);
206 struct padata_priv, list); 190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
207 191
208 seq_nr = padata->seq_nr; 192 if (unlikely(next_nr > pd->max_seq_nr)) {
209 BUG_ON(calc_seq_nr != seq_nr); 193 next_nr = next_nr - pd->max_seq_nr - 1;
210 } else { 194 next_index = next_nr % num_cpus;
211 seq_nr = calc_seq_nr; 195 cpu = padata_index_to_cpu(pd, next_index);
212 empty++; 196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
213 } 197 pd->processed = 0;
214
215 if (next_nr < 0 || seq_nr < next_nr
216 || (next_overrun && !overrun)) {
217 next_nr = seq_nr;
218 next_overrun = overrun;
219 next_queue = queue;
220 }
221 } 198 }
222 199
223 padata = NULL; 200 padata = NULL;
224 201
225 if (empty == num_cpus)
226 goto out;
227
228 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
229 203
230 if (!list_empty(&reorder->list)) { 204 if (!list_empty(&reorder->list)) {
231 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
232 struct padata_priv, list); 206 struct padata_priv, list);
233 207
234 if (unlikely(next_overrun)) { 208 BUG_ON(next_nr != padata->seq_nr);
235 for_each_cpu(cpu, pd->cpumask) {
236 queue = per_cpu_ptr(pd->queue, cpu);
237 atomic_set(&queue->num_obj, 0);
238 }
239 }
240 209
241 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
242 list_del_init(&padata->list); 211 list_del_init(&padata->list);
243 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
244 spin_unlock(&reorder->lock); 213 spin_unlock(&reorder->lock);
245 214
246 atomic_inc(&next_queue->num_obj); 215 pd->processed++;
247 216
248 goto out; 217 goto out;
249 } 218 }
250 219
251 queue = per_cpu_ptr(pd->queue, smp_processor_id()); 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) { 221 if (queue->cpu_index == next_queue->cpu_index) {
253 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
254 goto out; 223 goto out;
@@ -262,7 +231,7 @@ out:
262static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
263{ 232{
264 struct padata_priv *padata; 233 struct padata_priv *padata;
265 struct padata_queue *queue; 234 struct padata_serial_queue *squeue;
266 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
267 236
268 /* 237 /*
@@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
301 return; 270 return;
302 } 271 }
303 272
304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
305 274
306 spin_lock(&queue->serial.lock); 275 spin_lock(&squeue->serial.lock);
307 list_add_tail(&padata->list, &queue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
308 spin_unlock(&queue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
309 278
310 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
311 } 280 }
312 281
313 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
@@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg)
333 padata_reorder(pd); 302 padata_reorder(pd);
334} 303}
335 304
336static void padata_serial_worker(struct work_struct *work) 305static void padata_serial_worker(struct work_struct *serial_work)
337{ 306{
338 struct padata_queue *queue; 307 struct padata_serial_queue *squeue;
339 struct parallel_data *pd; 308 struct parallel_data *pd;
340 LIST_HEAD(local_list); 309 LIST_HEAD(local_list);
341 310
342 local_bh_disable(); 311 local_bh_disable();
343 queue = container_of(work, struct padata_queue, swork); 312 squeue = container_of(serial_work, struct padata_serial_queue, work);
344 pd = queue->pd; 313 pd = squeue->pd;
345 314
346 spin_lock(&queue->serial.lock); 315 spin_lock(&squeue->serial.lock);
347 list_replace_init(&queue->serial.list, &local_list); 316 list_replace_init(&squeue->serial.list, &local_list);
348 spin_unlock(&queue->serial.lock); 317 spin_unlock(&squeue->serial.lock);
349 318
350 while (!list_empty(&local_list)) { 319 while (!list_empty(&local_list)) {
351 struct padata_priv *padata; 320 struct padata_priv *padata;
@@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
372void padata_do_serial(struct padata_priv *padata) 341void padata_do_serial(struct padata_priv *padata)
373{ 342{
374 int cpu; 343 int cpu;
375 struct padata_queue *queue; 344 struct padata_parallel_queue *pqueue;
376 struct parallel_data *pd; 345 struct parallel_data *pd;
377 346
378 pd = padata->pd; 347 pd = padata->pd;
379 348
380 cpu = get_cpu(); 349 cpu = get_cpu();
381 queue = per_cpu_ptr(pd->queue, cpu); 350 pqueue = per_cpu_ptr(pd->pqueue, cpu);
382 351
383 spin_lock(&queue->reorder.lock); 352 spin_lock(&pqueue->reorder.lock);
384 atomic_inc(&pd->reorder_objects); 353 atomic_inc(&pd->reorder_objects);
385 list_add_tail(&padata->list, &queue->reorder.list); 354 list_add_tail(&padata->list, &pqueue->reorder.list);
386 spin_unlock(&queue->reorder.lock); 355 spin_unlock(&pqueue->reorder.lock);
387 356
388 put_cpu(); 357 put_cpu();
389 358
@@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata)
391} 360}
392EXPORT_SYMBOL(padata_do_serial); 361EXPORT_SYMBOL(padata_do_serial);
393 362
394/* Allocate and initialize the internal cpumask dependend resources. */ 363static int padata_setup_cpumasks(struct parallel_data *pd,
395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 364 const struct cpumask *pcpumask,
396 const struct cpumask *cpumask) 365 const struct cpumask *cbcpumask)
397{ 366{
398 int cpu, cpu_index, num_cpus; 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
399 struct padata_queue *queue; 368 return -ENOMEM;
400 struct parallel_data *pd;
401
402 cpu_index = 0;
403 369
404 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
405 if (!pd) 371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
406 goto err; 372 free_cpumask_var(pd->cpumask.cbcpu);
373 return -ENOMEM;
374 }
407 375
408 pd->queue = alloc_percpu(struct padata_queue); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
409 if (!pd->queue) 377 return 0;
410 goto err_free_pd; 378}
411 379
412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 380static void __padata_list_init(struct padata_list *pd_list)
413 goto err_free_queue; 381{
382 INIT_LIST_HEAD(&pd_list->list);
383 spin_lock_init(&pd_list->lock);
384}
414 385
415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask); 386/* Initialize all percpu queues used by serial workers */
387static void padata_init_squeues(struct parallel_data *pd)
388{
389 int cpu;
390 struct padata_serial_queue *squeue;
416 391
417 for_each_cpu(cpu, pd->cpumask) { 392 for_each_cpu(cpu, pd->cpumask.cbcpu) {
418 queue = per_cpu_ptr(pd->queue, cpu); 393 squeue = per_cpu_ptr(pd->squeue, cpu);
394 squeue->pd = pd;
395 __padata_list_init(&squeue->serial);
396 INIT_WORK(&squeue->work, padata_serial_worker);
397 }
398}
419 399
420 queue->pd = pd; 400/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd)
402{
403 int cpu_index, num_cpus, cpu;
404 struct padata_parallel_queue *pqueue;
421 405
422 queue->cpu_index = cpu_index; 406 cpu_index = 0;
407 for_each_cpu(cpu, pd->cpumask.pcpu) {
408 pqueue = per_cpu_ptr(pd->pqueue, cpu);
409 pqueue->pd = pd;
410 pqueue->cpu_index = cpu_index;
423 cpu_index++; 411 cpu_index++;
424 412
425 INIT_LIST_HEAD(&queue->reorder.list); 413 __padata_list_init(&pqueue->reorder);
426 INIT_LIST_HEAD(&queue->parallel.list); 414 __padata_list_init(&pqueue->parallel);
427 INIT_LIST_HEAD(&queue->serial.list); 415 INIT_WORK(&pqueue->work, padata_parallel_worker);
428 spin_lock_init(&queue->reorder.lock); 416 atomic_set(&pqueue->num_obj, 0);
429 spin_lock_init(&queue->parallel.lock);
430 spin_lock_init(&queue->serial.lock);
431
432 INIT_WORK(&queue->pwork, padata_parallel_worker);
433 INIT_WORK(&queue->swork, padata_serial_worker);
434 atomic_set(&queue->num_obj, 0);
435 } 417 }
436 418
437 num_cpus = cpumask_weight(pd->cpumask); 419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421}
422
423/* Allocate and initialize the internal cpumask dependend resources. */
424static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
425 const struct cpumask *pcpumask,
426 const struct cpumask *cbcpumask)
427{
428 struct parallel_data *pd;
439 429
430 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
431 if (!pd)
432 goto err;
433
434 pd->pqueue = alloc_percpu(struct padata_parallel_queue);
435 if (!pd->pqueue)
436 goto err_free_pd;
437
438 pd->squeue = alloc_percpu(struct padata_serial_queue);
439 if (!pd->squeue)
440 goto err_free_pqueue;
441 if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
442 goto err_free_squeue;
443
444 padata_init_pqueues(pd);
445 padata_init_squeues(pd);
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
441 atomic_set(&pd->seq_nr, -1); 447 atomic_set(&pd->seq_nr, -1);
442 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
@@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
446 452
447 return pd; 453 return pd;
448 454
449err_free_queue: 455err_free_squeue:
450 free_percpu(pd->queue); 456 free_percpu(pd->squeue);
457err_free_pqueue:
458 free_percpu(pd->pqueue);
451err_free_pd: 459err_free_pd:
452 kfree(pd); 460 kfree(pd);
453err: 461err:
@@ -456,8 +464,10 @@ err:
456 464
457static void padata_free_pd(struct parallel_data *pd) 465static void padata_free_pd(struct parallel_data *pd)
458{ 466{
459 free_cpumask_var(pd->cpumask); 467 free_cpumask_var(pd->cpumask.pcpu);
460 free_percpu(pd->queue); 468 free_cpumask_var(pd->cpumask.cbcpu);
469 free_percpu(pd->pqueue);
470 free_percpu(pd->squeue);
461 kfree(pd); 471 kfree(pd);
462} 472}
463 473
@@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd)
465static void padata_flush_queues(struct parallel_data *pd) 475static void padata_flush_queues(struct parallel_data *pd)
466{ 476{
467 int cpu; 477 int cpu;
468 struct padata_queue *queue; 478 struct padata_parallel_queue *pqueue;
479 struct padata_serial_queue *squeue;
469 480
470 for_each_cpu(cpu, pd->cpumask) { 481 for_each_cpu(cpu, pd->cpumask.pcpu) {
471 queue = per_cpu_ptr(pd->queue, cpu); 482 pqueue = per_cpu_ptr(pd->pqueue, cpu);
472 flush_work(&queue->pwork); 483 flush_work(&pqueue->work);
473 } 484 }
474 485
475 del_timer_sync(&pd->timer); 486 del_timer_sync(&pd->timer);
@@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd)
477 if (atomic_read(&pd->reorder_objects)) 488 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd); 489 padata_reorder(pd);
479 490
480 for_each_cpu(cpu, pd->cpumask) { 491 for_each_cpu(cpu, pd->cpumask.cbcpu) {
481 queue = per_cpu_ptr(pd->queue, cpu); 492 squeue = per_cpu_ptr(pd->squeue, cpu);
482 flush_work(&queue->swork); 493 flush_work(&squeue->work);
483 } 494 }
484 495
485 BUG_ON(atomic_read(&pd->refcnt) != 0); 496 BUG_ON(atomic_read(&pd->refcnt) != 0);
486} 497}
487 498
499static void __padata_start(struct padata_instance *pinst)
500{
501 pinst->flags |= PADATA_INIT;
502}
503
504static void __padata_stop(struct padata_instance *pinst)
505{
506 if (!(pinst->flags & PADATA_INIT))
507 return;
508
509 pinst->flags &= ~PADATA_INIT;
510
511 synchronize_rcu();
512
513 get_online_cpus();
514 padata_flush_queues(pinst->pd);
515 put_online_cpus();
516}
517
488/* Replace the internal control stucture with a new one. */ 518/* Replace the internal control stucture with a new one. */
489static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
490 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
491{ 521{
492 struct parallel_data *pd_old = pinst->pd; 522 struct parallel_data *pd_old = pinst->pd;
523 int notification_mask = 0;
493 524
494 pinst->flags |= PADATA_RESET; 525 pinst->flags |= PADATA_RESET;
495 526
@@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
497 528
498 synchronize_rcu(); 529 synchronize_rcu();
499 530
531 if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
532 notification_mask |= PADATA_CPU_PARALLEL;
533 if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
534 notification_mask |= PADATA_CPU_SERIAL;
535
500 padata_flush_queues(pd_old); 536 padata_flush_queues(pd_old);
501 padata_free_pd(pd_old); 537 padata_free_pd(pd_old);
502 538
539 if (notification_mask)
540 blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
541 notification_mask,
542 &pd_new->cpumask);
543
503 pinst->flags &= ~PADATA_RESET; 544 pinst->flags &= ~PADATA_RESET;
504} 545}
505 546
506/** 547/**
507 * padata_set_cpumask - set the cpumask that padata should use 548 * padata_register_cpumask_notifier - Registers a notifier that will be called
549 * if either pcpu or cbcpu or both cpumasks change.
508 * 550 *
509 * @pinst: padata instance 551 * @pinst: A poineter to padata instance
510 * @cpumask: the cpumask to use 552 * @nblock: A pointer to notifier block.
511 */ 553 */
512int padata_set_cpumask(struct padata_instance *pinst, 554int padata_register_cpumask_notifier(struct padata_instance *pinst,
513 cpumask_var_t cpumask) 555 struct notifier_block *nblock)
514{ 556{
557 return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
558 nblock);
559}
560EXPORT_SYMBOL(padata_register_cpumask_notifier);
561
562/**
563 * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
564 * registered earlier using padata_register_cpumask_notifier
565 *
566 * @pinst: A pointer to data instance.
567 * @nlock: A pointer to notifier block.
568 */
569int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
570 struct notifier_block *nblock)
571{
572 return blocking_notifier_chain_unregister(
573 &pinst->cpumask_change_notifier,
574 nblock);
575}
576EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
577
578
579/* If cpumask contains no active cpu, we mark the instance as invalid. */
580static bool padata_validate_cpumask(struct padata_instance *pinst,
581 const struct cpumask *cpumask)
582{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
584 pinst->flags |= PADATA_INVALID;
585 return false;
586 }
587
588 pinst->flags &= ~PADATA_INVALID;
589 return true;
590}
591
592static int __padata_set_cpumasks(struct padata_instance *pinst,
593 cpumask_var_t pcpumask,
594 cpumask_var_t cbcpumask)
595{
596 int valid;
515 struct parallel_data *pd; 597 struct parallel_data *pd;
516 int err = 0; 598
599 valid = padata_validate_cpumask(pinst, pcpumask);
600 if (!valid) {
601 __padata_stop(pinst);
602 goto out_replace;
603 }
604
605 valid = padata_validate_cpumask(pinst, cbcpumask);
606 if (!valid)
607 __padata_stop(pinst);
608
609out_replace:
610 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
611 if (!pd)
612 return -ENOMEM;
613
614 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
615 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
616
617 padata_replace(pinst, pd);
618
619 if (valid)
620 __padata_start(pinst);
621
622 return 0;
623}
624
625/**
626 * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
627 * one is used by parallel workers and the second one
628 * by the wokers doing serialization.
629 *
630 * @pinst: padata instance
631 * @pcpumask: the cpumask to use for parallel workers
632 * @cbcpumask: the cpumsak to use for serial workers
633 */
634int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
635 cpumask_var_t cbcpumask)
636{
637 int err;
517 638
518 mutex_lock(&pinst->lock); 639 mutex_lock(&pinst->lock);
640 get_online_cpus();
519 641
642 err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
643
644 put_online_cpus();
645 mutex_unlock(&pinst->lock);
646
647 return err;
648
649}
650EXPORT_SYMBOL(padata_set_cpumasks);
651
652/**
653 * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
654 * equivalent to @cpumask.
655 *
656 * @pinst: padata instance
657 * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
658 * to parallel and serial cpumasks respectively.
659 * @cpumask: the cpumask to use
660 */
661int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
662 cpumask_var_t cpumask)
663{
664 struct cpumask *serial_mask, *parallel_mask;
665 int err = -EINVAL;
666
667 mutex_lock(&pinst->lock);
520 get_online_cpus(); 668 get_online_cpus();
521 669
522 pd = padata_alloc_pd(pinst, cpumask); 670 switch (cpumask_type) {
523 if (!pd) { 671 case PADATA_CPU_PARALLEL:
524 err = -ENOMEM; 672 serial_mask = pinst->cpumask.cbcpu;
525 goto out; 673 parallel_mask = cpumask;
674 break;
675 case PADATA_CPU_SERIAL:
676 parallel_mask = pinst->cpumask.pcpu;
677 serial_mask = cpumask;
678 break;
679 default:
680 goto out;
526 } 681 }
527 682
528 cpumask_copy(pinst->cpumask, cpumask); 683 err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
529
530 padata_replace(pinst, pd);
531 684
532out: 685out:
533 put_online_cpus(); 686 put_online_cpus();
534
535 mutex_unlock(&pinst->lock); 687 mutex_unlock(&pinst->lock);
536 688
537 return err; 689 return err;
@@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
543 struct parallel_data *pd; 695 struct parallel_data *pd;
544 696
545 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
546 pd = padata_alloc_pd(pinst, pinst->cpumask); 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu);
547 if (!pd) 700 if (!pd)
548 return -ENOMEM; 701 return -ENOMEM;
549 702
550 padata_replace(pinst, pd); 703 padata_replace(pinst, pd);
704
705 if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
706 padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
707 __padata_start(pinst);
551 } 708 }
552 709
553 return 0; 710 return 0;
554} 711}
555 712
556/** 713 /**
557 * padata_add_cpu - add a cpu to the padata cpumask 714 * padata_add_cpu - add a cpu to one or both(parallel and serial)
715 * padata cpumasks.
558 * 716 *
559 * @pinst: padata instance 717 * @pinst: padata instance
560 * @cpu: cpu to add 718 * @cpu: cpu to add
719 * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
720 * The @mask may be any combination of the following flags:
721 * PADATA_CPU_SERIAL - serial cpumask
722 * PADATA_CPU_PARALLEL - parallel cpumask
561 */ 723 */
562int padata_add_cpu(struct padata_instance *pinst, int cpu) 724
725int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
563{ 726{
564 int err; 727 int err;
565 728
729 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
730 return -EINVAL;
731
566 mutex_lock(&pinst->lock); 732 mutex_lock(&pinst->lock);
567 733
568 get_online_cpus(); 734 get_online_cpus();
569 cpumask_set_cpu(cpu, pinst->cpumask); 735 if (mask & PADATA_CPU_SERIAL)
736 cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
737 if (mask & PADATA_CPU_PARALLEL)
738 cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
739
570 err = __padata_add_cpu(pinst, cpu); 740 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus(); 741 put_online_cpus();
572 742
@@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
578 748
579static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) 749static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
580{ 750{
581 struct parallel_data *pd; 751 struct parallel_data *pd = NULL;
582 752
583 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 753 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
584 pd = padata_alloc_pd(pinst, pinst->cpumask); 754
755 if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
756 !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
757 __padata_stop(pinst);
758
759 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
760 pinst->cpumask.cbcpu);
585 if (!pd) 761 if (!pd)
586 return -ENOMEM; 762 return -ENOMEM;
587 763
@@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
591 return 0; 767 return 0;
592} 768}
593 769
594/** 770 /**
595 * padata_remove_cpu - remove a cpu from the padata cpumask 771 * padata_remove_cpu - remove a cpu from the one or both(serial and paralell)
772 * padata cpumasks.
596 * 773 *
597 * @pinst: padata instance 774 * @pinst: padata instance
598 * @cpu: cpu to remove 775 * @cpu: cpu to remove
776 * @mask: bitmask specifying from which cpumask @cpu should be removed
777 * The @mask may be any combination of the following flags:
778 * PADATA_CPU_SERIAL - serial cpumask
779 * PADATA_CPU_PARALLEL - parallel cpumask
599 */ 780 */
600int padata_remove_cpu(struct padata_instance *pinst, int cpu) 781int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
601{ 782{
602 int err; 783 int err;
603 784
785 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
786 return -EINVAL;
787
604 mutex_lock(&pinst->lock); 788 mutex_lock(&pinst->lock);
605 789
606 get_online_cpus(); 790 get_online_cpus();
607 cpumask_clear_cpu(cpu, pinst->cpumask); 791 if (mask & PADATA_CPU_SERIAL)
792 cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
793 if (mask & PADATA_CPU_PARALLEL)
794 cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
795
608 err = __padata_remove_cpu(pinst, cpu); 796 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus(); 797 put_online_cpus();
610 798
@@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu);
619 * 807 *
620 * @pinst: padata instance to start 808 * @pinst: padata instance to start
621 */ 809 */
622void padata_start(struct padata_instance *pinst) 810int padata_start(struct padata_instance *pinst)
623{ 811{
812 int err = 0;
813
624 mutex_lock(&pinst->lock); 814 mutex_lock(&pinst->lock);
625 pinst->flags |= PADATA_INIT; 815
816 if (pinst->flags & PADATA_INVALID)
817 err =-EINVAL;
818
819 __padata_start(pinst);
820
626 mutex_unlock(&pinst->lock); 821 mutex_unlock(&pinst->lock);
822
823 return err;
627} 824}
628EXPORT_SYMBOL(padata_start); 825EXPORT_SYMBOL(padata_start);
629 826
@@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start);
635void padata_stop(struct padata_instance *pinst) 832void padata_stop(struct padata_instance *pinst)
636{ 833{
637 mutex_lock(&pinst->lock); 834 mutex_lock(&pinst->lock);
638 pinst->flags &= ~PADATA_INIT; 835 __padata_stop(pinst);
639 mutex_unlock(&pinst->lock); 836 mutex_unlock(&pinst->lock);
640} 837}
641EXPORT_SYMBOL(padata_stop); 838EXPORT_SYMBOL(padata_stop);
642 839
643#ifdef CONFIG_HOTPLUG_CPU 840#ifdef CONFIG_HOTPLUG_CPU
841
842static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
843{
844 return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
845 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
846}
847
848
644static int padata_cpu_callback(struct notifier_block *nfb, 849static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
646{ 851{
@@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
653 switch (action) { 858 switch (action) {
654 case CPU_ONLINE: 859 case CPU_ONLINE:
655 case CPU_ONLINE_FROZEN: 860 case CPU_ONLINE_FROZEN:
656 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 861 if (!pinst_has_cpu(pinst, cpu))
657 break; 862 break;
658 mutex_lock(&pinst->lock); 863 mutex_lock(&pinst->lock);
659 err = __padata_add_cpu(pinst, cpu); 864 err = __padata_add_cpu(pinst, cpu);
@@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
664 869
665 case CPU_DOWN_PREPARE: 870 case CPU_DOWN_PREPARE:
666 case CPU_DOWN_PREPARE_FROZEN: 871 case CPU_DOWN_PREPARE_FROZEN:
667 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 872 if (!pinst_has_cpu(pinst, cpu))
668 break; 873 break;
669 mutex_lock(&pinst->lock); 874 mutex_lock(&pinst->lock);
670 err = __padata_remove_cpu(pinst, cpu); 875 err = __padata_remove_cpu(pinst, cpu);
@@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
675 880
676 case CPU_UP_CANCELED: 881 case CPU_UP_CANCELED:
677 case CPU_UP_CANCELED_FROZEN: 882 case CPU_UP_CANCELED_FROZEN:
678 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 883 if (!pinst_has_cpu(pinst, cpu))
679 break; 884 break;
680 mutex_lock(&pinst->lock); 885 mutex_lock(&pinst->lock);
681 __padata_remove_cpu(pinst, cpu); 886 __padata_remove_cpu(pinst, cpu);
@@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
683 888
684 case CPU_DOWN_FAILED: 889 case CPU_DOWN_FAILED:
685 case CPU_DOWN_FAILED_FROZEN: 890 case CPU_DOWN_FAILED_FROZEN:
686 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 891 if (!pinst_has_cpu(pinst, cpu))
687 break; 892 break;
688 mutex_lock(&pinst->lock); 893 mutex_lock(&pinst->lock);
689 __padata_add_cpu(pinst, cpu); 894 __padata_add_cpu(pinst, cpu);
@@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb,
694} 899}
695#endif 900#endif
696 901
902static void __padata_free(struct padata_instance *pinst)
903{
904#ifdef CONFIG_HOTPLUG_CPU
905 unregister_hotcpu_notifier(&pinst->cpu_notifier);
906#endif
907
908 padata_stop(pinst);
909 padata_free_pd(pinst->pd);
910 free_cpumask_var(pinst->cpumask.pcpu);
911 free_cpumask_var(pinst->cpumask.cbcpu);
912 kfree(pinst);
913}
914
/* Recover the padata_instance that embeds the given kobject. */
#define kobj2pinst(kobj_ptr)						\
	container_of(kobj_ptr, struct padata_instance, kobj)

/* Recover the padata_sysfs_entry that embeds the given attribute. */
#define attr2pentry(attr_ptr)						\
	container_of(attr_ptr, struct padata_sysfs_entry, attr)
919
/*
 * kobject release hook: frees the padata instance that embeds @kobj.
 */
static void padata_sysfs_release(struct kobject *kobj)
{
	__padata_free(kobj2pinst(kobj));
}
925
926struct padata_sysfs_entry {
927 struct attribute attr;
928 ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
929 ssize_t (*store)(struct padata_instance *, struct attribute *,
930 const char *, size_t);
931};
932
933static ssize_t show_cpumask(struct padata_instance *pinst,
934 struct attribute *attr, char *buf)
935{
936 struct cpumask *cpumask;
937 ssize_t len;
938
939 mutex_lock(&pinst->lock);
940 if (!strcmp(attr->name, "serial_cpumask"))
941 cpumask = pinst->cpumask.cbcpu;
942 else
943 cpumask = pinst->cpumask.pcpu;
944
945 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
946 nr_cpu_ids);
947 if (PAGE_SIZE - len < 2)
948 len = -EINVAL;
949 else
950 len += sprintf(buf + len, "\n");
951
952 mutex_unlock(&pinst->lock);
953 return len;
954}
955
956static ssize_t store_cpumask(struct padata_instance *pinst,
957 struct attribute *attr,
958 const char *buf, size_t count)
959{
960 cpumask_var_t new_cpumask;
961 ssize_t ret;
962 int mask_type;
963
964 if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
965 return -ENOMEM;
966
967 ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
968 nr_cpumask_bits);
969 if (ret < 0)
970 goto out;
971
972 mask_type = !strcmp(attr->name, "serial_cpumask") ?
973 PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
974 ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
975 if (!ret)
976 ret = count;
977
978out:
979 free_cpumask_var(new_cpumask);
980 return ret;
981}
982
983#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
984 static struct padata_sysfs_entry _name##_attr = \
985 __ATTR(_name, 0644, _show_name, _store_name)
986#define PADATA_ATTR_RO(_name, _show_name) \
987 static struct padata_sysfs_entry _name##_attr = \
988 __ATTR(_name, 0400, _show_name, NULL)
989
990PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
991PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
992
993/*
994 * Padata sysfs provides the following objects:
995 * serial_cpumask [RW] - cpumask for serial workers
996 * parallel_cpumask [RW] - cpumask for parallel workers
997 */
998static struct attribute *padata_default_attrs[] = {
999 &serial_cpumask_attr.attr,
1000 &parallel_cpumask_attr.attr,
1001 NULL,
1002};
1003
1004static ssize_t padata_sysfs_show(struct kobject *kobj,
1005 struct attribute *attr, char *buf)
1006{
1007 struct padata_instance *pinst;
1008 struct padata_sysfs_entry *pentry;
1009 ssize_t ret = -EIO;
1010
1011 pinst = kobj2pinst(kobj);
1012 pentry = attr2pentry(attr);
1013 if (pentry->show)
1014 ret = pentry->show(pinst, attr, buf);
1015
1016 return ret;
1017}
1018
1019static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
1020 const char *buf, size_t count)
1021{
1022 struct padata_instance *pinst;
1023 struct padata_sysfs_entry *pentry;
1024 ssize_t ret = -EIO;
1025
1026 pinst = kobj2pinst(kobj);
1027 pentry = attr2pentry(attr);
1028 if (pentry->show)
1029 ret = pentry->store(pinst, attr, buf, count);
1030
1031 return ret;
1032}
1033
1034static const struct sysfs_ops padata_sysfs_ops = {
1035 .show = padata_sysfs_show,
1036 .store = padata_sysfs_store,
1037};
1038
1039static struct kobj_type padata_attr_type = {
1040 .sysfs_ops = &padata_sysfs_ops,
1041 .default_attrs = padata_default_attrs,
1042 .release = padata_sysfs_release,
1043};
1044
697/** 1045/**
698 * padata_alloc - allocate and initialize a padata instance 1046 * padata_alloc_possible - Allocate and initialize padata instance.
1047 * Use the cpu_possible_mask for serial and
1048 * parallel workers.
699 * 1049 *
700 * @cpumask: cpumask that padata uses for parallelization
701 * @wq: workqueue to use for the allocated padata instance 1050 * @wq: workqueue to use for the allocated padata instance
702 */ 1051 */
703struct padata_instance *padata_alloc(const struct cpumask *cpumask, 1052struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
704 struct workqueue_struct *wq) 1053{
1054 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1055}
1056EXPORT_SYMBOL(padata_alloc_possible);
1057
1058/**
1059 * padata_alloc - allocate and initialize a padata instance and specify
1060 * cpumasks for serial and parallel workers.
1061 *
1062 * @wq: workqueue to use for the allocated padata instance
1063 * @pcpumask: cpumask that will be used for padata parallelization
1064 * @cbcpumask: cpumask that will be used for padata serialization
1065 */
1066struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1067 const struct cpumask *pcpumask,
1068 const struct cpumask *cbcpumask)
705{ 1069{
706 struct padata_instance *pinst; 1070 struct padata_instance *pinst;
707 struct parallel_data *pd; 1071 struct parallel_data *pd = NULL;
708 1072
709 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); 1073 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
710 if (!pinst) 1074 if (!pinst)
711 goto err; 1075 goto err;
712 1076
713 get_online_cpus(); 1077 get_online_cpus();
714 1078 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
715 pd = padata_alloc_pd(pinst, cpumask);
716 if (!pd)
717 goto err_free_inst; 1079 goto err_free_inst;
1080 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
1081 free_cpumask_var(pinst->cpumask.pcpu);
1082 goto err_free_inst;
1083 }
1084 if (!padata_validate_cpumask(pinst, pcpumask) ||
1085 !padata_validate_cpumask(pinst, cbcpumask))
1086 goto err_free_masks;
718 1087
719 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) 1088 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
720 goto err_free_pd; 1089 if (!pd)
1090 goto err_free_masks;
721 1091
722 rcu_assign_pointer(pinst->pd, pd); 1092 rcu_assign_pointer(pinst->pd, pd);
723 1093
724 pinst->wq = wq; 1094 pinst->wq = wq;
725 1095
726 cpumask_copy(pinst->cpumask, cpumask); 1096 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
1097 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
727 1098
728 pinst->flags = 0; 1099 pinst->flags = 0;
729 1100
@@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
735 1106
736 put_online_cpus(); 1107 put_online_cpus();
737 1108
1109 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1110 kobject_init(&pinst->kobj, &padata_attr_type);
738 mutex_init(&pinst->lock); 1111 mutex_init(&pinst->lock);
739 1112
740 return pinst; 1113 return pinst;
741 1114
742err_free_pd: 1115err_free_masks:
743 padata_free_pd(pd); 1116 free_cpumask_var(pinst->cpumask.pcpu);
1117 free_cpumask_var(pinst->cpumask.cbcpu);
744err_free_inst: 1118err_free_inst:
745 kfree(pinst); 1119 kfree(pinst);
746 put_online_cpus(); 1120 put_online_cpus();
@@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc);
756 */ 1130 */
757void padata_free(struct padata_instance *pinst) 1131void padata_free(struct padata_instance *pinst)
758{ 1132{
759 padata_stop(pinst); 1133 kobject_put(&pinst->kobj);
760
761 synchronize_rcu();
762
763#ifdef CONFIG_HOTPLUG_CPU
764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
770 padata_free_pd(pinst->pd);
771 free_cpumask_var(pinst->cpumask);
772 kfree(pinst);
773} 1134}
774EXPORT_SYMBOL(padata_free); 1135EXPORT_SYMBOL(padata_free);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..403d1804b198 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 214
215static inline u64 perf_clock(void) 215static inline u64 perf_clock(void)
216{ 216{
217 return cpu_clock(raw_smp_processor_id()); 217 return local_clock();
218} 218}
219 219
220/* 220/*
@@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
675 struct perf_event *event, *partial_group = NULL; 675 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 676 const struct pmu *pmu = group_event->pmu;
677 bool txn = false; 677 bool txn = false;
678 int ret;
679 678
680 if (group_event->state == PERF_EVENT_STATE_OFF) 679 if (group_event->state == PERF_EVENT_STATE_OFF)
681 return 0; 680 return 0;
@@ -703,14 +702,8 @@ group_sched_in(struct perf_event *group_event,
703 } 702 }
704 } 703 }
705 704
706 if (!txn) 705 if (!txn || !pmu->commit_txn(pmu))
707 return 0;
708
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0; 706 return 0;
713 }
714 707
715group_error: 708group_error:
716 /* 709 /*
@@ -1155,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
1155 * In order to keep per-task stats reliable we need to flip the event 1148 * In order to keep per-task stats reliable we need to flip the event
1156 * values when we flip the contexts. 1149 * values when we flip the contexts.
1157 */ 1150 */
1158 value = atomic64_read(&next_event->count); 1151 value = local64_read(&next_event->count);
1159 value = atomic64_xchg(&event->count, value); 1152 value = local64_xchg(&event->count, value);
1160 atomic64_set(&next_event->count, value); 1153 local64_set(&next_event->count, value);
1161 1154
1162 swap(event->total_time_enabled, next_event->total_time_enabled); 1155 swap(event->total_time_enabled, next_event->total_time_enabled);
1163 swap(event->total_time_running, next_event->total_time_running); 1156 swap(event->total_time_running, next_event->total_time_running);
@@ -1547,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1547 1540
1548 hwc->sample_period = sample_period; 1541 hwc->sample_period = sample_period;
1549 1542
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1543 if (local64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable(); 1544 perf_disable();
1552 perf_event_stop(event); 1545 perf_event_stop(event);
1553 atomic64_set(&hwc->period_left, 0); 1546 local64_set(&hwc->period_left, 0);
1554 perf_event_start(event); 1547 perf_event_start(event);
1555 perf_enable(); 1548 perf_enable();
1556 } 1549 }
@@ -1591,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1591 1584
1592 perf_disable(); 1585 perf_disable();
1593 event->pmu->read(event); 1586 event->pmu->read(event);
1594 now = atomic64_read(&event->count); 1587 now = local64_read(&event->count);
1595 delta = now - hwc->freq_count_stamp; 1588 delta = now - hwc->freq_count_stamp;
1596 hwc->freq_count_stamp = now; 1589 hwc->freq_count_stamp = now;
1597 1590
@@ -1743,6 +1736,11 @@ static void __perf_event_read(void *info)
1743 event->pmu->read(event); 1736 event->pmu->read(event);
1744} 1737}
1745 1738
1739static inline u64 perf_event_count(struct perf_event *event)
1740{
1741 return local64_read(&event->count) + atomic64_read(&event->child_count);
1742}
1743
1746static u64 perf_event_read(struct perf_event *event) 1744static u64 perf_event_read(struct perf_event *event)
1747{ 1745{
1748 /* 1746 /*
@@ -1762,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event)
1762 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1760 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1763 } 1761 }
1764 1762
1765 return atomic64_read(&event->count); 1763 return perf_event_count(event);
1766} 1764}
1767 1765
1768/* 1766/*
@@ -1883,7 +1881,7 @@ static void free_event_rcu(struct rcu_head *head)
1883} 1881}
1884 1882
1885static void perf_pending_sync(struct perf_event *event); 1883static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data); 1884static void perf_buffer_put(struct perf_buffer *buffer);
1887 1885
1888static void free_event(struct perf_event *event) 1886static void free_event(struct perf_event *event)
1889{ 1887{
@@ -1891,7 +1889,7 @@ static void free_event(struct perf_event *event)
1891 1889
1892 if (!event->parent) { 1890 if (!event->parent) {
1893 atomic_dec(&nr_events); 1891 atomic_dec(&nr_events);
1894 if (event->attr.mmap) 1892 if (event->attr.mmap || event->attr.mmap_data)
1895 atomic_dec(&nr_mmap_events); 1893 atomic_dec(&nr_mmap_events);
1896 if (event->attr.comm) 1894 if (event->attr.comm)
1897 atomic_dec(&nr_comm_events); 1895 atomic_dec(&nr_comm_events);
@@ -1899,9 +1897,9 @@ static void free_event(struct perf_event *event)
1899 atomic_dec(&nr_task_events); 1897 atomic_dec(&nr_task_events);
1900 } 1898 }
1901 1899
1902 if (event->data) { 1900 if (event->buffer) {
1903 perf_mmap_data_put(event->data); 1901 perf_buffer_put(event->buffer);
1904 event->data = NULL; 1902 event->buffer = NULL;
1905 } 1903 }
1906 1904
1907 if (event->destroy) 1905 if (event->destroy)
@@ -2126,13 +2124,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2126static unsigned int perf_poll(struct file *file, poll_table *wait) 2124static unsigned int perf_poll(struct file *file, poll_table *wait)
2127{ 2125{
2128 struct perf_event *event = file->private_data; 2126 struct perf_event *event = file->private_data;
2129 struct perf_mmap_data *data; 2127 struct perf_buffer *buffer;
2130 unsigned int events = POLL_HUP; 2128 unsigned int events = POLL_HUP;
2131 2129
2132 rcu_read_lock(); 2130 rcu_read_lock();
2133 data = rcu_dereference(event->data); 2131 buffer = rcu_dereference(event->buffer);
2134 if (data) 2132 if (buffer)
2135 events = atomic_xchg(&data->poll, 0); 2133 events = atomic_xchg(&buffer->poll, 0);
2136 rcu_read_unlock(); 2134 rcu_read_unlock();
2137 2135
2138 poll_wait(file, &event->waitq, wait); 2136 poll_wait(file, &event->waitq, wait);
@@ -2143,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
2143static void perf_event_reset(struct perf_event *event) 2141static void perf_event_reset(struct perf_event *event)
2144{ 2142{
2145 (void)perf_event_read(event); 2143 (void)perf_event_read(event);
2146 atomic64_set(&event->count, 0); 2144 local64_set(&event->count, 0);
2147 perf_event_update_userpage(event); 2145 perf_event_update_userpage(event);
2148} 2146}
2149 2147
@@ -2342,14 +2340,14 @@ static int perf_event_index(struct perf_event *event)
2342void perf_event_update_userpage(struct perf_event *event) 2340void perf_event_update_userpage(struct perf_event *event)
2343{ 2341{
2344 struct perf_event_mmap_page *userpg; 2342 struct perf_event_mmap_page *userpg;
2345 struct perf_mmap_data *data; 2343 struct perf_buffer *buffer;
2346 2344
2347 rcu_read_lock(); 2345 rcu_read_lock();
2348 data = rcu_dereference(event->data); 2346 buffer = rcu_dereference(event->buffer);
2349 if (!data) 2347 if (!buffer)
2350 goto unlock; 2348 goto unlock;
2351 2349
2352 userpg = data->user_page; 2350 userpg = buffer->user_page;
2353 2351
2354 /* 2352 /*
2355 * Disable preemption so as to not let the corresponding user-space 2353 * Disable preemption so as to not let the corresponding user-space
@@ -2359,9 +2357,9 @@ void perf_event_update_userpage(struct perf_event *event)
2359 ++userpg->lock; 2357 ++userpg->lock;
2360 barrier(); 2358 barrier();
2361 userpg->index = perf_event_index(event); 2359 userpg->index = perf_event_index(event);
2362 userpg->offset = atomic64_read(&event->count); 2360 userpg->offset = perf_event_count(event);
2363 if (event->state == PERF_EVENT_STATE_ACTIVE) 2361 if (event->state == PERF_EVENT_STATE_ACTIVE)
2364 userpg->offset -= atomic64_read(&event->hw.prev_count); 2362 userpg->offset -= local64_read(&event->hw.prev_count);
2365 2363
2366 userpg->time_enabled = event->total_time_enabled + 2364 userpg->time_enabled = event->total_time_enabled +
2367 atomic64_read(&event->child_total_time_enabled); 2365 atomic64_read(&event->child_total_time_enabled);
@@ -2376,6 +2374,25 @@ unlock:
2376 rcu_read_unlock(); 2374 rcu_read_unlock();
2377} 2375}
2378 2376
2377static unsigned long perf_data_size(struct perf_buffer *buffer);
2378
2379static void
2380perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2381{
2382 long max_size = perf_data_size(buffer);
2383
2384 if (watermark)
2385 buffer->watermark = min(max_size, watermark);
2386
2387 if (!buffer->watermark)
2388 buffer->watermark = max_size / 2;
2389
2390 if (flags & PERF_BUFFER_WRITABLE)
2391 buffer->writable = 1;
2392
2393 atomic_set(&buffer->refcount, 1);
2394}
2395
2379#ifndef CONFIG_PERF_USE_VMALLOC 2396#ifndef CONFIG_PERF_USE_VMALLOC
2380 2397
2381/* 2398/*
@@ -2383,15 +2400,15 @@ unlock:
2383 */ 2400 */
2384 2401
2385static struct page * 2402static struct page *
2386perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2403perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2387{ 2404{
2388 if (pgoff > data->nr_pages) 2405 if (pgoff > buffer->nr_pages)
2389 return NULL; 2406 return NULL;
2390 2407
2391 if (pgoff == 0) 2408 if (pgoff == 0)
2392 return virt_to_page(data->user_page); 2409 return virt_to_page(buffer->user_page);
2393 2410
2394 return virt_to_page(data->data_pages[pgoff - 1]); 2411 return virt_to_page(buffer->data_pages[pgoff - 1]);
2395} 2412}
2396 2413
2397static void *perf_mmap_alloc_page(int cpu) 2414static void *perf_mmap_alloc_page(int cpu)
@@ -2407,42 +2424,44 @@ static void *perf_mmap_alloc_page(int cpu)
2407 return page_address(page); 2424 return page_address(page);
2408} 2425}
2409 2426
2410static struct perf_mmap_data * 2427static struct perf_buffer *
2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2428perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2412{ 2429{
2413 struct perf_mmap_data *data; 2430 struct perf_buffer *buffer;
2414 unsigned long size; 2431 unsigned long size;
2415 int i; 2432 int i;
2416 2433
2417 size = sizeof(struct perf_mmap_data); 2434 size = sizeof(struct perf_buffer);
2418 size += nr_pages * sizeof(void *); 2435 size += nr_pages * sizeof(void *);
2419 2436
2420 data = kzalloc(size, GFP_KERNEL); 2437 buffer = kzalloc(size, GFP_KERNEL);
2421 if (!data) 2438 if (!buffer)
2422 goto fail; 2439 goto fail;
2423 2440
2424 data->user_page = perf_mmap_alloc_page(event->cpu); 2441 buffer->user_page = perf_mmap_alloc_page(cpu);
2425 if (!data->user_page) 2442 if (!buffer->user_page)
2426 goto fail_user_page; 2443 goto fail_user_page;
2427 2444
2428 for (i = 0; i < nr_pages; i++) { 2445 for (i = 0; i < nr_pages; i++) {
2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu); 2446 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2430 if (!data->data_pages[i]) 2447 if (!buffer->data_pages[i])
2431 goto fail_data_pages; 2448 goto fail_data_pages;
2432 } 2449 }
2433 2450
2434 data->nr_pages = nr_pages; 2451 buffer->nr_pages = nr_pages;
2452
2453 perf_buffer_init(buffer, watermark, flags);
2435 2454
2436 return data; 2455 return buffer;
2437 2456
2438fail_data_pages: 2457fail_data_pages:
2439 for (i--; i >= 0; i--) 2458 for (i--; i >= 0; i--)
2440 free_page((unsigned long)data->data_pages[i]); 2459 free_page((unsigned long)buffer->data_pages[i]);
2441 2460
2442 free_page((unsigned long)data->user_page); 2461 free_page((unsigned long)buffer->user_page);
2443 2462
2444fail_user_page: 2463fail_user_page:
2445 kfree(data); 2464 kfree(buffer);
2446 2465
2447fail: 2466fail:
2448 return NULL; 2467 return NULL;
@@ -2456,17 +2475,17 @@ static void perf_mmap_free_page(unsigned long addr)
2456 __free_page(page); 2475 __free_page(page);
2457} 2476}
2458 2477
2459static void perf_mmap_data_free(struct perf_mmap_data *data) 2478static void perf_buffer_free(struct perf_buffer *buffer)
2460{ 2479{
2461 int i; 2480 int i;
2462 2481
2463 perf_mmap_free_page((unsigned long)data->user_page); 2482 perf_mmap_free_page((unsigned long)buffer->user_page);
2464 for (i = 0; i < data->nr_pages; i++) 2483 for (i = 0; i < buffer->nr_pages; i++)
2465 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2484 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2466 kfree(data); 2485 kfree(buffer);
2467} 2486}
2468 2487
2469static inline int page_order(struct perf_mmap_data *data) 2488static inline int page_order(struct perf_buffer *buffer)
2470{ 2489{
2471 return 0; 2490 return 0;
2472} 2491}
@@ -2479,18 +2498,18 @@ static inline int page_order(struct perf_mmap_data *data)
2479 * Required for architectures that have d-cache aliasing issues. 2498 * Required for architectures that have d-cache aliasing issues.
2480 */ 2499 */
2481 2500
2482static inline int page_order(struct perf_mmap_data *data) 2501static inline int page_order(struct perf_buffer *buffer)
2483{ 2502{
2484 return data->page_order; 2503 return buffer->page_order;
2485} 2504}
2486 2505
2487static struct page * 2506static struct page *
2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2507perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2489{ 2508{
2490 if (pgoff > (1UL << page_order(data))) 2509 if (pgoff > (1UL << page_order(buffer)))
2491 return NULL; 2510 return NULL;
2492 2511
2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2512 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2494} 2513}
2495 2514
2496static void perf_mmap_unmark_page(void *addr) 2515static void perf_mmap_unmark_page(void *addr)
@@ -2500,57 +2519,59 @@ static void perf_mmap_unmark_page(void *addr)
2500 page->mapping = NULL; 2519 page->mapping = NULL;
2501} 2520}
2502 2521
2503static void perf_mmap_data_free_work(struct work_struct *work) 2522static void perf_buffer_free_work(struct work_struct *work)
2504{ 2523{
2505 struct perf_mmap_data *data; 2524 struct perf_buffer *buffer;
2506 void *base; 2525 void *base;
2507 int i, nr; 2526 int i, nr;
2508 2527
2509 data = container_of(work, struct perf_mmap_data, work); 2528 buffer = container_of(work, struct perf_buffer, work);
2510 nr = 1 << page_order(data); 2529 nr = 1 << page_order(buffer);
2511 2530
2512 base = data->user_page; 2531 base = buffer->user_page;
2513 for (i = 0; i < nr + 1; i++) 2532 for (i = 0; i < nr + 1; i++)
2514 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2533 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2515 2534
2516 vfree(base); 2535 vfree(base);
2517 kfree(data); 2536 kfree(buffer);
2518} 2537}
2519 2538
2520static void perf_mmap_data_free(struct perf_mmap_data *data) 2539static void perf_buffer_free(struct perf_buffer *buffer)
2521{ 2540{
2522 schedule_work(&data->work); 2541 schedule_work(&buffer->work);
2523} 2542}
2524 2543
2525static struct perf_mmap_data * 2544static struct perf_buffer *
2526perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2545perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2527{ 2546{
2528 struct perf_mmap_data *data; 2547 struct perf_buffer *buffer;
2529 unsigned long size; 2548 unsigned long size;
2530 void *all_buf; 2549 void *all_buf;
2531 2550
2532 size = sizeof(struct perf_mmap_data); 2551 size = sizeof(struct perf_buffer);
2533 size += sizeof(void *); 2552 size += sizeof(void *);
2534 2553
2535 data = kzalloc(size, GFP_KERNEL); 2554 buffer = kzalloc(size, GFP_KERNEL);
2536 if (!data) 2555 if (!buffer)
2537 goto fail; 2556 goto fail;
2538 2557
2539 INIT_WORK(&data->work, perf_mmap_data_free_work); 2558 INIT_WORK(&buffer->work, perf_buffer_free_work);
2540 2559
2541 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2560 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2542 if (!all_buf) 2561 if (!all_buf)
2543 goto fail_all_buf; 2562 goto fail_all_buf;
2544 2563
2545 data->user_page = all_buf; 2564 buffer->user_page = all_buf;
2546 data->data_pages[0] = all_buf + PAGE_SIZE; 2565 buffer->data_pages[0] = all_buf + PAGE_SIZE;
2547 data->page_order = ilog2(nr_pages); 2566 buffer->page_order = ilog2(nr_pages);
2548 data->nr_pages = 1; 2567 buffer->nr_pages = 1;
2568
2569 perf_buffer_init(buffer, watermark, flags);
2549 2570
2550 return data; 2571 return buffer;
2551 2572
2552fail_all_buf: 2573fail_all_buf:
2553 kfree(data); 2574 kfree(buffer);
2554 2575
2555fail: 2576fail:
2556 return NULL; 2577 return NULL;
@@ -2558,15 +2579,15 @@ fail:
2558 2579
2559#endif 2580#endif
2560 2581
2561static unsigned long perf_data_size(struct perf_mmap_data *data) 2582static unsigned long perf_data_size(struct perf_buffer *buffer)
2562{ 2583{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data)); 2584 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2564} 2585}
2565 2586
2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2587static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2567{ 2588{
2568 struct perf_event *event = vma->vm_file->private_data; 2589 struct perf_event *event = vma->vm_file->private_data;
2569 struct perf_mmap_data *data; 2590 struct perf_buffer *buffer;
2570 int ret = VM_FAULT_SIGBUS; 2591 int ret = VM_FAULT_SIGBUS;
2571 2592
2572 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2593 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2576,14 +2597,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2576 } 2597 }
2577 2598
2578 rcu_read_lock(); 2599 rcu_read_lock();
2579 data = rcu_dereference(event->data); 2600 buffer = rcu_dereference(event->buffer);
2580 if (!data) 2601 if (!buffer)
2581 goto unlock; 2602 goto unlock;
2582 2603
2583 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2604 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2584 goto unlock; 2605 goto unlock;
2585 2606
2586 vmf->page = perf_mmap_to_page(data, vmf->pgoff); 2607 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2587 if (!vmf->page) 2608 if (!vmf->page)
2588 goto unlock; 2609 goto unlock;
2589 2610
@@ -2598,52 +2619,35 @@ unlock:
2598 return ret; 2619 return ret;
2599} 2620}
2600 2621
2601static void 2622static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2602perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2603{
2604 long max_size = perf_data_size(data);
2605
2606 if (event->attr.watermark) {
2607 data->watermark = min_t(long, max_size,
2608 event->attr.wakeup_watermark);
2609 }
2610
2611 if (!data->watermark)
2612 data->watermark = max_size / 2;
2613
2614 atomic_set(&data->refcount, 1);
2615 rcu_assign_pointer(event->data, data);
2616}
2617
2618static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2619{ 2623{
2620 struct perf_mmap_data *data; 2624 struct perf_buffer *buffer;
2621 2625
2622 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2626 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
2623 perf_mmap_data_free(data); 2627 perf_buffer_free(buffer);
2624} 2628}
2625 2629
2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) 2630static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2627{ 2631{
2628 struct perf_mmap_data *data; 2632 struct perf_buffer *buffer;
2629 2633
2630 rcu_read_lock(); 2634 rcu_read_lock();
2631 data = rcu_dereference(event->data); 2635 buffer = rcu_dereference(event->buffer);
2632 if (data) { 2636 if (buffer) {
2633 if (!atomic_inc_not_zero(&data->refcount)) 2637 if (!atomic_inc_not_zero(&buffer->refcount))
2634 data = NULL; 2638 buffer = NULL;
2635 } 2639 }
2636 rcu_read_unlock(); 2640 rcu_read_unlock();
2637 2641
2638 return data; 2642 return buffer;
2639} 2643}
2640 2644
2641static void perf_mmap_data_put(struct perf_mmap_data *data) 2645static void perf_buffer_put(struct perf_buffer *buffer)
2642{ 2646{
2643 if (!atomic_dec_and_test(&data->refcount)) 2647 if (!atomic_dec_and_test(&buffer->refcount))
2644 return; 2648 return;
2645 2649
2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2650 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2647} 2651}
2648 2652
2649static void perf_mmap_open(struct vm_area_struct *vma) 2653static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2658,16 +2662,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2658 struct perf_event *event = vma->vm_file->private_data; 2662 struct perf_event *event = vma->vm_file->private_data;
2659 2663
2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2664 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2661 unsigned long size = perf_data_size(event->data); 2665 unsigned long size = perf_data_size(event->buffer);
2662 struct user_struct *user = event->mmap_user; 2666 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data; 2667 struct perf_buffer *buffer = event->buffer;
2664 2668
2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2669 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2666 vma->vm_mm->locked_vm -= event->mmap_locked; 2670 vma->vm_mm->locked_vm -= event->mmap_locked;
2667 rcu_assign_pointer(event->data, NULL); 2671 rcu_assign_pointer(event->buffer, NULL);
2668 mutex_unlock(&event->mmap_mutex); 2672 mutex_unlock(&event->mmap_mutex);
2669 2673
2670 perf_mmap_data_put(data); 2674 perf_buffer_put(buffer);
2671 free_uid(user); 2675 free_uid(user);
2672 } 2676 }
2673} 2677}
@@ -2685,11 +2689,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2685 unsigned long user_locked, user_lock_limit; 2689 unsigned long user_locked, user_lock_limit;
2686 struct user_struct *user = current_user(); 2690 struct user_struct *user = current_user();
2687 unsigned long locked, lock_limit; 2691 unsigned long locked, lock_limit;
2688 struct perf_mmap_data *data; 2692 struct perf_buffer *buffer;
2689 unsigned long vma_size; 2693 unsigned long vma_size;
2690 unsigned long nr_pages; 2694 unsigned long nr_pages;
2691 long user_extra, extra; 2695 long user_extra, extra;
2692 int ret = 0; 2696 int ret = 0, flags = 0;
2693 2697
2694 /* 2698 /*
2695 * Don't allow mmap() of inherited per-task counters. This would 2699 * Don't allow mmap() of inherited per-task counters. This would
@@ -2706,7 +2710,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2706 nr_pages = (vma_size / PAGE_SIZE) - 1; 2710 nr_pages = (vma_size / PAGE_SIZE) - 1;
2707 2711
2708 /* 2712 /*
2709 * If we have data pages ensure they're a power-of-two number, so we 2713 * If we have buffer pages ensure they're a power-of-two number, so we
2710 * can do bitmasks instead of modulo. 2714 * can do bitmasks instead of modulo.
2711 */ 2715 */
2712 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 2716 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2720,9 +2724,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2720 2724
2721 WARN_ON_ONCE(event->ctx->parent_ctx); 2725 WARN_ON_ONCE(event->ctx->parent_ctx);
2722 mutex_lock(&event->mmap_mutex); 2726 mutex_lock(&event->mmap_mutex);
2723 if (event->data) { 2727 if (event->buffer) {
2724 if (event->data->nr_pages == nr_pages) 2728 if (event->buffer->nr_pages == nr_pages)
2725 atomic_inc(&event->data->refcount); 2729 atomic_inc(&event->buffer->refcount);
2726 else 2730 else
2727 ret = -EINVAL; 2731 ret = -EINVAL;
2728 goto unlock; 2732 goto unlock;
@@ -2752,17 +2756,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2752 goto unlock; 2756 goto unlock;
2753 } 2757 }
2754 2758
2755 WARN_ON(event->data); 2759 WARN_ON(event->buffer);
2760
2761 if (vma->vm_flags & VM_WRITE)
2762 flags |= PERF_BUFFER_WRITABLE;
2756 2763
2757 data = perf_mmap_data_alloc(event, nr_pages); 2764 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
2758 if (!data) { 2765 event->cpu, flags);
2766 if (!buffer) {
2759 ret = -ENOMEM; 2767 ret = -ENOMEM;
2760 goto unlock; 2768 goto unlock;
2761 } 2769 }
2762 2770 rcu_assign_pointer(event->buffer, buffer);
2763 perf_mmap_data_init(event, data);
2764 if (vma->vm_flags & VM_WRITE)
2765 event->data->writable = 1;
2766 2771
2767 atomic_long_add(user_extra, &user->locked_vm); 2772 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra; 2773 event->mmap_locked = extra;
@@ -2941,11 +2946,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2941 return NULL; 2946 return NULL;
2942} 2947}
2943 2948
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949 2949
2950/* 2950/*
2951 * We assume there is only KVM supporting the callbacks. 2951 * We assume there is only KVM supporting the callbacks.
@@ -2971,15 +2971,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2971/* 2971/*
2972 * Output 2972 * Output
2973 */ 2973 */
2974static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2974static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
2975 unsigned long offset, unsigned long head) 2975 unsigned long offset, unsigned long head)
2976{ 2976{
2977 unsigned long mask; 2977 unsigned long mask;
2978 2978
2979 if (!data->writable) 2979 if (!buffer->writable)
2980 return true; 2980 return true;
2981 2981
2982 mask = perf_data_size(data) - 1; 2982 mask = perf_data_size(buffer) - 1;
2983 2983
2984 offset = (offset - tail) & mask; 2984 offset = (offset - tail) & mask;
2985 head = (head - tail) & mask; 2985 head = (head - tail) & mask;
@@ -2992,7 +2992,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2992 2992
2993static void perf_output_wakeup(struct perf_output_handle *handle) 2993static void perf_output_wakeup(struct perf_output_handle *handle)
2994{ 2994{
2995 atomic_set(&handle->data->poll, POLL_IN); 2995 atomic_set(&handle->buffer->poll, POLL_IN);
2996 2996
2997 if (handle->nmi) { 2997 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 2998 handle->event->pending_wakeup = 1;
@@ -3012,45 +3012,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 */ 3012 */
3013static void perf_output_get_handle(struct perf_output_handle *handle) 3013static void perf_output_get_handle(struct perf_output_handle *handle)
3014{ 3014{
3015 struct perf_mmap_data *data = handle->data; 3015 struct perf_buffer *buffer = handle->buffer;
3016 3016
3017 preempt_disable(); 3017 preempt_disable();
3018 local_inc(&data->nest); 3018 local_inc(&buffer->nest);
3019 handle->wakeup = local_read(&data->wakeup); 3019 handle->wakeup = local_read(&buffer->wakeup);
3020} 3020}
3021 3021
3022static void perf_output_put_handle(struct perf_output_handle *handle) 3022static void perf_output_put_handle(struct perf_output_handle *handle)
3023{ 3023{
3024 struct perf_mmap_data *data = handle->data; 3024 struct perf_buffer *buffer = handle->buffer;
3025 unsigned long head; 3025 unsigned long head;
3026 3026
3027again: 3027again:
3028 head = local_read(&data->head); 3028 head = local_read(&buffer->head);
3029 3029
3030 /* 3030 /*
3031 * IRQ/NMI can happen here, which means we can miss a head update. 3031 * IRQ/NMI can happen here, which means we can miss a head update.
3032 */ 3032 */
3033 3033
3034 if (!local_dec_and_test(&data->nest)) 3034 if (!local_dec_and_test(&buffer->nest))
3035 goto out; 3035 goto out;
3036 3036
3037 /* 3037 /*
3038 * Publish the known good head. Rely on the full barrier implied 3038 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() order the data->head read and this 3039 * by atomic_dec_and_test() order the buffer->head read and this
3040 * write. 3040 * write.
3041 */ 3041 */
3042 data->user_page->data_head = head; 3042 buffer->user_page->data_head = head;
3043 3043
3044 /* 3044 /*
3045 * Now check if we missed an update, rely on the (compiler) 3045 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head. 3046 * barrier in atomic_dec_and_test() to re-read buffer->head.
3047 */ 3047 */
3048 if (unlikely(head != local_read(&data->head))) { 3048 if (unlikely(head != local_read(&buffer->head))) {
3049 local_inc(&data->nest); 3049 local_inc(&buffer->nest);
3050 goto again; 3050 goto again;
3051 } 3051 }
3052 3052
3053 if (handle->wakeup != local_read(&data->wakeup)) 3053 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3054 perf_output_wakeup(handle);
3055 3055
3056 out: 3056 out:
@@ -3070,12 +3070,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3070 buf += size; 3070 buf += size;
3071 handle->size -= size; 3071 handle->size -= size;
3072 if (!handle->size) { 3072 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data; 3073 struct perf_buffer *buffer = handle->buffer;
3074 3074
3075 handle->page++; 3075 handle->page++;
3076 handle->page &= data->nr_pages - 1; 3076 handle->page &= buffer->nr_pages - 1;
3077 handle->addr = data->data_pages[handle->page]; 3077 handle->addr = buffer->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data); 3078 handle->size = PAGE_SIZE << page_order(buffer);
3079 } 3079 }
3080 } while (len); 3080 } while (len);
3081} 3081}
@@ -3084,7 +3084,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3084 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
3085 int nmi, int sample) 3085 int nmi, int sample)
3086{ 3086{
3087 struct perf_mmap_data *data; 3087 struct perf_buffer *buffer;
3088 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
3089 int have_lost; 3089 int have_lost;
3090 struct { 3090 struct {
@@ -3100,19 +3100,19 @@ int perf_output_begin(struct perf_output_handle *handle,
3100 if (event->parent) 3100 if (event->parent)
3101 event = event->parent; 3101 event = event->parent;
3102 3102
3103 data = rcu_dereference(event->data); 3103 buffer = rcu_dereference(event->buffer);
3104 if (!data) 3104 if (!buffer)
3105 goto out; 3105 goto out;
3106 3106
3107 handle->data = data; 3107 handle->buffer = buffer;
3108 handle->event = event; 3108 handle->event = event;
3109 handle->nmi = nmi; 3109 handle->nmi = nmi;
3110 handle->sample = sample; 3110 handle->sample = sample;
3111 3111
3112 if (!data->nr_pages) 3112 if (!buffer->nr_pages)
3113 goto out; 3113 goto out;
3114 3114
3115 have_lost = local_read(&data->lost); 3115 have_lost = local_read(&buffer->lost);
3116 if (have_lost) 3116 if (have_lost)
3117 size += sizeof(lost_event); 3117 size += sizeof(lost_event);
3118 3118
@@ -3124,30 +3124,30 @@ int perf_output_begin(struct perf_output_handle *handle,
3124 * tail pointer. So that all reads will be completed before the 3124 * tail pointer. So that all reads will be completed before the
3125 * write is issued. 3125 * write is issued.
3126 */ 3126 */
3127 tail = ACCESS_ONCE(data->user_page->data_tail); 3127 tail = ACCESS_ONCE(buffer->user_page->data_tail);
3128 smp_rmb(); 3128 smp_rmb();
3129 offset = head = local_read(&data->head); 3129 offset = head = local_read(&buffer->head);
3130 head += size; 3130 head += size;
3131 if (unlikely(!perf_output_space(data, tail, offset, head))) 3131 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3132 goto fail; 3132 goto fail;
3133 } while (local_cmpxchg(&data->head, offset, head) != offset); 3133 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3134 3134
3135 if (head - local_read(&data->wakeup) > data->watermark) 3135 if (head - local_read(&buffer->wakeup) > buffer->watermark)
3136 local_add(data->watermark, &data->wakeup); 3136 local_add(buffer->watermark, &buffer->wakeup);
3137 3137
3138 handle->page = offset >> (PAGE_SHIFT + page_order(data)); 3138 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3139 handle->page &= data->nr_pages - 1; 3139 handle->page &= buffer->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); 3140 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3141 handle->addr = data->data_pages[handle->page]; 3141 handle->addr = buffer->data_pages[handle->page];
3142 handle->addr += handle->size; 3142 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size; 3143 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3144 3144
3145 if (have_lost) { 3145 if (have_lost) {
3146 lost_event.header.type = PERF_RECORD_LOST; 3146 lost_event.header.type = PERF_RECORD_LOST;
3147 lost_event.header.misc = 0; 3147 lost_event.header.misc = 0;
3148 lost_event.header.size = sizeof(lost_event); 3148 lost_event.header.size = sizeof(lost_event);
3149 lost_event.id = event->id; 3149 lost_event.id = event->id;
3150 lost_event.lost = local_xchg(&data->lost, 0); 3150 lost_event.lost = local_xchg(&buffer->lost, 0);
3151 3151
3152 perf_output_put(handle, lost_event); 3152 perf_output_put(handle, lost_event);
3153 } 3153 }
@@ -3155,7 +3155,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3155 return 0; 3155 return 0;
3156 3156
3157fail: 3157fail:
3158 local_inc(&data->lost); 3158 local_inc(&buffer->lost);
3159 perf_output_put_handle(handle); 3159 perf_output_put_handle(handle);
3160out: 3160out:
3161 rcu_read_unlock(); 3161 rcu_read_unlock();
@@ -3166,15 +3166,15 @@ out:
3166void perf_output_end(struct perf_output_handle *handle) 3166void perf_output_end(struct perf_output_handle *handle)
3167{ 3167{
3168 struct perf_event *event = handle->event; 3168 struct perf_event *event = handle->event;
3169 struct perf_mmap_data *data = handle->data; 3169 struct perf_buffer *buffer = handle->buffer;
3170 3170
3171 int wakeup_events = event->attr.wakeup_events; 3171 int wakeup_events = event->attr.wakeup_events;
3172 3172
3173 if (handle->sample && wakeup_events) { 3173 if (handle->sample && wakeup_events) {
3174 int events = local_inc_return(&data->events); 3174 int events = local_inc_return(&buffer->events);
3175 if (events >= wakeup_events) { 3175 if (events >= wakeup_events) {
3176 local_sub(wakeup_events, &data->events); 3176 local_sub(wakeup_events, &buffer->events);
3177 local_inc(&data->wakeup); 3177 local_inc(&buffer->wakeup);
3178 } 3178 }
3179 } 3179 }
3180 3180
@@ -3211,7 +3211,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3211 u64 values[4]; 3211 u64 values[4];
3212 int n = 0; 3212 int n = 0;
3213 3213
3214 values[n++] = atomic64_read(&event->count); 3214 values[n++] = perf_event_count(event);
3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3216 values[n++] = event->total_time_enabled + 3216 values[n++] = event->total_time_enabled +
3217 atomic64_read(&event->child_total_time_enabled); 3217 atomic64_read(&event->child_total_time_enabled);
@@ -3248,7 +3248,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3248 if (leader != event) 3248 if (leader != event)
3249 leader->pmu->read(leader); 3249 leader->pmu->read(leader);
3250 3250
3251 values[n++] = atomic64_read(&leader->count); 3251 values[n++] = perf_event_count(leader);
3252 if (read_format & PERF_FORMAT_ID) 3252 if (read_format & PERF_FORMAT_ID)
3253 values[n++] = primary_event_id(leader); 3253 values[n++] = primary_event_id(leader);
3254 3254
@@ -3260,7 +3260,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3260 if (sub != event) 3260 if (sub != event)
3261 sub->pmu->read(sub); 3261 sub->pmu->read(sub);
3262 3262
3263 values[n++] = atomic64_read(&sub->count); 3263 values[n++] = perf_event_count(sub);
3264 if (read_format & PERF_FORMAT_ID) 3264 if (read_format & PERF_FORMAT_ID)
3265 values[n++] = primary_event_id(sub); 3265 values[n++] = primary_event_id(sub);
3266 3266
@@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event,
3491/* 3491/*
3492 * task tracking -- fork/exit 3492 * task tracking -- fork/exit
3493 * 3493 *
3494 * enabled by: attr.comm | attr.mmap | attr.task 3494 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3495 */ 3495 */
3496 3496
3497struct perf_task_event { 3497struct perf_task_event {
@@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event)
3541 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3541 if (event->cpu != -1 && event->cpu != smp_processor_id())
3542 return 0; 3542 return 0;
3543 3543
3544 if (event->attr.comm || event->attr.mmap || event->attr.task) 3544 if (event->attr.comm || event->attr.mmap ||
3545 event->attr.mmap_data || event->attr.task)
3545 return 1; 3546 return 1;
3546 3547
3547 return 0; 3548 return 0;
@@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event,
3766} 3767}
3767 3768
3768static int perf_event_mmap_match(struct perf_event *event, 3769static int perf_event_mmap_match(struct perf_event *event,
3769 struct perf_mmap_event *mmap_event) 3770 struct perf_mmap_event *mmap_event,
3771 int executable)
3770{ 3772{
3771 if (event->state < PERF_EVENT_STATE_INACTIVE) 3773 if (event->state < PERF_EVENT_STATE_INACTIVE)
3772 return 0; 3774 return 0;
@@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event,
3774 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3776 if (event->cpu != -1 && event->cpu != smp_processor_id())
3775 return 0; 3777 return 0;
3776 3778
3777 if (event->attr.mmap) 3779 if ((!executable && event->attr.mmap_data) ||
3780 (executable && event->attr.mmap))
3778 return 1; 3781 return 1;
3779 3782
3780 return 0; 3783 return 0;
3781} 3784}
3782 3785
3783static void perf_event_mmap_ctx(struct perf_event_context *ctx, 3786static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3784 struct perf_mmap_event *mmap_event) 3787 struct perf_mmap_event *mmap_event,
3788 int executable)
3785{ 3789{
3786 struct perf_event *event; 3790 struct perf_event *event;
3787 3791
3788 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3792 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3789 if (perf_event_mmap_match(event, mmap_event)) 3793 if (perf_event_mmap_match(event, mmap_event, executable))
3790 perf_event_mmap_output(event, mmap_event); 3794 perf_event_mmap_output(event, mmap_event);
3791 } 3795 }
3792} 3796}
@@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3830 if (!vma->vm_mm) { 3834 if (!vma->vm_mm) {
3831 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3835 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3832 goto got_name; 3836 goto got_name;
3837 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
3838 vma->vm_end >= vma->vm_mm->brk) {
3839 name = strncpy(tmp, "[heap]", sizeof(tmp));
3840 goto got_name;
3841 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
3842 vma->vm_end >= vma->vm_mm->start_stack) {
3843 name = strncpy(tmp, "[stack]", sizeof(tmp));
3844 goto got_name;
3833 } 3845 }
3834 3846
3835 name = strncpy(tmp, "//anon", sizeof(tmp)); 3847 name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3846,17 +3858,17 @@ got_name:
3846 3858
3847 rcu_read_lock(); 3859 rcu_read_lock();
3848 cpuctx = &get_cpu_var(perf_cpu_context); 3860 cpuctx = &get_cpu_var(perf_cpu_context);
3849 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3861 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
3850 ctx = rcu_dereference(current->perf_event_ctxp); 3862 ctx = rcu_dereference(current->perf_event_ctxp);
3851 if (ctx) 3863 if (ctx)
3852 perf_event_mmap_ctx(ctx, mmap_event); 3864 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
3853 put_cpu_var(perf_cpu_context); 3865 put_cpu_var(perf_cpu_context);
3854 rcu_read_unlock(); 3866 rcu_read_unlock();
3855 3867
3856 kfree(buf); 3868 kfree(buf);
3857} 3869}
3858 3870
3859void __perf_event_mmap(struct vm_area_struct *vma) 3871void perf_event_mmap(struct vm_area_struct *vma)
3860{ 3872{
3861 struct perf_mmap_event mmap_event; 3873 struct perf_mmap_event mmap_event;
3862 3874
@@ -4018,14 +4030,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
4018 hwc->last_period = hwc->sample_period; 4030 hwc->last_period = hwc->sample_period;
4019 4031
4020again: 4032again:
4021 old = val = atomic64_read(&hwc->period_left); 4033 old = val = local64_read(&hwc->period_left);
4022 if (val < 0) 4034 if (val < 0)
4023 return 0; 4035 return 0;
4024 4036
4025 nr = div64_u64(period + val, period); 4037 nr = div64_u64(period + val, period);
4026 offset = nr * period; 4038 offset = nr * period;
4027 val -= offset; 4039 val -= offset;
4028 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) 4040 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4029 goto again; 4041 goto again;
4030 4042
4031 return nr; 4043 return nr;
@@ -4064,7 +4076,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4064{ 4076{
4065 struct hw_perf_event *hwc = &event->hw; 4077 struct hw_perf_event *hwc = &event->hw;
4066 4078
4067 atomic64_add(nr, &event->count); 4079 local64_add(nr, &event->count);
4068 4080
4069 if (!regs) 4081 if (!regs)
4070 return; 4082 return;
@@ -4075,7 +4087,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4075 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4087 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4076 return perf_swevent_overflow(event, 1, nmi, data, regs); 4088 return perf_swevent_overflow(event, 1, nmi, data, regs);
4077 4089
4078 if (atomic64_add_negative(nr, &hwc->period_left)) 4090 if (local64_add_negative(nr, &hwc->period_left))
4079 return; 4091 return;
4080 4092
4081 perf_swevent_overflow(event, 0, nmi, data, regs); 4093 perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4213,14 +4225,12 @@ int perf_swevent_get_recursion_context(void)
4213} 4225}
4214EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4226EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4215 4227
4216void perf_swevent_put_recursion_context(int rctx) 4228void inline perf_swevent_put_recursion_context(int rctx)
4217{ 4229{
4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4230 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4219 barrier(); 4231 barrier();
4220 cpuctx->recursion[rctx]--; 4232 cpuctx->recursion[rctx]--;
4221} 4233}
4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4223
4224 4234
4225void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4235void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4226 struct pt_regs *regs, u64 addr) 4236 struct pt_regs *regs, u64 addr)
@@ -4368,8 +4378,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4368 u64 now; 4378 u64 now;
4369 4379
4370 now = cpu_clock(cpu); 4380 now = cpu_clock(cpu);
4371 prev = atomic64_xchg(&event->hw.prev_count, now); 4381 prev = local64_xchg(&event->hw.prev_count, now);
4372 atomic64_add(now - prev, &event->count); 4382 local64_add(now - prev, &event->count);
4373} 4383}
4374 4384
4375static int cpu_clock_perf_event_enable(struct perf_event *event) 4385static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4377,7 +4387,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
4377 struct hw_perf_event *hwc = &event->hw; 4387 struct hw_perf_event *hwc = &event->hw;
4378 int cpu = raw_smp_processor_id(); 4388 int cpu = raw_smp_processor_id();
4379 4389
4380 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4390 local64_set(&hwc->prev_count, cpu_clock(cpu));
4381 perf_swevent_start_hrtimer(event); 4391 perf_swevent_start_hrtimer(event);
4382 4392
4383 return 0; 4393 return 0;
@@ -4409,9 +4419,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4409 u64 prev; 4419 u64 prev;
4410 s64 delta; 4420 s64 delta;
4411 4421
4412 prev = atomic64_xchg(&event->hw.prev_count, now); 4422 prev = local64_xchg(&event->hw.prev_count, now);
4413 delta = now - prev; 4423 delta = now - prev;
4414 atomic64_add(delta, &event->count); 4424 local64_add(delta, &event->count);
4415} 4425}
4416 4426
4417static int task_clock_perf_event_enable(struct perf_event *event) 4427static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4421,7 +4431,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
4421 4431
4422 now = event->ctx->time; 4432 now = event->ctx->time;
4423 4433
4424 atomic64_set(&hwc->prev_count, now); 4434 local64_set(&hwc->prev_count, now);
4425 4435
4426 perf_swevent_start_hrtimer(event); 4436 perf_swevent_start_hrtimer(event);
4427 4437
@@ -4601,7 +4611,7 @@ static int perf_tp_event_match(struct perf_event *event,
4601} 4611}
4602 4612
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 4613void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head) 4614 struct pt_regs *regs, struct hlist_head *head, int rctx)
4605{ 4615{
4606 struct perf_sample_data data; 4616 struct perf_sample_data data;
4607 struct perf_event *event; 4617 struct perf_event *event;
@@ -4615,12 +4625,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4615 perf_sample_data_init(&data, addr); 4625 perf_sample_data_init(&data, addr);
4616 data.raw = &raw; 4626 data.raw = &raw;
4617 4627
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4628 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs)) 4629 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs); 4630 perf_swevent_add(event, count, 1, &data, regs);
4622 } 4631 }
4623 rcu_read_unlock(); 4632
4633 perf_swevent_put_recursion_context(rctx);
4624} 4634}
4625EXPORT_SYMBOL_GPL(perf_tp_event); 4635EXPORT_SYMBOL_GPL(perf_tp_event);
4626 4636
@@ -4864,7 +4874,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 hwc->sample_period = 1; 4874 hwc->sample_period = 1;
4865 hwc->last_period = hwc->sample_period; 4875 hwc->last_period = hwc->sample_period;
4866 4876
4867 atomic64_set(&hwc->period_left, hwc->sample_period); 4877 local64_set(&hwc->period_left, hwc->sample_period);
4868 4878
4869 /* 4879 /*
4870 * we currently do not support PERF_FORMAT_GROUP on inherited events 4880 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4913,7 +4923,7 @@ done:
4913 4923
4914 if (!event->parent) { 4924 if (!event->parent) {
4915 atomic_inc(&nr_events); 4925 atomic_inc(&nr_events);
4916 if (event->attr.mmap) 4926 if (event->attr.mmap || event->attr.mmap_data)
4917 atomic_inc(&nr_mmap_events); 4927 atomic_inc(&nr_mmap_events);
4918 if (event->attr.comm) 4928 if (event->attr.comm)
4919 atomic_inc(&nr_comm_events); 4929 atomic_inc(&nr_comm_events);
@@ -5007,7 +5017,7 @@ err_size:
5007static int 5017static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5018perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5009{ 5019{
5010 struct perf_mmap_data *data = NULL, *old_data = NULL; 5020 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5011 int ret = -EINVAL; 5021 int ret = -EINVAL;
5012 5022
5013 if (!output_event) 5023 if (!output_event)
@@ -5037,19 +5047,19 @@ set:
5037 5047
5038 if (output_event) { 5048 if (output_event) {
5039 /* get the buffer we want to redirect to */ 5049 /* get the buffer we want to redirect to */
5040 data = perf_mmap_data_get(output_event); 5050 buffer = perf_buffer_get(output_event);
5041 if (!data) 5051 if (!buffer)
5042 goto unlock; 5052 goto unlock;
5043 } 5053 }
5044 5054
5045 old_data = event->data; 5055 old_buffer = event->buffer;
5046 rcu_assign_pointer(event->data, data); 5056 rcu_assign_pointer(event->buffer, buffer);
5047 ret = 0; 5057 ret = 0;
5048unlock: 5058unlock:
5049 mutex_unlock(&event->mmap_mutex); 5059 mutex_unlock(&event->mmap_mutex);
5050 5060
5051 if (old_data) 5061 if (old_buffer)
5052 perf_mmap_data_put(old_data); 5062 perf_buffer_put(old_buffer);
5053out: 5063out:
5054 return ret; 5064 return ret;
5055} 5065}
@@ -5298,7 +5308,7 @@ inherit_event(struct perf_event *parent_event,
5298 hwc->sample_period = sample_period; 5308 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period; 5309 hwc->last_period = sample_period;
5300 5310
5301 atomic64_set(&hwc->period_left, sample_period); 5311 local64_set(&hwc->period_left, sample_period);
5302 } 5312 }
5303 5313
5304 child_event->overflow_handler = parent_event->overflow_handler; 5314 child_event->overflow_handler = parent_event->overflow_handler;
@@ -5359,12 +5369,12 @@ static void sync_child_event(struct perf_event *child_event,
5359 if (child_event->attr.inherit_stat) 5369 if (child_event->attr.inherit_stat)
5360 perf_event_read_event(child_event, child); 5370 perf_event_read_event(child_event, child);
5361 5371
5362 child_val = atomic64_read(&child_event->count); 5372 child_val = perf_event_count(child_event);
5363 5373
5364 /* 5374 /*
5365 * Add back the child's count to the parent's count: 5375 * Add back the child's count to the parent's count:
5366 */ 5376 */
5367 atomic64_add(child_val, &parent_event->count); 5377 atomic64_add(child_val, &parent_event->child_count);
5368 atomic64_add(child_event->total_time_enabled, 5378 atomic64_add(child_event->total_time_enabled,
5369 &parent_event->child_total_time_enabled); 5379 &parent_event->child_total_time_enabled);
5370 atomic64_add(child_event->total_time_running, 5380 atomic64_add(child_event->total_time_running,
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..996a4dec5f96 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
49 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
50 */ 50 */
51struct pm_qos_request_list { 51enum pm_qos_type {
52 struct list_head list; 52 PM_QOS_MAX, /* return the largest value */
53 union { 53 PM_QOS_MIN /* return the smallest value */
54 s32 value;
55 s32 usec;
56 s32 kbps;
57 };
58 int pm_qos_class;
59}; 54};
60 55
61static s32 max_compare(s32 v1, s32 v2);
62static s32 min_compare(s32 v1, s32 v2);
63
64struct pm_qos_object { 56struct pm_qos_object {
65 struct pm_qos_request_list requests; 57 struct plist_head requests;
66 struct blocking_notifier_head *notifiers; 58 struct blocking_notifier_head *notifiers;
67 struct miscdevice pm_qos_power_miscdev; 59 struct miscdevice pm_qos_power_miscdev;
68 char *name; 60 char *name;
69 s32 default_value; 61 s32 default_value;
70 atomic_t target_value; 62 enum pm_qos_type type;
71 s32 (*comparitor)(s32, s32);
72}; 63};
73 64
65static DEFINE_SPINLOCK(pm_qos_lock);
66
74static struct pm_qos_object null_pm_qos; 67static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 69static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, 70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
78 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 72 .name = "cpu_dma_latency",
80 .default_value = 2000 * USEC_PER_SEC, 73 .default_value = 2000 * USEC_PER_SEC,
81 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 74 .type = PM_QOS_MIN,
82 .comparitor = min_compare
83}; 75};
84 76
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 78static struct pm_qos_object network_lat_pm_qos = {
87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, 79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
88 .notifiers = &network_lat_notifier, 80 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 81 .name = "network_latency",
90 .default_value = 2000 * USEC_PER_SEC, 82 .default_value = 2000 * USEC_PER_SEC,
91 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 83 .type = PM_QOS_MIN
92 .comparitor = min_compare
93}; 84};
94 85
95 86
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 88static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, 89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
99 .notifiers = &network_throughput_notifier, 90 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 91 .name = "network_throughput",
101 .default_value = 0, 92 .default_value = 0,
102 .target_value = ATOMIC_INIT(0), 93 .type = PM_QOS_MAX,
103 .comparitor = max_compare
104}; 94};
105 95
106 96
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
111 &network_throughput_pm_qos 101 &network_throughput_pm_qos
112}; 102};
113 103
114static DEFINE_SPINLOCK(pm_qos_lock);
115
116static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
117 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
118static int pm_qos_power_open(struct inode *inode, struct file *filp); 106static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
124 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
125}; 113};
126 114
127/* static helper functions */ 115/* unlocked internal variant */
128static s32 max_compare(s32 v1, s32 v2) 116static inline int pm_qos_get_value(struct pm_qos_object *o)
129{ 117{
130 return max(v1, v2); 118 if (plist_head_empty(&o->requests))
131} 119 return o->default_value;
132 120
133static s32 min_compare(s32 v1, s32 v2) 121 switch (o->type) {
134{ 122 case PM_QOS_MIN:
135 return min(v1, v2); 123 return plist_last(&o->requests)->prio;
136}
137 124
125 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio;
138 127
139static void update_target(int pm_qos_class) 128 default:
129 /* runtime check for not using enum */
130 BUG();
131 }
132}
133
134static void update_target(struct pm_qos_object *o, struct plist_node *node,
135 int del, int value)
140{ 136{
141 s32 extreme_value;
142 struct pm_qos_request_list *node;
143 unsigned long flags; 137 unsigned long flags;
144 int call_notifier = 0; 138 int prev_value, curr_value;
145 139
146 spin_lock_irqsave(&pm_qos_lock, flags); 140 spin_lock_irqsave(&pm_qos_lock, flags);
147 extreme_value = pm_qos_array[pm_qos_class]->default_value; 141 prev_value = pm_qos_get_value(o);
148 list_for_each_entry(node, 142 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
149 &pm_qos_array[pm_qos_class]->requests.list, list) { 143 if (value != PM_QOS_DEFAULT_VALUE) {
150 extreme_value = pm_qos_array[pm_qos_class]->comparitor( 144 /*
151 extreme_value, node->value); 145 * to change the list, we atomically remove, reinit
152 } 146 * with new value and add, then see if the extremal
153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != 147 * changed
154 extreme_value) { 148 */
155 call_notifier = 1; 149 plist_del(node, &o->requests);
156 atomic_set(&pm_qos_array[pm_qos_class]->target_value, 150 plist_node_init(node, value);
157 extreme_value); 151 plist_add(node, &o->requests);
158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, 152 } else if (del) {
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value)); 153 plist_del(node, &o->requests);
154 } else {
155 plist_add(node, &o->requests);
160 } 156 }
157 curr_value = pm_qos_get_value(o);
161 spin_unlock_irqrestore(&pm_qos_lock, flags); 158 spin_unlock_irqrestore(&pm_qos_lock, flags);
162 159
163 if (call_notifier) 160 if (prev_value != curr_value)
164 blocking_notifier_call_chain( 161 blocking_notifier_call_chain(o->notifiers,
165 pm_qos_array[pm_qos_class]->notifiers, 162 (unsigned long)curr_value,
166 (unsigned long) extreme_value, NULL); 163 NULL);
167} 164}
168 165
169static int register_pm_qos_misc(struct pm_qos_object *qos) 166static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor)
196 */ 193 */
197int pm_qos_request(int pm_qos_class) 194int pm_qos_request(int pm_qos_class)
198{ 195{
199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 196 unsigned long flags;
197 int value;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return value;
200} 204}
201EXPORT_SYMBOL_GPL(pm_qos_request); 205EXPORT_SYMBOL_GPL(pm_qos_request);
202 206
207int pm_qos_request_active(struct pm_qos_request_list *req)
208{
209 return req->pm_qos_class != 0;
210}
211EXPORT_SYMBOL_GPL(pm_qos_request_active);
212
203/** 213/**
204 * pm_qos_add_request - inserts new qos request into the list 214 * pm_qos_add_request - inserts new qos request into the list
205 * @pm_qos_class: identifies which list of qos request to use 215 * @pm_qos_class: identifies which list of qos request to use
@@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
211 * element as a handle for use in updating and removal. Call needs to save 221 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use. 222 * this handle for later use.
213 */ 223 */
214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) 224void pm_qos_add_request(struct pm_qos_request_list *dep,
225 int pm_qos_class, s32 value)
215{ 226{
216 struct pm_qos_request_list *dep; 227 struct pm_qos_object *o = pm_qos_array[pm_qos_class];
217 unsigned long flags; 228 int new_value;
218 229
219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); 230 if (pm_qos_request_active(dep)) {
220 if (dep) { 231 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
221 if (value == PM_QOS_DEFAULT_VALUE) 232 return;
222 dep->value = pm_qos_array[pm_qos_class]->default_value;
223 else
224 dep->value = value;
225 dep->pm_qos_class = pm_qos_class;
226
227 spin_lock_irqsave(&pm_qos_lock, flags);
228 list_add(&dep->list,
229 &pm_qos_array[pm_qos_class]->requests.list);
230 spin_unlock_irqrestore(&pm_qos_lock, flags);
231 update_target(pm_qos_class);
232 } 233 }
233 234 if (value == PM_QOS_DEFAULT_VALUE)
234 return dep; 235 new_value = o->default_value;
236 else
237 new_value = value;
238 plist_node_init(&dep->list, new_value);
239 dep->pm_qos_class = pm_qos_class;
240 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
235} 241}
236EXPORT_SYMBOL_GPL(pm_qos_add_request); 242EXPORT_SYMBOL_GPL(pm_qos_add_request);
237 243
@@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
246 * Attempts are made to make this code callable on hot code paths. 252 * Attempts are made to make this code callable on hot code paths.
247 */ 253 */
248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 254void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value) 255 s32 new_value)
250{ 256{
251 unsigned long flags;
252 int pending_update = 0;
253 s32 temp; 257 s32 temp;
258 struct pm_qos_object *o;
259
260 if (!pm_qos_req) /*guard against callers passing in null */
261 return;
254 262
255 if (pm_qos_req) { /*guard against callers passing in null */ 263 if (!pm_qos_request_active(pm_qos_req)) {
256 spin_lock_irqsave(&pm_qos_lock, flags); 264 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
257 if (new_value == PM_QOS_DEFAULT_VALUE) 265 return;
258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
259 else
260 temp = new_value;
261
262 if (temp != pm_qos_req->value) {
263 pending_update = 1;
264 pm_qos_req->value = temp;
265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
269 } 266 }
267
268 o = pm_qos_array[pm_qos_req->pm_qos_class];
269
270 if (new_value == PM_QOS_DEFAULT_VALUE)
271 temp = o->default_value;
272 else
273 temp = new_value;
274
275 if (temp != pm_qos_req->list.prio)
276 update_target(o, &pm_qos_req->list, 0, temp);
270} 277}
271EXPORT_SYMBOL_GPL(pm_qos_update_request); 278EXPORT_SYMBOL_GPL(pm_qos_update_request);
272 279
@@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
280 */ 287 */
281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 288void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
282{ 289{
283 unsigned long flags; 290 struct pm_qos_object *o;
284 int qos_class;
285 291
286 if (pm_qos_req == NULL) 292 if (pm_qos_req == NULL)
287 return; 293 return;
288 /* silent return to keep pcm code cleaner */ 294 /* silent return to keep pcm code cleaner */
289 295
290 qos_class = pm_qos_req->pm_qos_class; 296 if (!pm_qos_request_active(pm_qos_req)) {
291 spin_lock_irqsave(&pm_qos_lock, flags); 297 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
292 list_del(&pm_qos_req->list); 298 return;
293 kfree(pm_qos_req); 299 }
294 spin_unlock_irqrestore(&pm_qos_lock, flags); 300
295 update_target(qos_class); 301 o = pm_qos_array[pm_qos_req->pm_qos_class];
302 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
303 memset(pm_qos_req, 0, sizeof(*pm_qos_req));
296} 304}
297EXPORT_SYMBOL_GPL(pm_qos_remove_request); 305EXPORT_SYMBOL_GPL(pm_qos_remove_request);
298 306
@@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
340 348
341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 349 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
342 if (pm_qos_class >= 0) { 350 if (pm_qos_class >= 0) {
343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class, 351 struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req));
344 PM_QOS_DEFAULT_VALUE); 352 if (!req)
353 return -ENOMEM;
354
355 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
356 filp->private_data = req;
345 357
346 if (filp->private_data) 358 if (filp->private_data)
347 return 0; 359 return 0;
@@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
353{ 365{
354 struct pm_qos_request_list *req; 366 struct pm_qos_request_list *req;
355 367
356 req = (struct pm_qos_request_list *)filp->private_data; 368 req = filp->private_data;
357 pm_qos_remove_request(req); 369 pm_qos_remove_request(req);
370 kfree(req);
358 371
359 return 0; 372 return 0;
360} 373}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..6842eeba5879 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -16,13 +16,13 @@
16 * siglock protection since other code may update expiration cache as 16 * siglock protection since other code may update expiration cache as
17 * well. 17 * well.
18 */ 18 */
19void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
20{ 20{
21 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
22 22
23 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&task->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&task->sighand->siglock);
26} 26}
27 27
28static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
232 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{ 234{
235 struct sighand_struct *sighand; 235 struct signal_struct *sig = tsk->signal;
236 struct signal_struct *sig;
237 struct task_struct *t; 236 struct task_struct *t;
238 237
239 *times = INIT_CPUTIME; 238 times->utime = sig->utime;
239 times->stime = sig->stime;
240 times->sum_exec_runtime = sig->sum_sched_runtime;
240 241
241 rcu_read_lock(); 242 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand); 243 /* make sure we can trust tsk->thread_group list */
243 if (!sighand) 244 if (!likely(pid_alive(tsk)))
244 goto out; 245 goto out;
245 246
246 sig = tsk->signal;
247
248 t = tsk; 247 t = tsk;
249 do { 248 do {
250 times->utime = cputime_add(times->utime, t->utime); 249 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime); 250 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime; 251 times->sum_exec_runtime += t->se.sum_exec_runtime;
253 252 } while_each_thread(tsk, t);
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out: 253out:
261 rcu_read_unlock(); 254 rcu_read_unlock();
262} 255}
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1279{ 1272{
1280 struct signal_struct *sig; 1273 struct signal_struct *sig;
1281 1274
1282 /* tsk == current, ensure it is safe to use ->signal/sighand */
1283 if (unlikely(tsk->exit_state))
1284 return 0;
1285
1286 if (!task_cputime_zero(&tsk->cputime_expires)) { 1275 if (!task_cputime_zero(&tsk->cputime_expires)) {
1287 struct task_cputime task_sample = { 1276 struct task_cputime task_sample = {
1288 .utime = tsk->utime, 1277 .utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1298 if (sig->cputimer.running) { 1287 if (sig->cputimer.running) {
1299 struct task_cputime group_sample; 1288 struct task_cputime group_sample;
1300 1289
1301 thread_group_cputimer(tsk, &group_sample); 1290 spin_lock(&sig->cputimer.lock);
1291 group_sample = sig->cputimer.cputime;
1292 spin_unlock(&sig->cputimer.lock);
1293
1302 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1294 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1303 return 1; 1295 return 1;
1304 } 1296 }
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1315{ 1307{
1316 LIST_HEAD(firing); 1308 LIST_HEAD(firing);
1317 struct k_itimer *timer, *next; 1309 struct k_itimer *timer, *next;
1310 unsigned long flags;
1318 1311
1319 BUG_ON(!irqs_disabled()); 1312 BUG_ON(!irqs_disabled());
1320 1313
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1325 if (!fastpath_timer_check(tsk)) 1318 if (!fastpath_timer_check(tsk))
1326 return; 1319 return;
1327 1320
1328 spin_lock(&tsk->sighand->siglock); 1321 if (!lock_task_sighand(tsk, &flags))
1322 return;
1329 /* 1323 /*
1330 * Here we take off tsk->signal->cpu_timers[N] and 1324 * Here we take off tsk->signal->cpu_timers[N] and
1331 * tsk->cpu_timers[N] all the timers that are firing, and 1325 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1347 * that gets the timer lock before we do will give it up and 1341 * that gets the timer lock before we do will give it up and
1348 * spin until we've taken care of that timer below. 1342 * spin until we've taken care of that timer below.
1349 */ 1343 */
1350 spin_unlock(&tsk->sighand->siglock); 1344 unlock_task_sighand(tsk, &flags);
1351 1345
1352 /* 1346 /*
1353 * Now that all the timers on our list have the firing flag, 1347 * Now that all the timers on our list have the firing flag,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad723420acc3..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 562
563 if (copy_to_user(created_timer_id,
564 &new_timer_id, sizeof (new_timer_id))) {
565 error = -EFAULT;
566 goto out;
567 }
568 if (timer_event_spec) { 563 if (timer_event_spec) {
569 if (copy_from_user(&event, timer_event_spec, sizeof (event))) { 564 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
570 error = -EFAULT; 565 error = -EFAULT;
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
590 new_timer->sigq->info.si_tid = new_timer->it_id; 585 new_timer->sigq->info.si_tid = new_timer->it_id;
591 new_timer->sigq->info.si_code = SI_TIMER; 586 new_timer->sigq->info.si_code = SI_TIMER;
592 587
588 if (copy_to_user(created_timer_id,
589 &new_timer_id, sizeof (new_timer_id))) {
590 error = -EFAULT;
591 goto out;
592 }
593
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error) 595 if (error)
595 goto out; 596 goto out;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..c77963938bca 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
277 goto Enable_irqs; 277 goto Enable_irqs;
278 } 278 }
279 279
280 if (hibernation_test(TEST_CORE)) 280 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
281 goto Power_up; 281 goto Power_up;
282 282
283 in_suspend = 1; 283 in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
288 error); 288 error);
289 /* Restore control flow magically appears here */ 289 /* Restore control flow magically appears here */
290 restore_processor_state(); 290 restore_processor_state();
291 if (!in_suspend) 291 if (!in_suspend) {
292 events_check_enabled = false;
292 platform_leave(platform_mode); 293 platform_leave(platform_mode);
294 }
293 295
294 Power_up: 296 Power_up:
295 sysdev_resume(); 297 sysdev_resume();
@@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
328 330
329 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
330 if (error) 332 if (error)
331 return error; 333 goto Close;
332 334
333 /* Preallocate image memory before shutting down devices. */ 335 /* Preallocate image memory before shutting down devices. */
334 error = hibernate_preallocate_memory(); 336 error = hibernate_preallocate_memory();
@@ -336,6 +338,7 @@ int hibernation_snapshot(int platform_mode)
336 goto Close; 338 goto Close;
337 339
338 suspend_console(); 340 suspend_console();
341 hibernation_freeze_swap();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
340 error = dpm_suspend_start(PMSG_FREEZE); 343 error = dpm_suspend_start(PMSG_FREEZE);
341 if (error) 344 if (error)
@@ -511,18 +514,24 @@ int hibernation_platform_enter(void)
511 514
512 local_irq_disable(); 515 local_irq_disable();
513 sysdev_suspend(PMSG_HIBERNATE); 516 sysdev_suspend(PMSG_HIBERNATE);
517 if (!pm_check_wakeup_events()) {
518 error = -EAGAIN;
519 goto Power_up;
520 }
521
514 hibernation_ops->enter(); 522 hibernation_ops->enter();
515 /* We should never get here */ 523 /* We should never get here */
516 while (1); 524 while (1);
517 525
518 /* 526 Power_up:
519 * We don't need to reenable the nonboot CPUs or resume consoles, since 527 sysdev_resume();
520 * the system is going to be halted anyway. 528 local_irq_enable();
521 */ 529 enable_nonboot_cpus();
530
522 Platform_finish: 531 Platform_finish:
523 hibernation_ops->finish(); 532 hibernation_ops->finish();
524 533
525 dpm_suspend_noirq(PMSG_RESTORE); 534 dpm_resume_noirq(PMSG_RESTORE);
526 535
527 Resume_devices: 536 Resume_devices:
528 entering_platform_hibernation = false; 537 entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
204 204
205power_attr(state); 205power_attr(state);
206 206
207#ifdef CONFIG_PM_SLEEP
208/*
209 * The 'wakeup_count' attribute, along with the functions defined in
210 * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
211 * handled in a non-racy way.
212 *
213 * If a wakeup event occurs when the system is in a sleep state, it simply is
214 * woken up. In turn, if an event that would wake the system up from a sleep
215 * state occurs when it is undergoing a transition to that sleep state, the
216 * transition should be aborted. Moreover, if such an event occurs when the
217 * system is in the working state, an attempt to start a transition to the
218 * given sleep state should fail during a certain period after the detection of
219 * the event. Using the 'state' attribute alone is not sufficient to satisfy
220 * these requirements, because a wakeup event may occur exactly when 'state'
221 * is being written to and may be delivered to user space right before it is
222 * frozen, so the event will remain only partially processed until the system is
223 * woken up by another event. In particular, it won't cause the transition to
224 * a sleep state to be aborted.
225 *
226 * This difficulty may be overcome if user space uses 'wakeup_count' before
227 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to
230 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to.
234 */
235
236static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 char *buf)
239{
240 unsigned long val;
241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
243}
244
245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 const char *buf, size_t n)
248{
249 unsigned long val;
250
251 if (sscanf(buf, "%lu", &val) == 1) {
252 if (pm_save_wakeup_count(val))
253 return n;
254 }
255 return -EINVAL;
256}
257
258power_attr(wakeup_count);
259#endif /* CONFIG_PM_SLEEP */
260
207#ifdef CONFIG_PM_TRACE 261#ifdef CONFIG_PM_TRACE
208int pm_trace_enabled; 262int pm_trace_enabled;
209 263
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
236#endif 290#endif
237#ifdef CONFIG_PM_SLEEP 291#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr, 292 &pm_async_attr.attr,
293 &wakeup_count_attr.attr,
239#ifdef CONFIG_PM_DEBUG 294#ifdef CONFIG_PM_DEBUG
240 &pm_test_attr.attr, 295 &pm_test_attr.attr,
241#endif 296#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..028a99598f49 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h>
18 19
19/* 20/*
20 * Timeout for stopping processes 21 * Timeout for stopping processes
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only)
35 struct task_struct *g, *p; 36 struct task_struct *g, *p;
36 unsigned long end_time; 37 unsigned long end_time;
37 unsigned int todo; 38 unsigned int todo;
39 bool wq_busy = false;
38 struct timeval start, end; 40 struct timeval start, end;
39 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
40 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only)
42 do_gettimeofday(&start); 44 do_gettimeofday(&start);
43 45
44 end_time = jiffies + TIMEOUT; 46 end_time = jiffies + TIMEOUT;
47
48 if (!sig_only)
49 freeze_workqueues_begin();
50
45 while (true) { 51 while (true) {
46 todo = 0; 52 todo = 0;
47 read_lock(&tasklist_lock); 53 read_lock(&tasklist_lock);
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only)
63 todo++; 69 todo++;
64 } while_each_thread(g, p); 70 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 71 read_unlock(&tasklist_lock);
72
73 if (!sig_only) {
74 wq_busy = freeze_workqueues_busy();
75 todo += wq_busy;
76 }
77
66 if (!todo || time_after(jiffies, end_time)) 78 if (!todo || time_after(jiffies, end_time))
67 break; 79 break;
68 80
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only)
86 */ 98 */
87 printk("\n"); 99 printk("\n");
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 101 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 102 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy);
104
105 thaw_workqueues();
106
91 read_lock(&tasklist_lock); 107 read_lock(&tasklist_lock);
92 do_each_thread(g, p) { 108 do_each_thread(g, p) {
93 task_lock(p); 109 task_lock(p);
@@ -157,6 +173,7 @@ void thaw_processes(void)
157 oom_killer_enable(); 173 oom_killer_enable();
158 174
159 printk("Restarting tasks ... "); 175 printk("Restarting tasks ... ");
176 thaw_workqueues();
160 thaw_tasks(true); 177 thaw_tasks(true);
161 thaw_tasks(false); 178 thaw_tasks(false);
162 schedule(); 179 schedule();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..5e7edfb05e66 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * This file provides system snapshot/restore functionality for swsusp. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -1086,6 +1086,7 @@ void swsusp_free(void)
1086 buffer = NULL; 1086 buffer = NULL;
1087 alloc_normal = 0; 1087 alloc_normal = 0;
1088 alloc_highmem = 0; 1088 alloc_highmem = 0;
1089 hibernation_thaw_swap();
1089} 1090}
1090 1091
1091/* Helper functions used for the shrinking of memory. */ 1092/* Helper functions used for the shrinking of memory. */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
136 if (suspend_ops->prepare) { 136 if (suspend_ops->prepare) {
137 error = suspend_ops->prepare(); 137 error = suspend_ops->prepare();
138 if (error) 138 if (error)
139 return error; 139 goto Platform_finish;
140 } 140 }
141 141
142 error = dpm_suspend_noirq(PMSG_SUSPEND); 142 error = dpm_suspend_noirq(PMSG_SUSPEND);
143 if (error) { 143 if (error) {
144 printk(KERN_ERR "PM: Some devices failed to power down\n"); 144 printk(KERN_ERR "PM: Some devices failed to power down\n");
145 goto Platfrom_finish; 145 goto Platform_finish;
146 } 146 }
147 147
148 if (suspend_ops->prepare_late) { 148 if (suspend_ops->prepare_late) {
149 error = suspend_ops->prepare_late(); 149 error = suspend_ops->prepare_late();
150 if (error) 150 if (error)
151 goto Power_up_devices; 151 goto Platform_wake;
152 } 152 }
153 153
154 if (suspend_test(TEST_PLATFORM)) 154 if (suspend_test(TEST_PLATFORM))
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
163 163
164 error = sysdev_suspend(PMSG_SUSPEND); 164 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 165 if (!error) {
166 if (!suspend_test(TEST_CORE)) 166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
167 error = suspend_ops->enter(state); 167 error = suspend_ops->enter(state);
168 events_check_enabled = false;
169 }
168 sysdev_resume(); 170 sysdev_resume();
169 } 171 }
170 172
@@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
178 if (suspend_ops->wake) 180 if (suspend_ops->wake)
179 suspend_ops->wake(); 181 suspend_ops->wake();
180 182
181 Power_up_devices:
182 dpm_resume_noirq(PMSG_RESUME); 183 dpm_resume_noirq(PMSG_RESUME);
183 184
184 Platfrom_finish: 185 Platform_finish:
185 if (suspend_ops->finish) 186 if (suspend_ops->finish)
186 suspend_ops->finish(); 187 suspend_ops->finish();
187 188
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..5d0059eed3e4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
4 * This file provides functions for reading the suspend image from 4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition. 5 * and writing it to a swap partition.
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * 9 *
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
@@ -32,7 +32,7 @@
32/* 32/*
33 * The swap map is a data structure used for keeping track of each page 33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page 34 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries. 35 * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the 36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member. 37 * help of the .next_swap member.
38 * 38 *
@@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap)
136{ 136{
137 unsigned long offset; 137 unsigned long offset;
138 138
139 offset = swp_offset(get_swap_page_of_type(swap)); 139 offset = swp_offset(get_swap_for_hibernation(swap));
140 if (offset) { 140 if (offset) {
141 if (swsusp_extents_insert(offset)) 141 if (swsusp_extents_insert(offset))
142 swap_free(swp_entry(swap, offset)); 142 swap_free_for_hibernation(swp_entry(swap, offset));
143 else 143 else
144 return swapdev_block(swap, offset); 144 return swapdev_block(swap, offset);
145 } 145 }
@@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
148 148
149/** 149/**
150 * free_all_swap_pages - free swap pages allocated for saving image data. 150 * free_all_swap_pages - free swap pages allocated for saving image data.
151 * It also frees the extents used to register which swap entres had been 151 * It also frees the extents used to register which swap entries had been
152 * allocated. 152 * allocated.
153 */ 153 */
154 154
@@ -163,7 +163,7 @@ void free_all_swap_pages(int swap)
163 ext = container_of(node, struct swsusp_extent, node); 163 ext = container_of(node, struct swsusp_extent, node);
164 rb_erase(node, &swsusp_extents); 164 rb_erase(node, &swsusp_extents);
165 for (offset = ext->start; offset <= ext->end; offset++) 165 for (offset = ext->start; offset <= ext->end; offset++)
166 swap_free(swp_entry(swap, offset)); 166 swap_free_for_hibernation(swp_entry(swap, offset));
167 167
168 kfree(ext); 168 kfree(ext);
169 } 169 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..8fe465ac008a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,8 @@
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42 44
@@ -985,6 +987,32 @@ void resume_console(void)
985} 987}
986 988
987/** 989/**
990 * console_cpu_notify - print deferred console messages after CPU hotplug
991 * @self: notifier struct
992 * @action: CPU hotplug event
993 * @hcpu: unused
994 *
995 * If printk() is called from a CPU that is not online yet, the messages
996 * will be spooled but will not show up on the console. This function is
997 * called when a new CPU comes online (or fails to come up), and ensures
998 * that any such output gets printed.
999 */
1000static int __cpuinit console_cpu_notify(struct notifier_block *self,
1001 unsigned long action, void *hcpu)
1002{
1003 switch (action) {
1004 case CPU_ONLINE:
1005 case CPU_DEAD:
1006 case CPU_DYING:
1007 case CPU_DOWN_FAILED:
1008 case CPU_UP_CANCELED:
1009 acquire_console_sem();
1010 release_console_sem();
1011 }
1012 return NOTIFY_OK;
1013}
1014
1015/**
988 * acquire_console_sem - lock the console system for exclusive use. 1016 * acquire_console_sem - lock the console system for exclusive use.
989 * 1017 *
990 * Acquires a semaphore which guarantees that the caller has 1018 * Acquires a semaphore which guarantees that the caller has
@@ -1371,7 +1399,7 @@ int unregister_console(struct console *console)
1371} 1399}
1372EXPORT_SYMBOL(unregister_console); 1400EXPORT_SYMBOL(unregister_console);
1373 1401
1374static int __init disable_boot_consoles(void) 1402static int __init printk_late_init(void)
1375{ 1403{
1376 struct console *con; 1404 struct console *con;
1377 1405
@@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void)
1382 unregister_console(con); 1410 unregister_console(con);
1383 } 1411 }
1384 } 1412 }
1413 hotcpu_notifier(console_cpu_notify, 0);
1385 return 0; 1414 return 0;
1386} 1415}
1387late_initcall(disable_boot_consoles); 1416late_initcall(printk_late_init);
1388 1417
1389#if defined CONFIG_PRINTK 1418#if defined CONFIG_PRINTK
1390 1419
@@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1520 chars = logged_chars; 1549 chars = logged_chars;
1521 spin_unlock_irqrestore(&logbuf_lock, flags); 1550 spin_unlock_irqrestore(&logbuf_lock, flags);
1522 1551
1523 if (logged_chars > end) { 1552 if (chars > end) {
1524 s1 = log_buf + log_buf_len - logged_chars + end; 1553 s1 = log_buf + log_buf_len - chars + end;
1525 l1 = logged_chars - end; 1554 l1 = chars - end;
1526 1555
1527 s2 = log_buf; 1556 s2 = log_buf;
1528 l2 = end; 1557 l2 = end;
@@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1530 s1 = ""; 1559 s1 = "";
1531 l1 = 0; 1560 l1 = 0;
1532 1561
1533 s2 = log_buf + end - logged_chars; 1562 s2 = log_buf + end - chars;
1534 l2 = logged_chars; 1563 l2 = chars;
1535 } 1564 }
1536 1565
1537 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1566 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
7 7
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{ 11{
16 if (start >= end) 12 if (start >= end)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void)
114} 114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); 115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */ 116#endif /* #ifdef CONFIG_PROVE_RCU */
117
118#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
119static inline void debug_init_rcu_head(struct rcu_head *head)
120{
121 debug_object_init(head, &rcuhead_debug_descr);
122}
123
124static inline void debug_rcu_head_free(struct rcu_head *head)
125{
126 debug_object_free(head, &rcuhead_debug_descr);
127}
128
129/*
130 * fixup_init is called when:
131 * - an active object is initialized
132 */
133static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
134{
135 struct rcu_head *head = addr;
136
137 switch (state) {
138 case ODEBUG_STATE_ACTIVE:
139 /*
140 * Ensure that queued callbacks are all executed.
141 * If we detect that we are nested in a RCU read-side critical
142 * section, we should simply fail, otherwise we would deadlock.
143 */
144 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
145 irqs_disabled()) {
146 WARN_ON(1);
147 return 0;
148 }
149 rcu_barrier();
150 rcu_barrier_sched();
151 rcu_barrier_bh();
152 debug_object_init(head, &rcuhead_debug_descr);
153 return 1;
154 default:
155 return 0;
156 }
157}
158
159/*
160 * fixup_activate is called when:
161 * - an active object is activated
162 * - an unknown object is activated (might be a statically initialized object)
163 * Activation is performed internally by call_rcu().
164 */
165static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
166{
167 struct rcu_head *head = addr;
168
169 switch (state) {
170
171 case ODEBUG_STATE_NOTAVAILABLE:
172 /*
173 * This is not really a fixup. We just make sure that it is
174 * tracked in the object tracker.
175 */
176 debug_object_init(head, &rcuhead_debug_descr);
177 debug_object_activate(head, &rcuhead_debug_descr);
178 return 0;
179
180 case ODEBUG_STATE_ACTIVE:
181 /*
182 * Ensure that queued callbacks are all executed.
183 * If we detect that we are nested in a RCU read-side critical
184 * section, we should simply fail, otherwise we would deadlock.
185 */
186 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
187 irqs_disabled()) {
188 WARN_ON(1);
189 return 0;
190 }
191 rcu_barrier();
192 rcu_barrier_sched();
193 rcu_barrier_bh();
194 debug_object_activate(head, &rcuhead_debug_descr);
195 return 1;
196 default:
197 return 0;
198 }
199}
200
201/*
202 * fixup_free is called when:
203 * - an active object is freed
204 */
205static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
206{
207 struct rcu_head *head = addr;
208
209 switch (state) {
210 case ODEBUG_STATE_ACTIVE:
211 /*
212 * Ensure that queued callbacks are all executed.
213 * If we detect that we are nested in a RCU read-side critical
214 * section, we should simply fail, otherwise we would deadlock.
215 */
216#ifndef CONFIG_PREEMPT
217 WARN_ON(1);
218 return 0;
219#else
220 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
221 irqs_disabled()) {
222 WARN_ON(1);
223 return 0;
224 }
225 rcu_barrier();
226 rcu_barrier_sched();
227 rcu_barrier_bh();
228 debug_object_free(head, &rcuhead_debug_descr);
229 return 1;
230#endif
231 default:
232 return 0;
233 }
234}
235
236/**
237 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
238 * @head: pointer to rcu_head structure to be initialized
239 *
240 * This function informs debugobjects of a new rcu_head structure that
241 * has been allocated as an auto variable on the stack. This function
242 * is not required for rcu_head structures that are statically defined or
243 * that are dynamically allocated on the heap. This function has no
244 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
245 */
246void init_rcu_head_on_stack(struct rcu_head *head)
247{
248 debug_object_init_on_stack(head, &rcuhead_debug_descr);
249}
250EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
251
252/**
253 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
254 * @head: pointer to rcu_head structure to be destroyed
255 *
256 * This function informs debugobjects that an on-stack rcu_head structure
257 * is about to go out of scope. As with init_rcu_head_on_stack(), this
258 * function is not required for rcu_head structures that are statically
259 * defined or that are dynamically allocated on the heap. Also as with
260 * init_rcu_head_on_stack(), this function has no effect for
261 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
262 */
263void destroy_rcu_head_on_stack(struct rcu_head *head)
264{
265 debug_object_free(head, &rcuhead_debug_descr);
266}
267EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
268
269struct debug_obj_descr rcuhead_debug_descr = {
270 .name = "rcu_head",
271 .fixup_init = rcuhead_fixup_init,
272 .fixup_activate = rcuhead_fixup_activate,
273 .fixup_free = rcuhead_fixup_free,
274};
275EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
276#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 while (list) { 169 while (list) {
170 next = list->next; 170 next = list->next;
171 prefetch(next); 171 prefetch(next);
172 debug_rcu_head_unqueue(list);
172 list->func(list); 173 list->func(list);
173 list = next; 174 list = next;
174 } 175 }
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
211{ 212{
212 unsigned long flags; 213 unsigned long flags;
213 214
215 debug_rcu_head_queue(head);
214 head->func = func; 216 head->func = func;
215 head->next = NULL; 217 head->next = NULL;
216 218
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
239rcu_random(struct rcu_random_state *rrsp) 239rcu_random(struct rcu_random_state *rrsp)
240{ 240{
241 if (--rrsp->rrs_count < 0) { 241 if (--rrsp->rrs_count < 0) {
242 rrsp->rrs_state += 242 rrsp->rrs_state += (unsigned long)local_clock();
243 (unsigned long)cpu_clock(raw_smp_processor_id());
244 rrsp->rrs_count = RCU_RANDOM_REFRESH; 243 rrsp->rrs_count = RCU_RANDOM_REFRESH;
245 } 244 }
246 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 245 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1112 while (list) { 1112 while (list) {
1113 next = list->next; 1113 next = list->next;
1114 prefetch(next); 1114 prefetch(next);
1115 debug_rcu_head_unqueue(list);
1115 list->func(list); 1116 list->func(list);
1116 list = next; 1117 list = next;
1117 if (++count >= rdp->blimit) 1118 if (++count >= rdp->blimit)
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1388 unsigned long flags; 1389 unsigned long flags;
1389 struct rcu_data *rdp; 1390 struct rcu_data *rdp;
1390 1391
1392 debug_rcu_head_queue(head);
1391 head->func = func; 1393 head->func = func;
1392 head->next = NULL; 1394 head->next = NULL;
1393 1395
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..41541d79e3c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
456 unsigned long nr_running; 457 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 458 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 459 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
460 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 461#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 462 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 463 unsigned char nohz_balance_kick;
462#endif 464#endif
463 unsigned int skip_clock_update; 465 unsigned int skip_clock_update;
464 466
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
1193 1195
1194#ifdef CONFIG_NO_HZ 1196#ifdef CONFIG_NO_HZ
1195/* 1197/*
1198 * In the semi idle case, use the nearest busy cpu for migrating timers
1199 * from an idle cpu. This is good for power-savings.
1200 *
1201 * We don't do similar optimization for completely idle system, as
1202 * selecting an idle cpu will add more delays to the timers than intended
1203 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1204 */
1205int get_nohz_timer_target(void)
1206{
1207 int cpu = smp_processor_id();
1208 int i;
1209 struct sched_domain *sd;
1210
1211 for_each_domain(cpu, sd) {
1212 for_each_cpu(i, sched_domain_span(sd))
1213 if (!idle_cpu(i))
1214 return i;
1215 }
1216 return cpu;
1217}
1218/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1219 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1220 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1221 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1255 smp_send_reschedule(cpu);
1233} 1256}
1234 1257
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1258#endif /* CONFIG_NO_HZ */
1246 1259
1247static u64 sched_avg_period(void) 1260static u64 sched_avg_period(void)
@@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1665 if (root_task_group_empty())
1653 return; 1666 return;
1654 1667
1655 now = cpu_clock(raw_smp_processor_id()); 1668 now = local_clock();
1656 elapsed = now - sd->last_update; 1669 elapsed = now - sd->last_update;
1657 1670
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1671 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1818static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1819static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1820static int get_update_sysctl_factor(void);
1821static void update_cpu_load(struct rq *this_rq);
1808 1822
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1824{
@@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2281}
2268#endif 2282#endif
2269 2283
2270/*** 2284static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2285 bool is_sync, bool is_migrate, bool is_local,
2286 unsigned long en_flags)
2287{
2288 schedstat_inc(p, se.statistics.nr_wakeups);
2289 if (is_sync)
2290 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2291 if (is_migrate)
2292 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2293 if (is_local)
2294 schedstat_inc(p, se.statistics.nr_wakeups_local);
2295 else
2296 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2297
2298 activate_task(rq, p, en_flags);
2299}
2300
2301static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2302 int wake_flags, bool success)
2303{
2304 trace_sched_wakeup(p, success);
2305 check_preempt_curr(rq, p, wake_flags);
2306
2307 p->state = TASK_RUNNING;
2308#ifdef CONFIG_SMP
2309 if (p->sched_class->task_woken)
2310 p->sched_class->task_woken(rq, p);
2311
2312 if (unlikely(rq->idle_stamp)) {
2313 u64 delta = rq->clock - rq->idle_stamp;
2314 u64 max = 2*sysctl_sched_migration_cost;
2315
2316 if (delta > max)
2317 rq->avg_idle = max;
2318 else
2319 update_avg(&rq->avg_idle, delta);
2320 rq->idle_stamp = 0;
2321 }
2322#endif
2323 /* if a worker is waking up, notify workqueue */
2324 if ((p->flags & PF_WQ_WORKER) && success)
2325 wq_worker_waking_up(p, cpu_of(rq));
2326}
2327
2328/**
2271 * try_to_wake_up - wake up a thread 2329 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2330 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2331 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2332 * @wake_flags: wake modifier flags (WF_*)
2275 * 2333 *
2276 * Put it on the run-queue if it's not already there. The "current" 2334 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2335 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2337 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2338 * runnable without the overhead of this.
2281 * 2339 *
2282 * returns failure only if the task is already active. 2340 * Returns %true if @p was woken up, %false if it was already running
2341 * or @state didn't match @p's state.
2283 */ 2342 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2343static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2344 int wake_flags)
@@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2418
2360out_activate: 2419out_activate:
2361#endif /* CONFIG_SMP */ 2420#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2421 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2422 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2423 success = 1;
2373
2374out_running: 2424out_running:
2375 trace_sched_wakeup(p, success); 2425 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2426out:
2395 task_rq_unlock(rq, &flags); 2427 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2428 put_cpu();
@@ -2399,6 +2431,37 @@ out:
2399} 2431}
2400 2432
2401/** 2433/**
2434 * try_to_wake_up_local - try to wake up a local task with rq lock held
2435 * @p: the thread to be awakened
2436 *
2437 * Put @p on the run-queue if it's not already there. The caller must
2438 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2439 * the current task. this_rq() stays locked over invocation.
2440 */
2441static void try_to_wake_up_local(struct task_struct *p)
2442{
2443 struct rq *rq = task_rq(p);
2444 bool success = false;
2445
2446 BUG_ON(rq != this_rq());
2447 BUG_ON(p == current);
2448 lockdep_assert_held(&rq->lock);
2449
2450 if (!(p->state & TASK_NORMAL))
2451 return;
2452
2453 if (!p->se.on_rq) {
2454 if (likely(!task_running(rq, p))) {
2455 schedstat_inc(rq, ttwu_count);
2456 schedstat_inc(rq, ttwu_local);
2457 }
2458 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2459 success = true;
2460 }
2461 ttwu_post_activation(p, rq, 0, success);
2462}
2463
2464/**
2402 * wake_up_process - Wake up a specific process 2465 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2466 * @p: The process to be woken up.
2404 * 2467 *
@@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3075}
3013 3076
3014/* 3077/*
3078 * The exact cpuload at various idx values, calculated at every tick would be
3079 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3080 *
3081 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3082 * on nth tick when cpu may be busy, then we have:
3083 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3084 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3085 *
3086 * decay_load_missed() below does efficient calculation of
3087 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3088 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3089 *
3090 * The calculation is approximated on a 128 point scale.
3091 * degrade_zero_ticks is the number of ticks after which load at any
3092 * particular idx is approximated to be zero.
3093 * degrade_factor is a precomputed table, a row for each load idx.
3094 * Each column corresponds to degradation factor for a power of two ticks,
3095 * based on 128 point scale.
3096 * Example:
3097 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3098 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3099 *
3100 * With this power of 2 load factors, we can degrade the load n times
3101 * by looking at 1 bits in n and doing as many mult/shift instead of
3102 * n mult/shifts needed by the exact degradation.
3103 */
3104#define DEGRADE_SHIFT 7
3105static const unsigned char
3106 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3107static const unsigned char
3108 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3109 {0, 0, 0, 0, 0, 0, 0, 0},
3110 {64, 32, 8, 0, 0, 0, 0, 0},
3111 {96, 72, 40, 12, 1, 0, 0},
3112 {112, 98, 75, 43, 15, 1, 0},
3113 {120, 112, 98, 76, 45, 16, 2} };
3114
3115/*
3116 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3117 * would be when CPU is idle and so we just decay the old load without
3118 * adding any new load.
3119 */
3120static unsigned long
3121decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3122{
3123 int j = 0;
3124
3125 if (!missed_updates)
3126 return load;
3127
3128 if (missed_updates >= degrade_zero_ticks[idx])
3129 return 0;
3130
3131 if (idx == 1)
3132 return load >> missed_updates;
3133
3134 while (missed_updates) {
3135 if (missed_updates % 2)
3136 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3137
3138 missed_updates >>= 1;
3139 j++;
3140 }
3141 return load;
3142}
3143
3144/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3145 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3146 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3147 * every tick. We fix it up based on jiffies.
3017 */ 3148 */
3018static void update_cpu_load(struct rq *this_rq) 3149static void update_cpu_load(struct rq *this_rq)
3019{ 3150{
3020 unsigned long this_load = this_rq->load.weight; 3151 unsigned long this_load = this_rq->load.weight;
3152 unsigned long curr_jiffies = jiffies;
3153 unsigned long pending_updates;
3021 int i, scale; 3154 int i, scale;
3022 3155
3023 this_rq->nr_load_updates++; 3156 this_rq->nr_load_updates++;
3024 3157
3158 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3159 if (curr_jiffies == this_rq->last_load_update_tick)
3160 return;
3161
3162 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3163 this_rq->last_load_update_tick = curr_jiffies;
3164
3025 /* Update our load: */ 3165 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3166 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3167 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3168 unsigned long old_load, new_load;
3028 3169
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3170 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3171
3031 old_load = this_rq->cpu_load[i]; 3172 old_load = this_rq->cpu_load[i];
3173 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3174 new_load = this_load;
3033 /* 3175 /*
3034 * Round up the averaging division if load is increasing. This 3176 * Round up the averaging division if load is increasing. This
@@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3178 * example.
3037 */ 3179 */
3038 if (new_load > old_load) 3180 if (new_load > old_load)
3039 new_load += scale-1; 3181 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3182
3183 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3184 }
3185}
3186
3187static void update_cpu_load_active(struct rq *this_rq)
3188{
3189 update_cpu_load(this_rq);
3042 3190
3043 calc_load_account_active(this_rq); 3191 calc_load_account_active(this_rq);
3044} 3192}
@@ -3426,7 +3574,7 @@ void scheduler_tick(void)
3426 3574
3427 raw_spin_lock(&rq->lock); 3575 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3576 update_rq_clock(rq);
3429 update_cpu_load(rq); 3577 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3578 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3579 raw_spin_unlock(&rq->lock);
3432 3580
@@ -3598,7 +3746,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3746 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3747 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3748 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3749
3603 release_kernel_lock(prev); 3750 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3751need_resched_nonpreemptible:
@@ -3611,11 +3758,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3758 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3759 clear_tsk_need_resched(prev);
3613 3760
3761 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3762 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3763 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3764 prev->state = TASK_RUNNING;
3617 else 3765 } else {
3766 /*
3767 * If a worker is going to sleep, notify and
3768 * ask workqueue whether it wants to wake up a
3769 * task to maintain concurrency. If so, wake
3770 * up the task.
3771 */
3772 if (prev->flags & PF_WQ_WORKER) {
3773 struct task_struct *to_wakeup;
3774
3775 to_wakeup = wq_worker_sleeping(prev, cpu);
3776 if (to_wakeup)
3777 try_to_wake_up_local(to_wakeup);
3778 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3779 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3780 }
3619 switch_count = &prev->nvcsw; 3781 switch_count = &prev->nvcsw;
3620 } 3782 }
3621 3783
@@ -3637,8 +3799,10 @@ need_resched_nonpreemptible:
3637 3799
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3800 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3801 /*
3640 * the context switch might have flipped the stack from under 3802 * The context switch has flipped the stack from under us
3641 * us, hence refresh the local variables. 3803 * and restored the local variables which were saved when
3804 * this task called schedule() in the past. prev == current
3805 * is still correct, but it can be moved to another cpu/rq.
3642 */ 3806 */
3643 cpu = smp_processor_id(); 3807 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3808 rq = cpu_rq(cpu);
@@ -3647,11 +3811,8 @@ need_resched_nonpreemptible:
3647 3811
3648 post_schedule(rq); 3812 post_schedule(rq);
3649 3813
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3814 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3815 goto need_resched_nonpreemptible;
3654 }
3655 3816
3656 preempt_enable_no_resched(); 3817 preempt_enable_no_resched();
3657 if (need_resched()) 3818 if (need_resched())
@@ -3726,7 +3887,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 3887 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 3888 * occur there and call schedule directly.
3728 */ 3889 */
3729asmlinkage void __sched preempt_schedule(void) 3890asmlinkage void __sched notrace preempt_schedule(void)
3730{ 3891{
3731 struct thread_info *ti = current_thread_info(); 3892 struct thread_info *ti = current_thread_info();
3732 3893
@@ -3738,9 +3899,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 3899 return;
3739 3900
3740 do { 3901 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 3902 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 3903 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 3904 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 3905
3745 /* 3906 /*
3746 * Check again in case we missed a preemption opportunity 3907 * Check again in case we missed a preemption opportunity
@@ -4441,12 +4602,8 @@ recheck:
4441 */ 4602 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4603 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4604 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4605 unsigned long rlim_rtprio =
4445 4606 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4607
4451 /* can't set/change the rt policy */ 4608 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4609 if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 5973 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 5974static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 5975 .notifier_call = migration_call,
5819 .priority = 10 5976 .priority = CPU_PRI_MIGRATION,
5820}; 5977};
5821 5978
5979static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5980 unsigned long action, void *hcpu)
5981{
5982 switch (action & ~CPU_TASKS_FROZEN) {
5983 case CPU_ONLINE:
5984 case CPU_DOWN_FAILED:
5985 set_cpu_active((long)hcpu, true);
5986 return NOTIFY_OK;
5987 default:
5988 return NOTIFY_DONE;
5989 }
5990}
5991
5992static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5993 unsigned long action, void *hcpu)
5994{
5995 switch (action & ~CPU_TASKS_FROZEN) {
5996 case CPU_DOWN_PREPARE:
5997 set_cpu_active((long)hcpu, false);
5998 return NOTIFY_OK;
5999 default:
6000 return NOTIFY_DONE;
6001 }
6002}
6003
5822static int __init migration_init(void) 6004static int __init migration_init(void)
5823{ 6005{
5824 void *cpu = (void *)(long)smp_processor_id(); 6006 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6007 int err;
5826 6008
5827 /* Start one for the boot CPU: */ 6009 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6010 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6011 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6012 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6013 register_cpu_notifier(&migration_notifier);
5832 6014
6015 /* Register cpu active notifiers */
6016 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6017 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6018
5833 return 0; 6019 return 0;
5834} 6020}
5835early_initcall(migration_init); 6021early_initcall(migration_init);
@@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6250 free_rootdomain(old_rd);
6065} 6251}
6066 6252
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6253static int init_rootdomain(struct root_domain *rd)
6068{ 6254{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6255 memset(rd, 0, sizeof(*rd));
6072 6256
6073 if (bootmem) 6257 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6258 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6259 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6260 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6261 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6262 goto free_online;
6082 6263
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6264 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6265 goto free_rto_mask;
6085 return 0; 6266 return 0;
6086 6267
@@ -6096,7 +6277,7 @@ out:
6096 6277
6097static void init_defrootdomain(void) 6278static void init_defrootdomain(void)
6098{ 6279{
6099 init_rootdomain(&def_root_domain, true); 6280 init_rootdomain(&def_root_domain);
6100 6281
6101 atomic_set(&def_root_domain.refcount, 1); 6282 atomic_set(&def_root_domain.refcount, 1);
6102} 6283}
@@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6290 if (!rd)
6110 return NULL; 6291 return NULL;
6111 6292
6112 if (init_rootdomain(rd, false) != 0) { 6293 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6294 kfree(rd);
6114 return NULL; 6295 return NULL;
6115 } 6296 }
@@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7469}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7470#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7471
7291#ifndef CONFIG_CPUSETS
7292/* 7472/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7473 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7474 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7475 * around partition_sched_domains().
7295 */ 7476 */
7296static int update_sched_domains(struct notifier_block *nfb, 7477static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7478 void *hcpu)
7298{ 7479{
7299 switch (action) { 7480 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7481 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7482 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7483 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7484 return NOTIFY_OK;
7485 default:
7486 return NOTIFY_DONE;
7487 }
7488}
7308 7489
7490static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7491 void *hcpu)
7492{
7493 switch (action & ~CPU_TASKS_FROZEN) {
7494 case CPU_DOWN_PREPARE:
7495 cpuset_update_active_cpus();
7496 return NOTIFY_OK;
7309 default: 7497 default:
7310 return NOTIFY_DONE; 7498 return NOTIFY_DONE;
7311 } 7499 }
7312} 7500}
7313#endif
7314 7501
7315static int update_runtime(struct notifier_block *nfb, 7502static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7503 unsigned long action, void *hcpu)
@@ -7356,10 +7543,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7543 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7544 put_online_cpus();
7358 7545
7359#ifndef CONFIG_CPUSETS 7546 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7547 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7548
7364 /* RT runtime code needs to handle some hotplug events */ 7549 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7550 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7789,9 @@ void __init sched_init(void)
7604 7789
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7790 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 7791 rq->cpu_load[j] = 0;
7792
7793 rq->last_load_update_tick = jiffies;
7794
7607#ifdef CONFIG_SMP 7795#ifdef CONFIG_SMP
7608 rq->sd = NULL; 7796 rq->sd = NULL;
7609 rq->rd = NULL; 7797 rq->rd = NULL;
@@ -7617,6 +7805,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 7805 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 7806 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 7807 rq_attach_root(rq, &def_root_domain);
7808#ifdef CONFIG_NO_HZ
7809 rq->nohz_balance_kick = 0;
7810 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7811#endif
7620#endif 7812#endif
7621 init_rq_hrtick(rq); 7813 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 7814 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7853,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7853 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 7854#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 7855#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7856 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7857 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7858 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7859 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7860 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 7861#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 7862 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 7863 if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42 * - GTOD (clock monotonic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and keeping it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
170 return val; 206 return val;
171} 207}
172 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
173u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
174{ 215{
175 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
237} 278}
238EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
239 280
240unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
241{ 292{
242 unsigned long long clock; 293 u64 clock;
243 unsigned long flags; 294 unsigned long flags;
244 295
245 local_irq_save(flags); 296 local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
249 return clock; 300 return clock;
250} 301}
251 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
252#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
253 323
254void sched_clock_init(void) 324void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
264 return sched_clock(); 334 return sched_clock();
265} 335}
266 336
267 337u64 cpu_clock(int cpu)
268unsigned long long cpu_clock(int cpu)
269{ 338{
270 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
271} 340}
272 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
273#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
274 348
275EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
332 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
335 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
336 P(sysctl_sched_features); 336 P(sysctl_sched_features);
337#undef PN 337#undef PN
338#undef P 338#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..806d1b227a21 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2287 unsigned long power = SCHED_LOAD_SCALE; 2287 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups; 2288 struct sched_group *sdg = sd->groups;
2289 2289
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2290 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER)) 2291 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu); 2292 power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2303 power >>= SCHED_LOAD_SHIFT; 2296 power >>= SCHED_LOAD_SHIFT;
2304 } 2297 }
2305 2298
2299 sdg->cpu_power_orig = power;
2300
2301 if (sched_feat(ARCH_POWER))
2302 power *= arch_scale_freq_power(sd, cpu);
2303 else
2304 power *= default_scale_freq_power(sd, cpu);
2305
2306 power >>= SCHED_LOAD_SHIFT;
2307
2306 power *= scale_rt_power(cpu); 2308 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT; 2309 power >>= SCHED_LOAD_SHIFT;
2308 2310
@@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2335 sdg->cpu_power = power; 2337 sdg->cpu_power = power;
2336} 2338}
2337 2339
2340/*
2341 * Try and fix up capacity for tiny siblings, this is needed when
2342 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2343 * which on its own isn't powerful enough.
2344 *
2345 * See update_sd_pick_busiest() and check_asym_packing().
2346 */
2347static inline int
2348fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2349{
2350 /*
2351 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2352 */
2353 if (sd->level != SD_LV_SIBLING)
2354 return 0;
2355
2356 /*
2357 * If ~90% of the cpu_power is still there, we're good.
2358 */
2359 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2360 return 1;
2361
2362 return 0;
2363}
2364
2338/** 2365/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2366 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated. 2367 * @sd: The sched_domain whose statistics are to be updated.
@@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2400 * domains. In the newly idle case, we will allow all the cpu's 2427 * domains. In the newly idle case, we will allow all the cpu's
2401 * to do the newly idle load balance. 2428 * to do the newly idle load balance.
2402 */ 2429 */
2403 if (idle != CPU_NEWLY_IDLE && local_group && 2430 if (idle != CPU_NEWLY_IDLE && local_group) {
2404 balance_cpu != this_cpu) { 2431 if (balance_cpu != this_cpu) {
2405 *balance = 0; 2432 *balance = 0;
2406 return; 2433 return;
2434 }
2435 update_group_power(sd, this_cpu);
2407 } 2436 }
2408 2437
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */ 2438 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2439 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413 2440
@@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2428 2455
2429 sgs->group_capacity = 2456 sgs->group_capacity =
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2457 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2458 if (!sgs->group_capacity)
2459 sgs->group_capacity = fix_small_capacity(sd, group);
2460}
2461
2462/**
2463 * update_sd_pick_busiest - return 1 on busiest group
2464 * @sd: sched_domain whose statistics are to be checked
2465 * @sds: sched_domain statistics
2466 * @sg: sched_group candidate to be checked for being the busiest
2467 * @sgs: sched_group statistics
2468 * @this_cpu: the current cpu
2469 *
2470 * Determine if @sg is a busier group than the previously selected
2471 * busiest group.
2472 */
2473static bool update_sd_pick_busiest(struct sched_domain *sd,
2474 struct sd_lb_stats *sds,
2475 struct sched_group *sg,
2476 struct sg_lb_stats *sgs,
2477 int this_cpu)
2478{
2479 if (sgs->avg_load <= sds->max_load)
2480 return false;
2481
2482 if (sgs->sum_nr_running > sgs->group_capacity)
2483 return true;
2484
2485 if (sgs->group_imb)
2486 return true;
2487
2488 /*
2489 * ASYM_PACKING needs to move all the work to the lowest
2490 * numbered CPUs in the group, therefore mark all groups
2491 * higher than ourself as busy.
2492 */
2493 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2494 this_cpu < group_first_cpu(sg)) {
2495 if (!sds->busiest)
2496 return true;
2497
2498 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2499 return true;
2500 }
2501
2502 return false;
2431} 2503}
2432 2504
2433/** 2505/**
@@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2435 * @sd: sched_domain whose statistics are to be updated. 2507 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed. 2508 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu 2509 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group. 2510 * @sd_idle: Idle status of the sched_domain containing sg.
2439 * @cpus: Set of cpus considered for load balancing. 2511 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance. 2512 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain. 2513 * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2446 struct sd_lb_stats *sds) 2518 struct sd_lb_stats *sds)
2447{ 2519{
2448 struct sched_domain *child = sd->child; 2520 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups; 2521 struct sched_group *sg = sd->groups;
2450 struct sg_lb_stats sgs; 2522 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0; 2523 int load_idx, prefer_sibling = 0;
2452 2524
@@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2459 do { 2531 do {
2460 int local_group; 2532 int local_group;
2461 2533
2462 local_group = cpumask_test_cpu(this_cpu, 2534 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs)); 2535 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2536 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs); 2537 local_group, cpus, balance, &sgs);
2467 2538
2468 if (local_group && !(*balance)) 2539 if (local_group && !(*balance))
2469 return; 2540 return;
2470 2541
2471 sds->total_load += sgs.group_load; 2542 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power; 2543 sds->total_pwr += sg->cpu_power;
2473 2544
2474 /* 2545 /*
2475 * In case the child domain prefers tasks go to siblings 2546 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try 2547 * first, lower the sg capacity to one so that we'll try
2477 * and move all the excess tasks away. 2548 * and move all the excess tasks away.
2478 */ 2549 */
2479 if (prefer_sibling) 2550 if (prefer_sibling)
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2481 2552
2482 if (local_group) { 2553 if (local_group) {
2483 sds->this_load = sgs.avg_load; 2554 sds->this_load = sgs.avg_load;
2484 sds->this = group; 2555 sds->this = sg;
2485 sds->this_nr_running = sgs.sum_nr_running; 2556 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load; 2557 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load && 2558 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2488 (sgs.sum_nr_running > sgs.group_capacity ||
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load; 2559 sds->max_load = sgs.avg_load;
2491 sds->busiest = group; 2560 sds->busiest = sg;
2492 sds->busiest_nr_running = sgs.sum_nr_running; 2561 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity; 2562 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load; 2563 sds->busiest_load_per_task = sgs.sum_weighted_load;
2495 sds->group_imb = sgs.group_imb; 2564 sds->group_imb = sgs.group_imb;
2496 } 2565 }
2497 2566
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2567 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2499 group = group->next; 2568 sg = sg->next;
2500 } while (group != sd->groups); 2569 } while (sg != sd->groups);
2570}
2571
2572int __weak arch_sd_sibling_asym_packing(void)
2573{
2574 return 0*SD_ASYM_PACKING;
2575}
2576
2577/**
2578 * check_asym_packing - Check to see if the group is packed into the
2579 * sched domain.
2580 *
2581 * This is primarily intended to be used at the sibling level. Some
2582 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2583 * case of POWER7, it can move to lower SMT modes only when higher
2584 * threads are idle. When in lower SMT modes, the threads will
2585 * perform better since they share less core resources. Hence when we
2586 * have idle threads, we want them to be the higher ones.
2587 *
2588 * This packing function is run on idle threads. It checks to see if
2589 * the busiest CPU in this domain (core in the P7 case) has a higher
2590 * CPU number than the packing function is being run on. Here we are
2591 * assuming lower CPU number will be equivalent to a lower SMT thread
2592 * number.
2593 *
2594 * Returns 1 when packing is required and a task should be moved to
2595 * this CPU. The amount of the imbalance is returned in *imbalance.
2596 *
2597 * @sd: The sched_domain whose packing is to be checked.
2598 * @sds: Statistics of the sched_domain which is to be packed
2599 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2600 * @imbalance: returns amount of imbalance due to packing.
2601 */
2602static int check_asym_packing(struct sched_domain *sd,
2603 struct sd_lb_stats *sds,
2604 int this_cpu, unsigned long *imbalance)
2605{
2606 int busiest_cpu;
2607
2608 if (!(sd->flags & SD_ASYM_PACKING))
2609 return 0;
2610
2611 if (!sds->busiest)
2612 return 0;
2613
2614 busiest_cpu = group_first_cpu(sds->busiest);
2615 if (this_cpu > busiest_cpu)
2616 return 0;
2617
2618 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2619 SCHED_LOAD_SCALE);
2620 return 1;
2501} 2621}
2502 2622
2503/** 2623/**
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2692 if (!(*balance)) 2812 if (!(*balance))
2693 goto ret; 2813 goto ret;
2694 2814
2815 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2816 check_asym_packing(sd, &sds, this_cpu, imbalance))
2817 return sds.busiest;
2818
2695 if (!sds.busiest || sds.busiest_nr_running == 0) 2819 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced; 2820 goto out_balanced;
2697 2821
@@ -2726,8 +2850,9 @@ ret:
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2850 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */ 2851 */
2728static struct rq * 2852static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2853find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2730 unsigned long imbalance, const struct cpumask *cpus) 2854 enum cpu_idle_type idle, unsigned long imbalance,
2855 const struct cpumask *cpus)
2731{ 2856{
2732 struct rq *busiest = NULL, *rq; 2857 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0; 2858 unsigned long max_load = 0;
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2863 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl; 2864 unsigned long wl;
2740 2865
2866 if (!capacity)
2867 capacity = fix_small_capacity(sd, group);
2868
2741 if (!cpumask_test_cpu(i, cpus)) 2869 if (!cpumask_test_cpu(i, cpus))
2742 continue; 2870 continue;
2743 2871
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2777/* Working cpumask for load_balance and load_balance_newidle. */ 2905/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2906static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779 2907
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2908static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2909 int busiest_cpu, int this_cpu)
2781{ 2910{
2782 if (idle == CPU_NEWLY_IDLE) { 2911 if (idle == CPU_NEWLY_IDLE) {
2912
2913 /*
2914 * ASYM_PACKING needs to force migrate tasks from busy but
2915 * higher numbered CPUs in order to pack all tasks in the
2916 * lowest numbered CPUs.
2917 */
2918 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2919 return 1;
2920
2783 /* 2921 /*
2784 * The only task running in a non-idle cpu can be moved to this 2922 * The only task running in a non-idle cpu can be moved to this
2785 * cpu in an attempt to completely freeup the other CPU 2923 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2992,7 @@ redo:
2854 goto out_balanced; 2992 goto out_balanced;
2855 } 2993 }
2856 2994
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus); 2995 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2858 if (!busiest) { 2996 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]); 2997 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced; 2998 goto out_balanced;
@@ -2898,7 +3036,8 @@ redo:
2898 schedstat_inc(sd, lb_failed[idle]); 3036 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++; 3037 sd->nr_balance_failed++;
2900 3038
2901 if (need_active_balance(sd, sd_idle, idle)) { 3039 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3040 this_cpu)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags); 3041 raw_spin_lock_irqsave(&busiest->lock, flags);
2903 3042
2904 /* don't kick the active_load_balance_cpu_stop, 3043 /* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3232,40 @@ out_unlock:
3093} 3232}
3094 3233
3095#ifdef CONFIG_NO_HZ 3234#ifdef CONFIG_NO_HZ
3235
3236static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3237
3238static void trigger_sched_softirq(void *data)
3239{
3240 raise_softirq_irqoff(SCHED_SOFTIRQ);
3241}
3242
3243static inline void init_sched_softirq_csd(struct call_single_data *csd)
3244{
3245 csd->func = trigger_sched_softirq;
3246 csd->info = NULL;
3247 csd->flags = 0;
3248 csd->priv = 0;
3249}
3250
3251/*
3252 * idle load balancing details
3253 * - One of the idle CPUs nominates itself as idle load_balancer, while
3254 * entering idle.
3255 * - This idle load balancer CPU will also go into tickless mode when
3256 * it is idle, just like all other idle CPUs
3257 * - When one of the busy CPUs notice that there may be an idle rebalancing
3258 * needed, they will kick the idle load balancer, which then does idle
3259 * load balancing for all the idle CPUs.
3260 */
3096static struct { 3261static struct {
3097 atomic_t load_balancer; 3262 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask; 3263 atomic_t first_pick_cpu;
3099 cpumask_var_t ilb_grp_nohz_mask; 3264 atomic_t second_pick_cpu;
3100} nohz ____cacheline_aligned = { 3265 cpumask_var_t idle_cpus_mask;
3101 .load_balancer = ATOMIC_INIT(-1), 3266 cpumask_var_t grp_idle_mask;
3102}; 3267 unsigned long next_balance; /* in jiffy units */
3268} nohz ____cacheline_aligned;
3103 3269
3104int get_nohz_load_balancer(void) 3270int get_nohz_load_balancer(void)
3105{ 3271{
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3153 */ 3319 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group) 3320static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{ 3321{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3322 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3157 sched_group_cpus(ilb_group)); 3323 sched_group_cpus(ilb_group));
3158 3324
3159 /* 3325 /*
3160 * A sched_group is semi-idle when it has atleast one busy cpu 3326 * A sched_group is semi-idle when it has atleast one busy cpu
3161 * and atleast one idle cpu. 3327 * and atleast one idle cpu.
3162 */ 3328 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3329 if (cpumask_empty(nohz.grp_idle_mask))
3164 return 0; 3330 return 0;
3165 3331
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3332 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3167 return 0; 3333 return 0;
3168 3334
3169 return 1; 3335 return 1;
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu)
3196 * Optimize for the case when we have no idle CPUs or only one 3362 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3363 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */ 3364 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2) 3365 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3200 goto out_done; 3366 goto out_done;
3201 3367
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3368 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu)
3204 3370
3205 do { 3371 do {
3206 if (is_semi_idle_group(ilb_group)) 3372 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask); 3373 return cpumask_first(nohz.grp_idle_mask);
3208 3374
3209 ilb_group = ilb_group->next; 3375 ilb_group = ilb_group->next;
3210 3376
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu)
3212 } 3378 }
3213 3379
3214out_done: 3380out_done:
3215 return cpumask_first(nohz.cpu_mask); 3381 return nr_cpu_ids;
3216} 3382}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3383#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu) 3384static inline int find_new_ilb(int call_cpu)
3219{ 3385{
3220 return cpumask_first(nohz.cpu_mask); 3386 return nr_cpu_ids;
3221} 3387}
3222#endif 3388#endif
3223 3389
3224/* 3390/*
3391 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
3392 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
3393 * CPU (if there is one).
3394 */
3395static void nohz_balancer_kick(int cpu)
3396{
3397 int ilb_cpu;
3398
3399 nohz.next_balance++;
3400
3401 ilb_cpu = get_nohz_load_balancer();
3402
3403 if (ilb_cpu >= nr_cpu_ids) {
3404 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3405 if (ilb_cpu >= nr_cpu_ids)
3406 return;
3407 }
3408
3409 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3410 struct call_single_data *cp;
3411
3412 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3413 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3414 __smp_call_function_single(ilb_cpu, cp, 0);
3415 }
3416 return;
3417}
3418
3419/*
3225 * This routine will try to nominate the ilb (idle load balancing) 3420 * This routine will try to nominate the ilb (idle load balancing)
3226 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3421 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system 3422 * load balancing on behalf of all those cpus.
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep till the next wakeup event
3230 * arrives...
3231 *
3232 * For the ilb owner, tick is not stopped. And this tick will be used
3233 * for idle load balancing. ilb owner will still be part of
3234 * nohz.cpu_mask..
3235 * 3423 *
3236 * While stopping the tick, this cpu will become the ilb owner if there 3424 * When the ilb owner becomes busy, we will not have new ilb owner until some
3237 * is no other owner. And will be the owner till that cpu becomes busy 3425 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3238 * or if all cpus in the system stop their ticks at which point 3426 * idle load balancing by kicking one of the idle CPUs.
3239 * there is no need for ilb owner.
3240 * 3427 *
3241 * When the ilb owner becomes busy, it nominates another owner, during the 3428 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
3242 * next busy scheduler_tick() 3429 * ilb owner CPU in future (when there is a need for idle load balancing on
3430 * behalf of all idle CPUs).
3243 */ 3431 */
3244int select_nohz_load_balancer(int stop_tick) 3432void select_nohz_load_balancer(int stop_tick)
3245{ 3433{
3246 int cpu = smp_processor_id(); 3434 int cpu = smp_processor_id();
3247 3435
3248 if (stop_tick) { 3436 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) { 3437 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu) 3438 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0; 3439 return;
3254 3440
3255 /* 3441 /*
3256 * If we are going offline and still the leader, 3442 * If we are going offline and still the leader,
3257 * give up! 3443 * give up!
3258 */ 3444 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3445 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3446 nr_cpu_ids) != cpu)
3260 BUG(); 3447 BUG();
3261 3448
3262 return 0; 3449 return;
3263 } 3450 }
3264 3451
3265 cpumask_set_cpu(cpu, nohz.cpu_mask); 3452 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3266 3453
3267 /* time for ilb owner also to sleep */ 3454 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3455 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3269 if (atomic_read(&nohz.load_balancer) == cpu) 3456 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3270 atomic_set(&nohz.load_balancer, -1); 3457 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3271 return 0;
3272 }
3273 3458
3274 if (atomic_read(&nohz.load_balancer) == -1) { 3459 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb; 3460 int new_ilb;
3280 3461
3281 if (!(sched_smt_power_savings || 3462 /* make me the ilb owner */
3282 sched_mc_power_savings)) 3463 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3283 return 1; 3464 cpu) != nr_cpu_ids)
3465 return;
3466
3284 /* 3467 /*
3285 * Check to see if there is a more power-efficient 3468 * Check to see if there is a more power-efficient
3286 * ilb. 3469 * ilb.
3287 */ 3470 */
3288 new_ilb = find_new_ilb(cpu); 3471 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3472 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1); 3473 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3291 resched_cpu(new_ilb); 3474 resched_cpu(new_ilb);
3292 return 0; 3475 return;
3293 } 3476 }
3294 return 1; 3477 return;
3295 } 3478 }
3296 } else { 3479 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3480 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3298 return 0; 3481 return;
3299 3482
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3483 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3301 3484
3302 if (atomic_read(&nohz.load_balancer) == cpu) 3485 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3486 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3487 nr_cpu_ids) != cpu)
3304 BUG(); 3488 BUG();
3305 } 3489 }
3306 return 0; 3490 return;
3307} 3491}
3308#endif 3492#endif
3309 3493
@@ -3385,11 +3569,102 @@ out:
3385 rq->next_balance = next_balance; 3569 rq->next_balance = next_balance;
3386} 3570}
3387 3571
3572#ifdef CONFIG_NO_HZ
3388/* 3573/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick. 3574 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3390 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3575 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */ 3576 */
3577static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3578{
3579 struct rq *this_rq = cpu_rq(this_cpu);
3580 struct rq *rq;
3581 int balance_cpu;
3582
3583 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3584 return;
3585
3586 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3587 if (balance_cpu == this_cpu)
3588 continue;
3589
3590 /*
3591 * If this cpu gets work to do, stop the load balancing
3592 * work being done for other cpus. Next load
3593 * balancing owner will pick it up.
3594 */
3595 if (need_resched()) {
3596 this_rq->nohz_balance_kick = 0;
3597 break;
3598 }
3599
3600 raw_spin_lock_irq(&this_rq->lock);
3601 update_rq_clock(this_rq);
3602 update_cpu_load(this_rq);
3603 raw_spin_unlock_irq(&this_rq->lock);
3604
3605 rebalance_domains(balance_cpu, CPU_IDLE);
3606
3607 rq = cpu_rq(balance_cpu);
3608 if (time_after(this_rq->next_balance, rq->next_balance))
3609 this_rq->next_balance = rq->next_balance;
3610 }
3611 nohz.next_balance = this_rq->next_balance;
3612 this_rq->nohz_balance_kick = 0;
3613}
3614
3615/*
3616 * Current heuristic for kicking the idle load balancer
3617 * - first_pick_cpu is the one of the busy CPUs. It will kick
3618 * idle load balancer when it has more than one process active. This
3619 * eliminates the need for idle load balancing altogether when we have
3620 * only one running process in the system (common case).
3621 * - If there are more than one busy CPU, idle load balancer may have
3622 * to run for active_load_balance to happen (i.e., two busy CPUs are
3623 * SMT or core siblings and can run better if they move to different
3624 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
3625 * which will kick idle load balancer as soon as it has any load.
3626 */
3627static inline int nohz_kick_needed(struct rq *rq, int cpu)
3628{
3629 unsigned long now = jiffies;
3630 int ret;
3631 int first_pick_cpu, second_pick_cpu;
3632
3633 if (time_before(now, nohz.next_balance))
3634 return 0;
3635
3636 if (!rq->nr_running)
3637 return 0;
3638
3639 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3640 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3641
3642 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3643 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3644 return 0;
3645
3646 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3647 if (ret == nr_cpu_ids || ret == cpu) {
3648 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3649 if (rq->nr_running > 1)
3650 return 1;
3651 } else {
3652 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3653 if (ret == nr_cpu_ids || ret == cpu) {
3654 if (rq->nr_running)
3655 return 1;
3656 }
3657 }
3658 return 0;
3659}
3660#else
3661static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3662#endif
3663
3664/*
3665 * run_rebalance_domains is triggered when needed from the scheduler tick.
3666 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
3667 */
3393static void run_rebalance_domains(struct softirq_action *h) 3668static void run_rebalance_domains(struct softirq_action *h)
3394{ 3669{
3395 int this_cpu = smp_processor_id(); 3670 int this_cpu = smp_processor_id();
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3399 3674
3400 rebalance_domains(this_cpu, idle); 3675 rebalance_domains(this_cpu, idle);
3401 3676
3402#ifdef CONFIG_NO_HZ
3403 /* 3677 /*
3404 * If this cpu is the owner for idle load balancing, then do the 3678 * If this cpu has a pending nohz_balance_kick, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are 3679 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped. 3680 * stopped.
3407 */ 3681 */
3408 if (this_rq->idle_at_tick && 3682 nohz_idle_balance(this_cpu, idle);
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433} 3683}
3434 3684
3435static inline int on_null_domain(int cpu) 3685static inline int on_null_domain(int cpu)
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu)
3439 3689
3440/* 3690/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3691 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */ 3692 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu) 3693static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{ 3694{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452 * scheduler tick, then check if we need to nominate new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483 * someone else, then no need raise the SCHED_SOFTIRQ
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */ 3695 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) && 3696 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu))) 3697 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ); 3698 raise_softirq(SCHED_SOFTIRQ);
3699#ifdef CONFIG_NO_HZ
3700 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3701 nohz_balancer_kick(cpu);
3702#endif
3493} 3703}
3494 3704
3495static void rq_online_fair(struct rq *rq) 3705static void rq_online_fair(struct rq *rq)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..d10c80ebb67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1663{ 1663{
1664 unsigned long soft, hard; 1664 unsigned long soft, hard;
1665 1665
1666 if (!p->signal)
1667 return;
1668
1669 /* max may change after cur was read, this will be fixed next tick */ 1666 /* max may change after cur was read, this will be fixed next tick */
1670 soft = task_rlimit(p, RLIMIT_RTTIME); 1667 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1668 hard = task_rlimit_max(p, RLIMIT_RTTIME);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a1779c..bded65187780 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info)
637 637
638/* 638/*
639 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
640 * - the caller must hold at least the RCU read lock 640 * - the caller must hold the RCU read lock
641 */ 641 */
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
@@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1127 1127
1128/* 1128/*
1129 * send signal info to all the members of a group 1129 * send signal info to all the members of a group
1130 * - the caller must hold the RCU read lock at least
1131 */ 1130 */
1132int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1131int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1133{ 1132{
1134 int ret = check_kill_permission(sig, info, p); 1133 int ret;
1134
1135 rcu_read_lock();
1136 ret = check_kill_permission(sig, info, p);
1137 rcu_read_unlock();
1135 1138
1136 if (!ret && sig) 1139 if (!ret && sig)
1137 ret = do_send_sig_info(sig, info, p, true); 1140 ret = do_send_sig_info(sig, info, p, true);
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19#include <linux/debugfs.h>
20#include "slow-work.h"
21
22static void slow_work_cull_timeout(unsigned long);
23static void slow_work_oom_timeout(unsigned long);
24
25#ifdef CONFIG_SYSCTL
26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
27 void __user *, size_t *, loff_t *);
28
29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
30 void __user *, size_t *, loff_t *);
31#endif
32
33/*
34 * The pool of threads has at least min threads in it as long as someone is
35 * using the facility, and may have as many as max.
36 *
37 * A portion of the pool may be processing very slow operations.
38 */
39static unsigned slow_work_min_threads = 2;
40static unsigned slow_work_max_threads = 4;
41static unsigned vslow_work_proportion = 50; /* % of threads that may process
42 * very slow work */
43
44#ifdef CONFIG_SYSCTL
45static const int slow_work_min_min_threads = 2;
46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
47static const int slow_work_min_vslow = 1;
48static const int slow_work_max_vslow = 99;
49
50ctl_table slow_work_sysctls[] = {
51 {
52 .procname = "min-threads",
53 .data = &slow_work_min_threads,
54 .maxlen = sizeof(unsigned),
55 .mode = 0644,
56 .proc_handler = slow_work_min_threads_sysctl,
57 .extra1 = (void *) &slow_work_min_min_threads,
58 .extra2 = &slow_work_max_threads,
59 },
60 {
61 .procname = "max-threads",
62 .data = &slow_work_max_threads,
63 .maxlen = sizeof(unsigned),
64 .mode = 0644,
65 .proc_handler = slow_work_max_threads_sysctl,
66 .extra1 = &slow_work_min_threads,
67 .extra2 = (void *) &slow_work_max_max_threads,
68 },
69 {
70 .procname = "vslow-percentage",
71 .data = &vslow_work_proportion,
72 .maxlen = sizeof(unsigned),
73 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax,
75 .extra1 = (void *) &slow_work_min_vslow,
76 .extra2 = (void *) &slow_work_max_vslow,
77 },
78 {}
79};
80#endif
81
82/*
83 * The active state of the thread pool
84 */
85static atomic_t slow_work_thread_count;
86static atomic_t vslow_work_executing_count;
87
88static bool slow_work_may_not_start_new_thread;
89static bool slow_work_cull; /* cull a thread due to lack of activity */
90static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
91static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
92static struct slow_work slow_work_new_thread; /* new thread starter */
93
94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
145 * The queues of work items and the lock governing access to them. These are
146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
147 * as the number of threads bears no relation to the number of CPUs.
148 *
149 * There are two queues of work items: one for slow work items, and one for
150 * very slow work items.
151 */
152LIST_HEAD(slow_work_queue);
153LIST_HEAD(vslow_work_queue);
154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow work items that are hogging a thread by
159 * sleeping in a way that could be deferred to yield their thread and enqueue
160 * themselves.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
164
165/*
166 * The thread controls. A variable used to signal to the threads that they
167 * should exit when the queue is empty, a waitqueue used by the threads to wait
168 * for signals, and a completion set by the last thread to exit.
169 */
170static bool slow_work_threads_should_exit;
171static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
172static DECLARE_COMPLETION(slow_work_last_thread_exited);
173
174/*
175 * The number of users of the thread pool and its lock. Whilst this is zero we
176 * have no threads hanging around, and when this reaches zero, we wait for all
177 * active or queued work items to complete and kill all the threads we do have.
178 */
179static int slow_work_user_count;
180static DEFINE_MUTEX(slow_work_user_lock);
181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
196/*
197 * Calculate the maximum number of active threads in the pool that are
198 * permitted to process very slow work items.
199 *
200 * The answer is rounded up to at least 1, but may not equal or exceed the
201 * maximum number of the threads in the pool. This means we always have at
202 * least one thread that can process slow work items, and we always have at
203 * least one thread that won't get tied up doing so.
204 */
205static unsigned slow_work_calc_vsmax(void)
206{
207 unsigned vsmax;
208
209 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
210 vsmax /= 100;
211 vsmax = max(vsmax, 1U);
212 return min(vsmax, slow_work_max_threads - 1);
213}
214
215/*
216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
217 * it, false if there was nothing to do.
218 */
219static noinline bool slow_work_execute(int id)
220{
221 struct slow_work *work = NULL;
222 unsigned vsmax;
223 bool very_slow;
224
225 vsmax = slow_work_calc_vsmax();
226
227 /* see if we can schedule a new thread to be started if we're not
228 * keeping up with the work */
229 if (!waitqueue_active(&slow_work_thread_wq) &&
230 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
231 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
232 !slow_work_may_not_start_new_thread)
233 slow_work_enqueue(&slow_work_new_thread);
234
235 /* find something to execute */
236 spin_lock_irq(&slow_work_queue_lock);
237 if (!list_empty(&vslow_work_queue) &&
238 atomic_read(&vslow_work_executing_count) < vsmax) {
239 work = list_entry(vslow_work_queue.next,
240 struct slow_work, link);
241 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
242 BUG();
243 list_del_init(&work->link);
244 atomic_inc(&vslow_work_executing_count);
245 very_slow = true;
246 } else if (!list_empty(&slow_work_queue)) {
247 work = list_entry(slow_work_queue.next,
248 struct slow_work, link);
249 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
250 BUG();
251 list_del_init(&work->link);
252 very_slow = false;
253 } else {
254 very_slow = false; /* avoid the compiler warning */
255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
263 spin_unlock_irq(&slow_work_queue_lock);
264
265 if (!work)
266 return false;
267
268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
269 BUG();
270
271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
274
275 if (very_slow)
276 atomic_dec(&vslow_work_executing_count);
277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
284 /* if someone tried to enqueue the item whilst we were executing it,
285 * then it'll be left unenqueued to avoid multiple threads trying to
286 * execute it simultaneously
287 *
288 * there is, however, a race between us testing the pending flag and
289 * getting the spinlock, and between the enqueuer setting the pending
290 * flag and getting the spinlock, so we use a deferral bit to tell us
291 * if the enqueuer got there first
292 */
293 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
294 spin_lock_irq(&slow_work_queue_lock);
295
296 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
297 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
298 goto auto_requeue;
299
300 spin_unlock_irq(&slow_work_queue_lock);
301 }
302
303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
307 return true;
308
309auto_requeue:
310 /* we must complete the enqueue operation
311 * - we transfer our ref on the item back to the appropriate queue
312 * - don't wake another thread up as we're awake already
313 */
314 slow_work_mark_time(work);
315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
316 list_add_tail(&work->link, &vslow_work_queue);
317 else
318 list_add_tail(&work->link, &slow_work_queue);
319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
321 return true;
322}
323
324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
326 * work: The work item under execution that wants to sleep
327 * _timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
369 * slow_work_enqueue - Schedule a slow work item for processing
370 * @work: The work item to queue
371 *
372 * Schedule a slow work item for processing. If the item is already undergoing
373 * execution, this guarantees not to re-enter the execution routine until the
374 * first execution finishes.
375 *
376 * The item is pinned by this function as it retains a reference to it, managed
377 * through the item operations. The item is unpinned once it has been
378 * executed.
379 *
380 * An item may hog the thread that is running it for a relatively large amount
381 * of time, sufficient, for example, to perform several lookup, mkdir, create
382 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
383 *
384 * Conversely, if a number of items are awaiting processing, it may take some
385 * time before any given item is given attention. The number of threads in the
386 * pool may be increased to deal with demand, but only up to a limit.
387 *
388 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
389 * the very slow queue, from which only a portion of the threads will be
390 * allowed to pick items to execute. This ensures that very slow items won't
391 * overly block ones that are just ordinarily slow.
392 *
393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
394 * attempted queued)
395 */
396int slow_work_enqueue(struct slow_work *work)
397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
405
406 BUG_ON(slow_work_user_count <= 0);
407 BUG_ON(!work);
408 BUG_ON(!work->ops);
409
410 /* when honouring an enqueue request, we only promise that we will run
411 * the work function in the future; we do not promise to run it once
412 * per enqueue request
413 *
414 * we use the PENDING bit to merge together repeat requests without
415 * having to disable IRQs and take the spinlock, whilst still
416 * maintaining our promise
417 */
418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
427 spin_lock_irqsave(&slow_work_queue_lock, flags);
428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
432 /* we promise that we will not attempt to execute the work
433 * function in more than one thread simultaneously
434 *
435 * this, however, leaves us with a problem if we're asked to
436 * enqueue the work whilst someone is executing the work
437 * function as simply queueing the work immediately means that
438 * another thread may try executing it whilst it is already
439 * under execution
440 *
441 * to deal with this, we set the ENQ_DEFERRED bit instead of
442 * enqueueing, and the thread currently executing the work
443 * function will enqueue the work item when the work function
444 * returns and it has cleared the EXECUTING bit
445 */
446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
448 } else {
449 ret = slow_work_get_ref(work);
450 if (ret < 0)
451 goto failed;
452 slow_work_mark_time(work);
453 list_add_tail(&work->link, queue);
454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
460 }
461
462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
463 }
464 return 0;
465
466cancelled:
467 ret = -ECANCELED;
468failed:
469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
470 return ret;
471}
472EXPORT_SYMBOL(slow_work_enqueue);
473
/*
 * Sleep action handed to wait_on_bit(): give up the CPU once and report
 * success.  The word pointer is not used.
 */
static int slow_work_wait(void *word)
{
	schedule();

	return 0;
}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
485 * cancel the work item, it is guarenteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has done executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = slow_work_get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
663
664/*
665 * Schedule a cull of the thread pool at some time in the near future
666 */
667static void slow_work_schedule_cull(void)
668{
669 mod_timer(&slow_work_cull_timer,
670 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
671}
672
673/*
674 * Worker thread culling algorithm
675 */
676static bool slow_work_cull_thread(void)
677{
678 unsigned long flags;
679 bool do_cull = false;
680
681 spin_lock_irqsave(&slow_work_queue_lock, flags);
682
683 if (slow_work_cull) {
684 slow_work_cull = false;
685
686 if (list_empty(&slow_work_queue) &&
687 list_empty(&vslow_work_queue) &&
688 atomic_read(&slow_work_thread_count) >
689 slow_work_min_threads) {
690 slow_work_schedule_cull();
691 do_cull = true;
692 }
693 }
694
695 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
696 return do_cull;
697}
698
699/*
700 * Determine if there is slow work available for dispatch
701 */
702static inline bool slow_work_available(int vsmax)
703{
704 return !list_empty(&slow_work_queue) ||
705 (!list_empty(&vslow_work_queue) &&
706 atomic_read(&vslow_work_executing_count) < vsmax);
707}
708
709/*
710 * Worker thread dispatcher
711 */
712static int slow_work_thread(void *_data)
713{
714 int vsmax, id;
715
716 DEFINE_WAIT(wait);
717
718 set_freezable();
719 set_user_nice(current, -5);
720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
731 for (;;) {
732 vsmax = vslow_work_proportion;
733 vsmax *= atomic_read(&slow_work_thread_count);
734 vsmax /= 100;
735
736 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
737 TASK_INTERRUPTIBLE);
738 if (!freezing(current) &&
739 !slow_work_threads_should_exit &&
740 !slow_work_available(vsmax) &&
741 !slow_work_cull)
742 schedule();
743 finish_wait(&slow_work_thread_wq, &wait);
744
745 try_to_freeze();
746
747 vsmax = vslow_work_proportion;
748 vsmax *= atomic_read(&slow_work_thread_count);
749 vsmax /= 100;
750
751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
752 cond_resched();
753 if (list_empty(&slow_work_queue) &&
754 list_empty(&vslow_work_queue) &&
755 atomic_read(&slow_work_thread_count) >
756 slow_work_min_threads)
757 slow_work_schedule_cull();
758 continue;
759 }
760
761 if (slow_work_threads_should_exit)
762 break;
763
764 if (slow_work_cull && slow_work_cull_thread())
765 break;
766 }
767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
773 if (atomic_dec_and_test(&slow_work_thread_count))
774 complete_and_exit(&slow_work_last_thread_exited, 0);
775 return 0;
776}
777
778/*
779 * Handle thread cull timer expiration
780 */
781static void slow_work_cull_timeout(unsigned long data)
782{
783 slow_work_cull = true;
784 wake_up(&slow_work_thread_wq);
785}
786
787/*
788 * Start a new slow work thread
789 */
790static void slow_work_new_thread_execute(struct slow_work *work)
791{
792 struct task_struct *p;
793
794 if (slow_work_threads_should_exit)
795 return;
796
797 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
798 return;
799
800 if (!mutex_trylock(&slow_work_user_lock))
801 return;
802
803 slow_work_may_not_start_new_thread = true;
804 atomic_inc(&slow_work_thread_count);
805 p = kthread_run(slow_work_thread, NULL, "kslowd");
806 if (IS_ERR(p)) {
807 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
808 if (atomic_dec_and_test(&slow_work_thread_count))
809 BUG(); /* we're running on a slow work thread... */
810 mod_timer(&slow_work_oom_timer,
811 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
812 } else {
813 /* ratelimit the starting of new threads */
814 mod_timer(&slow_work_oom_timer, jiffies + 1);
815 }
816
817 mutex_unlock(&slow_work_user_lock);
818}
819
/*
 * Operations for the internal slow_work_new_thread item, which is enqueued
 * when the pool wants another thread.  No get_ref/put_ref operations are set,
 * so the reference helpers treat the item as needing no pinning.
 */
820static const struct slow_work_ops slow_work_new_thread_ops = {
821 .owner = THIS_MODULE,
822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
826};
827
828/*
829 * post-OOM new thread start suppression expiration
830 */
831static void slow_work_oom_timeout(unsigned long data)
832{
833 slow_work_may_not_start_new_thread = false;
834}
835
836#ifdef CONFIG_SYSCTL
837/*
838 * Handle adjustment of the minimum number of threads
839 */
840static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
841 void __user *buffer,
842 size_t *lenp, loff_t *ppos)
843{
844 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
845 int n;
846
847 if (ret == 0) {
848 mutex_lock(&slow_work_user_lock);
849 if (slow_work_user_count > 0) {
850 /* see if we need to start or stop threads */
851 n = atomic_read(&slow_work_thread_count) -
852 slow_work_min_threads;
853
854 if (n < 0 && !slow_work_may_not_start_new_thread)
855 slow_work_enqueue(&slow_work_new_thread);
856 else if (n > 0)
857 slow_work_schedule_cull();
858 }
859 mutex_unlock(&slow_work_user_lock);
860 }
861
862 return ret;
863}
864
865/*
866 * Handle adjustment of the maximum number of threads
867 */
868static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
869 void __user *buffer,
870 size_t *lenp, loff_t *ppos)
871{
872 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
873 int n;
874
875 if (ret == 0) {
876 mutex_lock(&slow_work_user_lock);
877 if (slow_work_user_count > 0) {
878 /* see if we need to stop threads */
879 n = slow_work_max_threads -
880 atomic_read(&slow_work_thread_count);
881
882 if (n < 0)
883 slow_work_schedule_cull();
884 }
885 mutex_unlock(&slow_work_user_lock);
886 }
887
888 return ret;
889}
890#endif /* CONFIG_SYSCTL */
891
892/**
893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
895 *
896 * Register a user of the facility, starting up the initial threads if there
897 * aren't any other users at this point. This will return 0 if successful, or
898 * an error if not.
899 */
900int slow_work_register_user(struct module *module)
901{
902 struct task_struct *p;
903 int loop;
904
905 mutex_lock(&slow_work_user_lock);
906
907 if (slow_work_user_count == 0) {
908 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
909 init_completion(&slow_work_last_thread_exited);
910
911 slow_work_threads_should_exit = false;
912 slow_work_init(&slow_work_new_thread,
913 &slow_work_new_thread_ops);
914 slow_work_may_not_start_new_thread = false;
915 slow_work_cull = false;
916
917 /* start the minimum number of threads */
918 for (loop = 0; loop < slow_work_min_threads; loop++) {
919 atomic_inc(&slow_work_thread_count);
920 p = kthread_run(slow_work_thread, NULL, "kslowd");
921 if (IS_ERR(p))
922 goto error;
923 }
924 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
925 }
926
927 slow_work_user_count++;
928 mutex_unlock(&slow_work_user_lock);
929 return 0;
930
931error:
932 if (atomic_dec_and_test(&slow_work_thread_count))
933 complete(&slow_work_last_thread_exited);
934 if (loop > 0) {
935 printk(KERN_ERR "Slow work thread pool:"
936 " Aborting startup on ENOMEM\n");
937 slow_work_threads_should_exit = true;
938 wake_up_all(&slow_work_thread_wq);
939 wait_for_completion(&slow_work_last_thread_exited);
940 printk(KERN_ERR "Slow work thread pool: Aborted\n");
941 }
942 mutex_unlock(&slow_work_user_lock);
943 return PTR_ERR(p);
944}
945EXPORT_SYMBOL(slow_work_register_user);
946
/*
 * Wait for every outstanding item owned by the calling module to complete
 * - more items may be queued whilst we're waiting, so the queues and the
 *   per-thread processing records are rescanned after every wake up
 * - does nothing unless CONFIG_MODULES is set
 */
static void slow_work_wait_for_items(struct module *module)
{
#ifdef CONFIG_MODULES
	DECLARE_WAITQUEUE(waiter, current);
	struct slow_work *item;
	int slot;

	mutex_lock(&slow_work_unreg_sync_lock);
	add_wait_queue(&slow_work_unreg_wq, &waiter);

	for (;;) {
		spin_lock_irq(&slow_work_queue_lock);

		/* first of all, wait for the most recently queued item of
		 * ours on each list to be processed */
		list_for_each_entry_reverse(item, &vslow_work_queue, link) {
			if (item->owner != module)
				continue;
			set_current_state(TASK_UNINTERRUPTIBLE);
			slow_work_unreg_work_item = item;
			goto do_wait;
		}
		list_for_each_entry_reverse(item, &slow_work_queue, link) {
			if (item->owner != module)
				continue;
			set_current_state(TASK_UNINTERRUPTIBLE);
			slow_work_unreg_work_item = item;
			goto do_wait;
		}

		/* then wait for the items being processed to finish
		 *
		 * NOTE(review): the task state is not set on this path, so
		 * schedule() below is entered in whatever state the task is
		 * already in - confirm that this is intended
		 */
		slow_work_unreg_module = module;
		smp_mb();
		for (slot = 0; slot < SLOW_WORK_THREAD_LIMIT; slot++) {
			if (slow_work_thread_processing[slot] == module)
				goto do_wait;
		}
		spin_unlock_irq(&slow_work_queue_lock);
		break; /* okay, we're done */

	do_wait:
		spin_unlock_irq(&slow_work_queue_lock);
		schedule();
		slow_work_unreg_work_item = NULL;
		slow_work_unreg_module = NULL;
	}

	remove_wait_queue(&slow_work_unreg_wq, &waiter);
	mutex_unlock(&slow_work_unreg_sync_lock);
#endif /* CONFIG_MODULES */
}
1002
1003/**
1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
1006 *
1007 * Unregister a user of the facility, killing all the threads if this was the
1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
1012 */
1013void slow_work_unregister_user(struct module *module)
1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
1022 mutex_lock(&slow_work_user_lock);
1023
1024 BUG_ON(slow_work_user_count <= 0);
1025
1026 slow_work_user_count--;
1027 if (slow_work_user_count == 0) {
1028 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
1029 slow_work_threads_should_exit = true;
1030 del_timer_sync(&slow_work_cull_timer);
1031 del_timer_sync(&slow_work_oom_timer);
1032 wake_up_all(&slow_work_thread_wq);
1033 wait_for_completion(&slow_work_last_thread_exited);
1034 printk(KERN_NOTICE "Slow work thread pool:"
1035 " Shut down complete\n");
1036 }
1037
1038 mutex_unlock(&slow_work_user_lock);
1039}
1040EXPORT_SYMBOL(slow_work_unregister_user);
1041
1042/*
1043 * Initialise the slow work facility
1044 */
1045static int __init init_slow_work(void)
1046{
1047 unsigned nr_cpus = num_possible_cpus();
1048
1049 if (slow_work_max_threads < nr_cpus)
1050 slow_work_max_threads = nr_cpus;
1051#ifdef CONFIG_SYSCTL
1052 if (slow_work_max_max_threads < nr_cpus * 2)
1053 slow_work_max_max_threads = nr_cpus * 2;
1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
1065 return 0;
1066}
1067
1068subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
/* Timer intervals below are expressed in jiffies (multiples of HZ). */
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
/* Thread IDs are allocated in the range 0 .. SLOW_WORK_THREAD_LIMIT - 1. */
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
 *
 * The debug arrays are indexed by thread ID.  The two queues hold the
 * pending ordinary and very slow items respectively; slow_work_queue_lock is
 * the spinlock taken around manipulation of those queues.
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid;
48#endif
49}
50
/*
 * Stamp a work item with the current time.  Only does anything if
 * CONFIG_SLOW_WORK_DEBUG is set.
 */
static inline void slow_work_mark_time(struct slow_work *work)
{
#ifdef CONFIG_SLOW_WORK_DEBUG
	work->mark = CURRENT_TIME;
#endif
}
57
/*
 * Note that the thread with ID @id has started executing @work.  Only does
 * anything if CONFIG_SLOW_WORK_DEBUG is set.
 */
static inline void slow_work_begin_exec(int id, struct slow_work *work)
{
#ifdef CONFIG_SLOW_WORK_DEBUG
	slow_work_execs[id] = work;
#endif
}
64
/*
 * Note that the thread with ID @id is no longer executing anything; the
 * record is cleared with slow_work_execs_lock write-locked.  Only does
 * anything if CONFIG_SLOW_WORK_DEBUG is set.
 */
static inline void slow_work_end_exec(int id, struct slow_work *work)
{
#ifdef CONFIG_SLOW_WORK_DEBUG
	write_lock(&slow_work_execs_lock);
	slow_work_execs[id] = NULL;
	write_unlock(&slow_work_execs_lock);
#endif
}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 *
6 * this code detects soft lockups: incidents where, on a CPU, the kernel
7 * does not reschedule for softlockup_thresh seconds (60 by default) or more.
8 */
9#include <linux/mm.h>
10#include <linux/cpu.h>
11#include <linux/nmi.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/freezer.h>
15#include <linux/kthread.h>
16#include <linux/lockdep.h>
17#include <linux/notifier.h>
18#include <linux/module.h>
19#include <linux/sysctl.h>
20
21#include <asm/irq_regs.h>
22
23static DEFINE_SPINLOCK(print_lock);
24
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
29
30static int __read_mostly did_panic;
31int __read_mostly softlockup_thresh = 60;
32
33/*
34 * Should we panic (and reboot, if panic_timeout= is set) when a
35 * soft-lockup occurs:
36 */
37unsigned int __read_mostly softlockup_panic =
38 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
39
/*
 * Handler for the "softlockup_panic=" boot parameter: parses @str with
 * simple_strtoul() (base 0) and stores the result in softlockup_panic.
 * Always returns 1.
 */
40static int __init softlockup_panic_setup(char *str)
41{
42 softlockup_panic = simple_strtoul(str, NULL, 0);
43
44 return 1;
45}
46__setup("softlockup_panic=", softlockup_panic_setup);
47
/*
 * Notifier callback (hooked up through panic_block): sets did_panic,
 * which softlockup_tick() checks before reporting.  None of the
 * arguments are used.  Returns NOTIFY_DONE.
 */
48static int
49softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
50{
51 did_panic = 1;
52
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block panic_block = {
57 .notifier_call = softlock_panic,
58};
59
60/*
61 * Returns seconds, approximately. We don't need nanosecond
62 * resolution, and we don't need to waste time with a big divide when
63 * 2^30ns == 1.074s.
64 */
/* @this_cpu is passed straight through to cpu_clock(). */
65static unsigned long get_timestamp(int this_cpu)
66{
67 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
68}
69
/*
 * Store the current get_timestamp() value in the executing CPU's
 * softlockup_touch_ts.  Uses the raw_ accessors
 * (raw_smp_processor_id() / __raw_get_cpu_var()).
 */
70static void __touch_softlockup_watchdog(void)
71{
72 int this_cpu = raw_smp_processor_id();
73
74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
75}
76
/*
 * Reset the executing CPU's touch timestamp to 0.  softlockup_tick()
 * treats a zero timestamp as "refresh the timestamp and return" rather
 * than as a stall.
 */
77void touch_softlockup_watchdog(void)
78{
79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
80}
81EXPORT_SYMBOL(touch_softlockup_watchdog);
82
/*
 * Like touch_softlockup_watchdog(), but additionally sets this CPU's
 * softlock_touch_sync flag, which makes the next softlockup_tick() call
 * sched_clock_tick() before it refreshes the timestamp.
 */
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
/* Zero softlockup_touch_ts on every online CPU. */
89void touch_all_softlockup_watchdogs(void)
90{
91 int cpu;
92
93 /* Cause each CPU to re-update its timestamp rather than complain */
94 for_each_online_cpu(cpu)
95 per_cpu(softlockup_touch_ts, cpu) = 0;
96}
97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
98
/*
 * sysctl handler: resets every online CPU's touch timestamp and then
 * hands the request to proc_dointvec_minmax(), whose return value is
 * passed back.  The reset happens regardless of @write, i.e. on reads
 * as well as writes.
 */
99int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 touch_all_softlockup_watchdogs();
104 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
105}
106
107/*
108 * This callback runs from the timer interrupt, and checks
109 * whether the watchdog thread has hung or not:
110 */
111void softlockup_tick(void)
112{
113 int this_cpu = smp_processor_id();
114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
115 unsigned long print_ts;
116 struct pt_regs *regs = get_irq_regs();
117 unsigned long now;
118
119 /* Is detection switched off? */
	/* (no watchdog thread recorded for this CPU, or threshold <= 0) */
120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
121 /* Be sure we don't false trigger if switched back on */
122 if (touch_ts)
123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
124 return;
125 }
126
	/* A zero timestamp is what the touch_*() helpers leave behind:
	 * take a fresh timestamp and skip checking on this tick. */
127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
136 __touch_softlockup_watchdog();
137 return;
138 }
139
140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
141
142 /* report at most once a second */
	/* print_ts is set to touch_ts when a stall is reported below, so
	 * equality means this touch_ts value has already been reported. */
143 if (print_ts == touch_ts || did_panic)
144 return;
145
146 /* do not print during early bootup: */
147 if (unlikely(system_state != SYSTEM_RUNNING)) {
148 __touch_softlockup_watchdog();
149 return;
150 }
151
152 now = get_timestamp(this_cpu);
153
154 /*
155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan.
157 */
158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160
161 /* Warn about unreasonable delays: */
	/* i.e. return quietly while touch_ts is no older than the threshold */
162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return;
164
	/* remember which touch_ts this report belongs to (see check above) */
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
166
	/* the whole multi-line report is emitted under print_lock */
167 spin_lock(&print_lock);
168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
169 this_cpu, now - touch_ts,
170 current->comm, task_pid_nr(current));
171 print_modules();
172 print_irqtrace_events(current);
	/* prefer the interrupted register state when get_irq_regs() gave one */
173 if (regs)
174 show_regs(regs);
175 else
176 dump_stack();
177 spin_unlock(&print_lock);
178
179 if (softlockup_panic)
180 panic("softlockup: hung tasks");
181}
182
183/*
184 * The watchdog thread - runs every second and touches the timestamp.
185 */
/* Thread function; @__bind_cpu is not used in the body.  Returns 0. */
186static int watchdog(void *__bind_cpu)
187{
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
	/* run as SCHED_FIFO at priority MAX_RT_PRIO-1 */
190 sched_setscheduler(current, SCHED_FIFO, &param);
191
192 /* initialize timestamp */
193 __touch_softlockup_watchdog();
194
195 set_current_state(TASK_INTERRUPTIBLE);
196 /*
197 * Run briefly once per second to reset the softlockup timestamp.
198 * If this gets delayed for more than 60 seconds then the
199 * debug-printout triggers in softlockup_tick().
200 */
201 while (!kthread_should_stop()) {
	/* refresh the timestamp, then sleep until woken
	 * (softlockup_tick() and cpu_callback() call wake_up_process()) */
202 __touch_softlockup_watchdog();
203 schedule();
204
205 if (kthread_should_stop())
206 break;
207
208 set_current_state(TASK_INTERRUPTIBLE);
209 }
210 __set_current_state(TASK_RUNNING);
211
212 return 0;
213}
214
215/*
216 * Create/destroy watchdog threads as CPUs come and go:
217 */
/*
 * @hcpu carries the CPU number cast to a pointer; @nfb is unused.
 * Returns NOTIFY_BAD if the watchdog thread cannot be created,
 * NOTIFY_OK otherwise (including for actions not handled here).
 */
218static int __cpuinit
219cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
220{
221 int hotcpu = (unsigned long)hcpu;
222 struct task_struct *p;
223
224 switch (action) {
225 case CPU_UP_PREPARE:
226 case CPU_UP_PREPARE_FROZEN:
	/* create the thread and bind it to the CPU; it is only woken
	 * in the CPU_ONLINE case below */
227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
229 if (IS_ERR(p)) {
230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
231 return NOTIFY_BAD;
232 }
233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
234 per_cpu(softlockup_watchdog, hotcpu) = p;
235 kthread_bind(p, hotcpu);
236 break;
237 case CPU_ONLINE:
238 case CPU_ONLINE_FROZEN:
239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
240 break;
241#ifdef CONFIG_HOTPLUG_CPU
242 case CPU_UP_CANCELED:
243 case CPU_UP_CANCELED_FROZEN:
	/* nothing to undo if no thread was recorded for this CPU */
244 if (!per_cpu(softlockup_watchdog, hotcpu))
245 break;
246 /* Unbind so it can run. Fall thru. */
247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
248 cpumask_any(cpu_online_mask));
249 case CPU_DEAD:
250 case CPU_DEAD_FROZEN:
	/* clear the per-CPU pointer before stopping the thread;
	 * softlockup_tick() treats a NULL pointer as "switched off" */
251 p = per_cpu(softlockup_watchdog, hotcpu);
252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
253 kthread_stop(p);
254 break;
255#endif /* CONFIG_HOTPLUG_CPU */
256 }
257 return NOTIFY_OK;
258}
259
260static struct notifier_block __cpuinitdata cpu_nfb = {
261 .notifier_call = cpu_callback
262};
263
264static int __initdata nosoftlockup;
265
/*
 * Handler for the "nosoftlockup" boot parameter: sets the nosoftlockup
 * flag, which spawn_softlockup_task() checks before doing any setup.
 * @str is ignored.  Always returns 1.
 */
266static int __init nosoftlockup_setup(char *str)
267{
268 nosoftlockup = 1;
269 return 1;
270}
271__setup("nosoftlockup", nosoftlockup_setup);
272
/*
 * Early initcall: unless "nosoftlockup" was given, set up the watchdog
 * for the CPU this runs on by calling cpu_callback() directly, then
 * register the CPU notifier and the panic notifier.  Returns 0 on the
 * normal paths.
 */
273static int __init spawn_softlockup_task(void)
274{
275 void *cpu = (void *)(long)smp_processor_id();
276 int err;
277
278 if (nosoftlockup)
279 return 0;
280
	/* create the thread for this CPU ... */
281 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
282 if (err == NOTIFY_BAD) {
283 BUG();
284 return 1;
285 }
	/* ... wake it, and only then register cpu_nfb */
286 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
287 register_cpu_notifier(&cpu_nfb);
288
	/* lets softlock_panic() set did_panic when the panic chain runs */
289 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
290
291 return 0;
292}
293early_initcall(spawn_softlockup_task);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 70f8d90331e9..4372ccb25127 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,9 +35,9 @@ struct cpu_stop_done {
35/* the actual stopper, one per every possible cpu, enabled on online cpus */ 35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper { 36struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */
38 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */ 40 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..e9ad44489828 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1236,15 +1236,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1236 1236
1237SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1237SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1238{ 1238{
1239 if (resource >= RLIM_NLIMITS) 1239 struct rlimit value;
1240 return -EINVAL; 1240 int ret;
1241 else { 1241
1242 struct rlimit value; 1242 ret = do_prlimit(current, resource, NULL, &value);
1243 task_lock(current->group_leader); 1243 if (!ret)
1244 value = current->signal->rlim[resource]; 1244 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1245 task_unlock(current->group_leader); 1245
1246 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1246 return ret;
1247 }
1248} 1247}
1249 1248
1250#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1249#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1272,44 +1271,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1272 1271
1273#endif 1272#endif
1274 1273
1275SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1274static inline bool rlim64_is_infinity(__u64 rlim64)
1276{ 1275{
1277 struct rlimit new_rlim, *old_rlim; 1276#if BITS_PER_LONG < 64
1278 int retval; 1277 return rlim64 >= ULONG_MAX;
1278#else
1279 return rlim64 == RLIM64_INFINITY;
1280#endif
1281}
1282
1283static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1284{
1285 if (rlim->rlim_cur == RLIM_INFINITY)
1286 rlim64->rlim_cur = RLIM64_INFINITY;
1287 else
1288 rlim64->rlim_cur = rlim->rlim_cur;
1289 if (rlim->rlim_max == RLIM_INFINITY)
1290 rlim64->rlim_max = RLIM64_INFINITY;
1291 else
1292 rlim64->rlim_max = rlim->rlim_max;
1293}
1294
1295static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1296{
1297 if (rlim64_is_infinity(rlim64->rlim_cur))
1298 rlim->rlim_cur = RLIM_INFINITY;
1299 else
1300 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1301 if (rlim64_is_infinity(rlim64->rlim_max))
1302 rlim->rlim_max = RLIM_INFINITY;
1303 else
1304 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1305}
1306
1307/* make sure you are allowed to change @tsk limits before calling this */
1308int do_prlimit(struct task_struct *tsk, unsigned int resource,
1309 struct rlimit *new_rlim, struct rlimit *old_rlim)
1310{
1311 struct rlimit *rlim;
1312 int retval = 0;
1279 1313
1280 if (resource >= RLIM_NLIMITS) 1314 if (resource >= RLIM_NLIMITS)
1281 return -EINVAL; 1315 return -EINVAL;
1282 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1316 if (new_rlim) {
1283 return -EFAULT; 1317 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1284 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1318 return -EINVAL;
1285 return -EINVAL; 1319 if (resource == RLIMIT_NOFILE &&
1286 old_rlim = current->signal->rlim + resource; 1320 new_rlim->rlim_max > sysctl_nr_open)
1287 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1321 return -EPERM;
1288 !capable(CAP_SYS_RESOURCE))
1289 return -EPERM;
1290 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1291 return -EPERM;
1292
1293 retval = security_task_setrlimit(resource, &new_rlim);
1294 if (retval)
1295 return retval;
1296
1297 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1298 /*
1299 * The caller is asking for an immediate RLIMIT_CPU
1300 * expiry. But we use the zero value to mean "it was
1301 * never set". So let's cheat and make it one second
1302 * instead
1303 */
1304 new_rlim.rlim_cur = 1;
1305 } 1322 }
1306 1323
1307 task_lock(current->group_leader); 1324 /* protect tsk->signal and tsk->sighand from disappearing */
1308 *old_rlim = new_rlim; 1325 read_lock(&tasklist_lock);
1309 task_unlock(current->group_leader); 1326 if (!tsk->sighand) {
1310 1327 retval = -ESRCH;
1311 if (resource != RLIMIT_CPU)
1312 goto out; 1328 goto out;
1329 }
1330
1331 rlim = tsk->signal->rlim + resource;
1332 task_lock(tsk->group_leader);
1333 if (new_rlim) {
1334 if (new_rlim->rlim_max > rlim->rlim_max &&
1335 !capable(CAP_SYS_RESOURCE))
1336 retval = -EPERM;
1337 if (!retval)
1338 retval = security_task_setrlimit(tsk->group_leader,
1339 resource, new_rlim);
1340 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1341 /*
1342 * The caller is asking for an immediate RLIMIT_CPU
1343 * expiry. But we use the zero value to mean "it was
1344 * never set". So let's cheat and make it one second
1345 * instead
1346 */
1347 new_rlim->rlim_cur = 1;
1348 }
1349 }
1350 if (!retval) {
1351 if (old_rlim)
1352 *old_rlim = *rlim;
1353 if (new_rlim)
1354 *rlim = *new_rlim;
1355 }
1356 task_unlock(tsk->group_leader);
1313 1357
1314 /* 1358 /*
1315 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1359 * RLIMIT_CPU handling. Note that the kernel fails to return an error
@@ -1317,14 +1361,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1317 * very long-standing error, and fixing it now risks breakage of 1361 * very long-standing error, and fixing it now risks breakage of
1318 * applications, so we live with it 1362 * applications, so we live with it
1319 */ 1363 */
1320 if (new_rlim.rlim_cur == RLIM_INFINITY) 1364 if (!retval && new_rlim && resource == RLIMIT_CPU &&
1321 goto out; 1365 new_rlim->rlim_cur != RLIM_INFINITY)
1322 1366 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1323 update_rlimit_cpu(new_rlim.rlim_cur);
1324out: 1367out:
1368 read_unlock(&tasklist_lock);
1369 return retval;
1370}
1371
1372/* rcu lock must be held */
1373static int check_prlimit_permission(struct task_struct *task)
1374{
1375 const struct cred *cred = current_cred(), *tcred;
1376
1377 tcred = __task_cred(task);
1378 if ((cred->uid != tcred->euid ||
1379 cred->uid != tcred->suid ||
1380 cred->uid != tcred->uid ||
1381 cred->gid != tcred->egid ||
1382 cred->gid != tcred->sgid ||
1383 cred->gid != tcred->gid) &&
1384 !capable(CAP_SYS_RESOURCE)) {
1385 return -EPERM;
1386 }
1387
1325 return 0; 1388 return 0;
1326} 1389}
1327 1390
1391SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1392 const struct rlimit64 __user *, new_rlim,
1393 struct rlimit64 __user *, old_rlim)
1394{
1395 struct rlimit64 old64, new64;
1396 struct rlimit old, new;
1397 struct task_struct *tsk;
1398 int ret;
1399
1400 if (new_rlim) {
1401 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1402 return -EFAULT;
1403 rlim64_to_rlim(&new64, &new);
1404 }
1405
1406 rcu_read_lock();
1407 tsk = pid ? find_task_by_vpid(pid) : current;
1408 if (!tsk) {
1409 rcu_read_unlock();
1410 return -ESRCH;
1411 }
1412 ret = check_prlimit_permission(tsk);
1413 if (ret) {
1414 rcu_read_unlock();
1415 return ret;
1416 }
1417 get_task_struct(tsk);
1418 rcu_read_unlock();
1419
1420 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1421 old_rlim ? &old : NULL);
1422
1423 if (!ret && old_rlim) {
1424 rlim_to_rlim64(&old, &old64);
1425 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1426 ret = -EFAULT;
1427 }
1428
1429 put_task_struct(tsk);
1430 return ret;
1431}
1432
1433SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1434{
1435 struct rlimit new_rlim;
1436
1437 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1438 return -EFAULT;
1439 return do_prlimit(current, resource, &new_rlim, NULL);
1440}
1441
1328/* 1442/*
1329 * It would make sense to put struct rusage in the task_struct, 1443 * It would make sense to put struct rusage in the task_struct,
1330 * except that would make the task_struct be *really big*. After 1444 * except that would make the task_struct be *really big*. After
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
181 181
182/* performance counters: */ 182/* performance counters: */
183cond_syscall(sys_perf_event_open); 183cond_syscall(sys_perf_event_open);
184
185/* fanotify! */
186cond_syscall(sys_fanotify_init);
187cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..ca38e8e3e907 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,16 +44,17 @@
44#include <linux/times.h> 44#include <linux/times.h>
45#include <linux/limits.h> 45#include <linux/limits.h>
46#include <linux/dcache.h> 46#include <linux/dcache.h>
47#include <linux/dnotify.h>
47#include <linux/syscalls.h> 48#include <linux/syscalls.h>
48#include <linux/vmstat.h> 49#include <linux/vmstat.h>
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/acpi.h> 51#include <linux/acpi.h>
51#include <linux/reboot.h> 52#include <linux/reboot.h>
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/slow-work.h>
54#include <linux/perf_event.h> 54#include <linux/perf_event.h>
55#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/processor.h> 60#include <asm/processor.h>
@@ -76,15 +77,16 @@
76#include <scsi/sg.h> 77#include <scsi/sg.h>
77#endif 78#endif
78 79
80#ifdef CONFIG_LOCKUP_DETECTOR
81#include <linux/nmi.h>
82#endif
83
79 84
80#if defined(CONFIG_SYSCTL) 85#if defined(CONFIG_SYSCTL)
81 86
82/* External variables not in a header file. */ 87/* External variables not in a header file. */
83extern int sysctl_overcommit_memory; 88extern int sysctl_overcommit_memory;
84extern int sysctl_overcommit_ratio; 89extern int sysctl_overcommit_ratio;
85extern int sysctl_panic_on_oom;
86extern int sysctl_oom_kill_allocating_task;
87extern int sysctl_oom_dump_tasks;
88extern int max_threads; 90extern int max_threads;
89extern int core_uses_pid; 91extern int core_uses_pid;
90extern int suid_dumpable; 92extern int suid_dumpable;
@@ -106,7 +108,7 @@ extern int blk_iopoll_enabled;
106#endif 108#endif
107 109
108/* Constants used for minimum and maximum */ 110/* Constants used for minimum and maximum */
109#ifdef CONFIG_DETECT_SOFTLOCKUP 111#ifdef CONFIG_LOCKUP_DETECTOR
110static int sixty = 60; 112static int sixty = 60;
111static int neg_one = -1; 113static int neg_one = -1;
112#endif 114#endif
@@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
130 132
131static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
132 134
135#ifdef CONFIG_INOTIFY_USER
136#include <linux/inotify.h>
137#endif
133#ifdef CONFIG_SPARC 138#ifdef CONFIG_SPARC
134#include <asm/system.h> 139#include <asm/system.h>
135#endif 140#endif
@@ -206,9 +211,6 @@ static struct ctl_table fs_table[];
206static struct ctl_table debug_table[]; 211static struct ctl_table debug_table[];
207static struct ctl_table dev_table[]; 212static struct ctl_table dev_table[];
208extern struct ctl_table random_table[]; 213extern struct ctl_table random_table[];
209#ifdef CONFIG_INOTIFY_USER
210extern struct ctl_table inotify_table[];
211#endif
212#ifdef CONFIG_EPOLL 214#ifdef CONFIG_EPOLL
213extern struct ctl_table epoll_table[]; 215extern struct ctl_table epoll_table[];
214#endif 216#endif
@@ -562,7 +564,7 @@ static struct ctl_table kern_table[] = {
562 .extra2 = &one, 564 .extra2 = &one,
563 }, 565 },
564#endif 566#endif
565#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 567#ifdef CONFIG_HOTPLUG
566 { 568 {
567 .procname = "hotplug", 569 .procname = "hotplug",
568 .data = &uevent_helper, 570 .data = &uevent_helper,
@@ -710,7 +712,34 @@ static struct ctl_table kern_table[] = {
710 .mode = 0444, 712 .mode = 0444,
711 .proc_handler = proc_dointvec, 713 .proc_handler = proc_dointvec,
712 }, 714 },
713#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 715#if defined(CONFIG_LOCKUP_DETECTOR)
716 {
717 .procname = "watchdog",
718 .data = &watchdog_enabled,
719 .maxlen = sizeof (int),
720 .mode = 0644,
721 .proc_handler = proc_dowatchdog_enabled,
722 },
723 {
724 .procname = "watchdog_thresh",
725 .data = &softlockup_thresh,
726 .maxlen = sizeof(int),
727 .mode = 0644,
728 .proc_handler = proc_dowatchdog_thresh,
729 .extra1 = &neg_one,
730 .extra2 = &sixty,
731 },
732 {
733 .procname = "softlockup_panic",
734 .data = &softlockup_panic,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = proc_dointvec_minmax,
738 .extra1 = &zero,
739 .extra2 = &one,
740 },
741#endif
742#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
714 { 743 {
715 .procname = "unknown_nmi_panic", 744 .procname = "unknown_nmi_panic",
716 .data = &unknown_nmi_panic, 745 .data = &unknown_nmi_panic,
@@ -813,26 +842,6 @@ static struct ctl_table kern_table[] = {
813 .proc_handler = proc_dointvec, 842 .proc_handler = proc_dointvec,
814 }, 843 },
815#endif 844#endif
816#ifdef CONFIG_DETECT_SOFTLOCKUP
817 {
818 .procname = "softlockup_panic",
819 .data = &softlockup_panic,
820 .maxlen = sizeof(int),
821 .mode = 0644,
822 .proc_handler = proc_dointvec_minmax,
823 .extra1 = &zero,
824 .extra2 = &one,
825 },
826 {
827 .procname = "softlockup_thresh",
828 .data = &softlockup_thresh,
829 .maxlen = sizeof(int),
830 .mode = 0644,
831 .proc_handler = proc_dosoftlockup_thresh,
832 .extra1 = &neg_one,
833 .extra2 = &sixty,
834 },
835#endif
836#ifdef CONFIG_DETECT_HUNG_TASK 845#ifdef CONFIG_DETECT_HUNG_TASK
837 { 846 {
838 .procname = "hung_task_panic", 847 .procname = "hung_task_panic",
@@ -906,13 +915,6 @@ static struct ctl_table kern_table[] = {
906 .proc_handler = proc_dointvec, 915 .proc_handler = proc_dointvec,
907 }, 916 },
908#endif 917#endif
909#ifdef CONFIG_SLOW_WORK
910 {
911 .procname = "slow-work",
912 .mode = 0555,
913 .child = slow_work_sysctls,
914 },
915#endif
916#ifdef CONFIG_PERF_EVENTS 918#ifdef CONFIG_PERF_EVENTS
917 { 919 {
918 .procname = "perf_event_paranoid", 920 .procname = "perf_event_paranoid",
diff --git a/kernel/time.c b/kernel/time.c
index 848b1c2ab09a..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
300} 300}
301EXPORT_SYMBOL(timespec_trunc); 301EXPORT_SYMBOL(timespec_trunc);
302 302
303#ifndef CONFIG_GENERIC_TIME
304/*
305 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
306 * and therefore only yields usec accuracy
307 */
308void getnstimeofday(struct timespec *tv)
309{
310 struct timeval x;
311
312 do_gettimeofday(&x);
313 tv->tv_sec = x.tv_sec;
314 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
315}
316EXPORT_SYMBOL_GPL(getnstimeofday);
317#endif
318
319/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 303/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
320 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 304 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
321 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 305 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
6 6
7config NO_HZ 7config NO_HZ
8 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
9 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 9 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
10 select TICK_ONESHOT 10 select TICK_ONESHOT
11 help 11 help
12 This option enables a tickless system: timer interrupts will 12 This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
15 15
16config HIGH_RES_TIMERS 16config HIGH_RES_TIMERS
17 bool "High Resolution Timer Support" 17 bool "High Resolution Timer Support"
18 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 18 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
19 select TICK_ONESHOT 19 select TICK_ONESHOT
20 help 20 help
21 This option enables high resolution timer support. If your 21 This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 return max_nsecs - (max_nsecs >> 5); 531 return max_nsecs - (max_nsecs >> 5);
532} 532}
533 533
534#ifdef CONFIG_GENERIC_TIME 534#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
535 535
536/** 536/**
537 * clocksource_select - Select the best clocksource available 537 * clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
577 } 577 }
578} 578}
579 579
580#else /* CONFIG_GENERIC_TIME */ 580#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
581 581
582static inline void clocksource_select(void) { } 582static inline void clocksource_select(void) { }
583 583
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
639#define MAX_UPDATE_LENGTH 5 /* Seconds */ 639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640 640
641/** 641/**
642 * __clocksource_register_scale - Used to install new clocksources 642 * __clocksource_updatefreq_scale - Used to update clocksource with new freq
643 * @cs: clocksource to be registered 643 * @cs: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz 644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale 645 * @freq: clocksource frequency (cycles per second) divided by scale
646 * 646 *
647 * Returns -EBUSY if registration fails, zero otherwise. 647 * This should only be called from the clocksource->enable() method.
648 * 648 *
649 * This *SHOULD NOT* be called directly! Please use the 649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions. 650 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
651 */ 651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{ 653{
654
655 /* 654 /*
656 * Ideally we want to use some of the limits used in 655 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed 656 * clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
662 NSEC_PER_SEC/scale, 661 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale); 662 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 663 cs->max_idle_ns = clocksource_max_deferment(cs);
664}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
666
667/**
668 * __clocksource_register_scale - Used to install new clocksources
669 * @cs: clocksource to be registered
670 * @scale: Scale factor multiplied against freq to get clocksource hz
671 * @freq: clocksource frequency (cycles per second) divided by scale
672 *
673 * Returns -EBUSY if registration fails, zero otherwise.
674 *
675 * This *SHOULD NOT* be called directly! Please use the
676 * clocksource_register_hz() or clocksource_register_khz helper functions.
677 */
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{
680
681 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq);
665 683
684 /* Add clocksource to the clocksource list */
666 mutex_lock(&clocksource_mutex); 685 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs); 686 clocksource_enqueue(cs);
668 clocksource_select(); 687 clocksource_select();
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
188 /* 188 /*
189 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
190 * periodic mode. We read dev->next_event first and add to it 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event() 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really 192 * sets dev->next_event only when the event is really
193 * programmed to the device. 193 * programmed to the device.
194 */ 194 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..3e216e01bbd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
325 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
326 326
327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { 328 arch_needs_cpu(cpu)) {
329 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
330 delta_jiffies = 1; 330 delta_jiffies = 1;
331 } else { 331 } else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
405 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
409 /*
410 * sched tick not stopped!
411 */
412 cpumask_clear_cpu(cpu, nohz_cpu_mask);
413 goto out;
414 }
415 409
416 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
417 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void)
780{ 774{
781 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 775 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
782 ktime_t now = ktime_get(); 776 ktime_t now = ktime_get();
783 u64 offset;
784 777
785 /* 778 /*
786 * Emulate tick processing via per-CPU hrtimers: 779 * Emulate tick processing via per-CPU hrtimers:
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void)
790 783
791 /* Get the next period (per cpu) */ 784 /* Get the next period (per cpu) */
792 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 785 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
793 offset = ktime_to_ns(tick_period) >> 1;
794 do_div(offset, num_possible_cpus());
795 offset *= smp_processor_id();
796 hrtimer_add_expires_ns(&ts->sched_timer, offset);
797 786
798 for (;;) { 787 for (;;) {
799 hrtimer_forward(&ts->sched_timer, now, tick_period); 788 hrtimer_forward(&ts->sched_timer, now, tick_period);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..e14c839e9faa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
153 * - wall_to_monotonic is no longer the boot time, getboottime must be 153 * - wall_to_monotonic is no longer the boot time, getboottime must be
154 * used instead. 154 * used instead.
155 */ 155 */
156struct timespec xtime __attribute__ ((aligned (16))); 156static struct timespec xtime __attribute__ ((aligned (16)));
157struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
158static struct timespec total_sleep_time; 158static struct timespec total_sleep_time;
159 159
160/* 160/*
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond)
170{ 170{
171 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
172 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 173 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
174 timekeeper.mult);
174} 175}
175 176
176#ifdef CONFIG_GENERIC_TIME
177
178/** 177/**
179 * timekeeping_forward_now - update clock to the current time 178 * timekeeping_forward_now - update clock to the current time
180 * 179 *
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
328 timekeeper.ntp_error = 0; 327 timekeeper.ntp_error = 0;
329 ntp_clear(); 328 ntp_clear();
330 329
331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 330 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
331 timekeeper.mult);
332 332
333 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
334 334
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
376 tick_clock_notify(); 376 tick_clock_notify();
377} 377}
378 378
379#else /* GENERIC_TIME */
380
381static inline void timekeeping_forward_now(void) { }
382
383/**
384 * ktime_get - get the monotonic time in ktime_t format
385 *
386 * returns the time in ktime_t format
387 */
388ktime_t ktime_get(void)
389{
390 struct timespec now;
391
392 ktime_get_ts(&now);
393
394 return timespec_to_ktime(now);
395}
396EXPORT_SYMBOL_GPL(ktime_get);
397
398/**
399 * ktime_get_ts - get the monotonic clock in timespec format
400 * @ts: pointer to timespec variable
401 *
402 * The function calculates the monotonic clock from the realtime
403 * clock and the wall_to_monotonic offset and stores the result
404 * in normalized timespec format in the variable pointed to by @ts.
405 */
406void ktime_get_ts(struct timespec *ts)
407{
408 struct timespec tomono;
409 unsigned long seq;
410
411 do {
412 seq = read_seqbegin(&xtime_lock);
413 getnstimeofday(ts);
414 tomono = wall_to_monotonic;
415
416 } while (read_seqretry(&xtime_lock, seq));
417
418 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
419 ts->tv_nsec + tomono.tv_nsec);
420}
421EXPORT_SYMBOL_GPL(ktime_get_ts);
422
423#endif /* !GENERIC_TIME */
424
425/** 379/**
426 * ktime_get_real - get the real (wall-) time in ktime_t format 380 * ktime_get_real - get the real (wall-) time in ktime_t format
427 * 381 *
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev)
579 533
580 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
581 ts = timespec_sub(ts, timekeeping_suspend_time); 535 ts = timespec_sub(ts, timekeeping_suspend_time);
582 xtime = timespec_add_safe(xtime, ts); 536 xtime = timespec_add(xtime, ts);
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 538 total_sleep_time = timespec_add(total_sleep_time, ts);
585 } 539 }
586 /* re-base the last cycle value */ 540 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -784,10 +738,11 @@ void update_wall_time(void)
784 return; 738 return;
785 739
786 clock = timekeeper.clock; 740 clock = timekeeper.clock;
787#ifdef CONFIG_GENERIC_TIME 741
788 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 742#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
789#else
790 offset = timekeeper.cycle_interval; 743 offset = timekeeper.cycle_interval;
744#else
745 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
791#endif 746#endif
792 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 747 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
793 748
@@ -856,7 +811,8 @@ void update_wall_time(void)
856 } 811 }
857 812
858 /* check to see if there is a new clocksource to use */ 813 /* check to see if there is a new clocksource to use */
859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 814 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
815 timekeeper.mult);
860} 816}
861 817
862/** 818/**
@@ -887,7 +843,7 @@ EXPORT_SYMBOL_GPL(getboottime);
887 */ 843 */
888void monotonic_to_bootbased(struct timespec *ts) 844void monotonic_to_bootbased(struct timespec *ts)
889{ 845{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 846 *ts = timespec_add(*ts, total_sleep_time);
891} 847}
892EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 848EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
893 849
@@ -902,6 +858,11 @@ struct timespec __current_kernel_time(void)
902 return xtime; 858 return xtime;
903} 859}
904 860
861struct timespec __get_wall_to_monotonic(void)
862{
863 return wall_to_monotonic;
864}
865
905struct timespec current_kernel_time(void) 866struct timespec current_kernel_time(void)
906{ 867{
907 struct timespec now; 868 struct timespec now;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..f1b8afe1ad86 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/* 91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of 92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB for 93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * the new flag to indicate whether the timer is deferrable 94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
95 */ 100 */
96#define TBASE_DEFERRABLE_FLAG (0x1) 101#define TBASE_DEFERRABLE_FLAG (0x1)
97 102
@@ -577,6 +582,19 @@ static void __init_timer(struct timer_list *timer,
577 lockdep_init_map(&timer->lockdep_map, name, key, 0); 582 lockdep_init_map(&timer->lockdep_map, name, key, 0);
578} 583}
579 584
585void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
586 const char *name,
587 struct lock_class_key *key,
588 void (*function)(unsigned long),
589 unsigned long data)
590{
591 timer->function = function;
592 timer->data = data;
593 init_timer_on_stack_key(timer, name, key);
594 timer_set_deferrable(timer);
595}
596EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
597
580/** 598/**
581 * init_timer_key - initialize a timer 599 * init_timer_key - initialize a timer
582 * @timer: the timer to be initialized 600 * @timer: the timer to be initialized
@@ -679,12 +697,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
679 cpu = smp_processor_id(); 697 cpu = smp_processor_id();
680 698
681#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 699#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
682 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 700 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
683 int preferred_cpu = get_nohz_load_balancer(); 701 cpu = get_nohz_timer_target();
684
685 if (preferred_cpu >= 0)
686 cpu = preferred_cpu;
687 }
688#endif 702#endif
689 new_base = per_cpu(tvec_bases, cpu); 703 new_base = per_cpu(tvec_bases, cpu);
690 704
@@ -1289,7 +1303,6 @@ void run_local_timers(void)
1289{ 1303{
1290 hrtimer_run_queues(); 1304 hrtimer_run_queues();
1291 raise_softirq(TIMER_SOFTIRQ); 1305 raise_softirq(TIMER_SOFTIRQ);
1292 softlockup_tick();
1293} 1306}
1294 1307
1295/* 1308/*
@@ -1750,3 +1763,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
1750} 1763}
1751 1764
1752EXPORT_SYMBOL(msleep_interruptible); 1765EXPORT_SYMBOL(msleep_interruptible);
1766
1767static int __sched do_usleep_range(unsigned long min, unsigned long max)
1768{
1769 ktime_t kmin;
1770 unsigned long delta;
1771
1772 kmin = ktime_set(0, min * NSEC_PER_USEC);
1773 delta = (max - min) * NSEC_PER_USEC;
1774 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1775}
1776
1777/**
1778 * usleep_range - Drop in replacement for udelay where wakeup is flexible
1779 * @min: Minimum time in usecs to sleep
1780 * @max: Maximum time in usecs to sleep
1781 */
1782void usleep_range(unsigned long min, unsigned long max)
1783{
1784 __set_current_state(TASK_UNINTERRUPTIBLE);
1785 do_usleep_range(min, max);
1786}
1787EXPORT_SYMBOL(usleep_range);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..538501c6ea50 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -153,7 +153,7 @@ config IRQSOFF_TRACER
153 bool "Interrupts-off Latency Tracer" 153 bool "Interrupts-off Latency Tracer"
154 default n 154 default n
155 depends on TRACE_IRQFLAGS_SUPPORT 155 depends on TRACE_IRQFLAGS_SUPPORT
156 depends on GENERIC_TIME 156 depends on !ARCH_USES_GETTIMEOFFSET
157 select TRACE_IRQFLAGS 157 select TRACE_IRQFLAGS
158 select GENERIC_TRACER 158 select GENERIC_TRACER
159 select TRACER_MAX_TRACE 159 select TRACER_MAX_TRACE
@@ -175,7 +175,7 @@ config IRQSOFF_TRACER
175config PREEMPT_TRACER 175config PREEMPT_TRACER
176 bool "Preemption-off Latency Tracer" 176 bool "Preemption-off Latency Tracer"
177 default n 177 default n
178 depends on GENERIC_TIME 178 depends on !ARCH_USES_GETTIMEOFFSET
179 depends on PREEMPT 179 depends on PREEMPT
180 select GENERIC_TRACER 180 select GENERIC_TRACER
181 select TRACER_MAX_TRACE 181 select TRACER_MAX_TRACE
@@ -194,15 +194,6 @@ config PREEMPT_TRACER
194 enabled. This option and the irqs-off timing option can be 194 enabled. This option and the irqs-off timing option can be
195 used together or separately.) 195 used together or separately.)
196 196
197config SYSPROF_TRACER
198 bool "Sysprof Tracer"
199 depends on X86
200 select GENERIC_TRACER
201 select CONTEXT_SWITCH_TRACER
202 help
203 This tracer provides the trace needed by the 'Sysprof' userspace
204 tool.
205
206config SCHED_TRACER 197config SCHED_TRACER
207 bool "Scheduling Latency Tracer" 198 bool "Scheduling Latency Tracer"
208 select GENERIC_TRACER 199 select GENERIC_TRACER
@@ -229,23 +220,6 @@ config FTRACE_SYSCALLS
229 help 220 help
230 Basic tracer to catch the syscall entry and exit events. 221 Basic tracer to catch the syscall entry and exit events.
231 222
232config BOOT_TRACER
233 bool "Trace boot initcalls"
234 select GENERIC_TRACER
235 select CONTEXT_SWITCH_TRACER
236 help
237 This tracer helps developers to optimize boot times: it records
238 the timings of the initcalls and traces key events and the identity
239 of tasks that can cause boot delays, such as context-switches.
240
241 Its aim is to be parsed by the scripts/bootgraph.pl tool to
242 produce pretty graphics about boot inefficiencies, giving a visual
243 representation of the delays during initcalls - but the raw
244 /debug/tracing/trace text output is readable too.
245
246 You must pass in initcall_debug and ftrace=initcall to the kernel
247 command line to enable this on bootup.
248
249config TRACE_BRANCH_PROFILING 223config TRACE_BRANCH_PROFILING
250 bool 224 bool
251 select GENERIC_TRACER 225 select GENERIC_TRACER
@@ -325,28 +299,6 @@ config BRANCH_TRACER
325 299
326 Say N if unsure. 300 Say N if unsure.
327 301
328config KSYM_TRACER
329 bool "Trace read and write access on kernel memory locations"
330 depends on HAVE_HW_BREAKPOINT
331 select TRACING
332 help
333 This tracer helps find read and write operations on any given kernel
334 symbol i.e. /proc/kallsyms.
335
336config PROFILE_KSYM_TRACER
337 bool "Profile all kernel memory accesses on 'watched' variables"
338 depends on KSYM_TRACER
339 help
340 This tracer profiles kernel accesses on variables watched through the
341 ksym tracer ftrace plugin. Depending upon the hardware, all read
342 and write operations on kernel variables can be monitored for
343 accesses.
344
345 The results will be displayed in:
346 /debugfs/tracing/profile_ksym
347
348 Say N if unsure.
349
350config STACK_TRACER 302config STACK_TRACER
351 bool "Trace max stack" 303 bool "Trace max stack"
352 depends on HAVE_FUNCTION_TRACER 304 depends on HAVE_FUNCTION_TRACER
@@ -371,37 +323,6 @@ config STACK_TRACER
371 323
372 Say N if unsure. 324 Say N if unsure.
373 325
374config KMEMTRACE
375 bool "Trace SLAB allocations"
376 select GENERIC_TRACER
377 help
378 kmemtrace provides tracing for slab allocator functions, such as
379 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
380 data is then fed to the userspace application in order to analyse
381 allocation hotspots, internal fragmentation and so on, making it
382 possible to see how well an allocator performs, as well as debug
383 and profile kernel code.
384
385 This requires an userspace application to use. See
386 Documentation/trace/kmemtrace.txt for more information.
387
388 Saying Y will make the kernel somewhat larger and slower. However,
389 if you disable kmemtrace at run-time or boot-time, the performance
390 impact is minimal (depending on the arch the kernel is built for).
391
392 If unsure, say N.
393
394config WORKQUEUE_TRACER
395 bool "Trace workqueues"
396 select GENERIC_TRACER
397 help
398 The workqueue tracer provides some statistical information
399 about each cpu workqueue thread such as the number of the
400 works inserted and executed since their creation. It can help
401 to evaluate the amount of work each of them has to perform.
402 For example it can help a developer to decide whether he should
403 choose a per-cpu workqueue instead of a singlethreaded one.
404
405config BLK_DEV_IO_TRACE 326config BLK_DEV_IO_TRACE
406 bool "Support for tracing block IO actions" 327 bool "Support for tracing block IO actions"
407 depends on SYSFS 328 depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
30obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
31obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
33obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
34obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 33obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
35obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 34obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
36obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 35obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
38obj-$(CONFIG_NOP_TRACER) += trace_nop.o 37obj-$(CONFIG_NOP_TRACER) += trace_nop.o
39obj-$(CONFIG_STACK_TRACER) += trace_stack.o 38obj-$(CONFIG_STACK_TRACER) += trace_stack.o
40obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
47ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif 52endif
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
59obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_EVENT_TRACING) += power-traces.o
56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif
60 59
61libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..0d88ce9b9fb8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1883 struct hlist_head *hhd; 1883 struct hlist_head *hhd;
1884 struct hlist_node *n; 1884 struct hlist_node *n;
1885 unsigned long key; 1885 unsigned long key;
1886 int resched;
1887 1886
1888 key = hash_long(ip, FTRACE_HASH_BITS); 1887 key = hash_long(ip, FTRACE_HASH_BITS);
1889 1888
@@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1897 * period. This syncs the hash iteration and freeing of items 1896 * period. This syncs the hash iteration and freeing of items
1898 * on the hash. rcu_read_lock is too dangerous here. 1897 * on the hash. rcu_read_lock is too dangerous here.
1899 */ 1898 */
1900 resched = ftrace_preempt_disable(); 1899 preempt_disable_notrace();
1901 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1900 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1902 if (entry->ip == ip) 1901 if (entry->ip == ip)
1903 entry->ops->func(ip, parent_ip, &entry->data); 1902 entry->ops->func(ip, parent_ip, &entry->data);
1904 } 1903 }
1905 ftrace_preempt_enable(resched); 1904 preempt_enable_notrace();
1906} 1905}
1907 1906
1908static struct ftrace_ops trace_probe_ops __read_mostly = 1907static struct ftrace_ops trace_probe_ops __read_mostly =
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <linux/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, minimalistic output than the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
46 struct trace_array *tr = kmemtrace_array;
47 struct kmemtrace_alloc_entry *entry;
48 struct ring_buffer_event *event;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
51 if (!event)
52 return;
53
54 entry = ring_buffer_event_data(event);
55 tracing_generic_entry_update(&entry->ent, 0, 0);
56
57 entry->ent.type = TRACE_KMEM_ALLOC;
58 entry->type_id = type_id;
59 entry->call_site = call_site;
60 entry->ptr = ptr;
61 entry->bytes_req = bytes_req;
62 entry->bytes_alloc = bytes_alloc;
63 entry->gfp_flags = gfp_flags;
64 entry->node = node;
65
66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
68
69 trace_wake_up();
70}
71
72static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
73 unsigned long call_site,
74 const void *ptr)
75{
76 struct ftrace_event_call *call = &event_kmem_free;
77 struct trace_array *tr = kmemtrace_array;
78 struct kmemtrace_free_entry *entry;
79 struct ring_buffer_event *event;
80
81 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
82 if (!event)
83 return;
84 entry = ring_buffer_event_data(event);
85 tracing_generic_entry_update(&entry->ent, 0, 0);
86
87 entry->ent.type = TRACE_KMEM_FREE;
88 entry->type_id = type_id;
89 entry->call_site = call_site;
90 entry->ptr = ptr;
91
92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
94
95 trace_wake_up();
96}
97
98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
100 const void *ptr,
101 size_t bytes_req,
102 size_t bytes_alloc,
103 gfp_t gfp_flags)
104{
105 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
106 bytes_req, bytes_alloc, gfp_flags, -1);
107}
108
109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
111 const void *ptr,
112 size_t bytes_req,
113 size_t bytes_alloc,
114 gfp_t gfp_flags)
115{
116 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
117 bytes_req, bytes_alloc, gfp_flags, -1);
118}
119
120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
122 const void *ptr,
123 size_t bytes_req,
124 size_t bytes_alloc,
125 gfp_t gfp_flags,
126 int node)
127{
128 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
129 bytes_req, bytes_alloc, gfp_flags, node);
130}
131
132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
134 const void *ptr,
135 size_t bytes_req,
136 size_t bytes_alloc,
137 gfp_t gfp_flags,
138 int node)
139{
140 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
141 bytes_req, bytes_alloc, gfp_flags, node);
142}
143
144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
146{
147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
148}
149
150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
152{
153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
154}
155
156static int kmemtrace_start_probes(void)
157{
158 int err;
159
160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
161 if (err)
162 return err;
163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
164 if (err)
165 return err;
166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
167 if (err)
168 return err;
169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
170 if (err)
171 return err;
172 err = register_trace_kfree(kmemtrace_kfree, NULL);
173 if (err)
174 return err;
175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
176
177 return err;
178}
179
180static void kmemtrace_stop_probes(void)
181{
182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
186 unregister_trace_kfree(kmemtrace_kfree, NULL);
187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
188}
189
190static int kmem_trace_init(struct trace_array *tr)
191{
192 kmemtrace_array = tr;
193
194 tracing_reset_online_cpus(tr);
195
196 kmemtrace_start_probes();
197
198 return 0;
199}
200
201static void kmem_trace_reset(struct trace_array *tr)
202{
203 kmemtrace_stop_probes();
204}
205
206static void kmemtrace_headers(struct seq_file *s)
207{
208 /* Don't need headers for the original kmemtrace output */
209 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
210 return;
211
212 seq_printf(s, "#\n");
213 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
214 " POINTER NODE CALLER\n");
215 seq_printf(s, "# FREE | | | | "
216 " | | | |\n");
217 seq_printf(s, "# |\n\n");
218}
219
220/*
221 * The following functions give the original output from kmemtrace,
222 * plus the origin CPU, since reordering occurs in-kernel now.
223 */
224
225#define KMEMTRACE_USER_ALLOC 0
226#define KMEMTRACE_USER_FREE 1
227
228struct kmemtrace_user_event {
229 u8 event_id;
230 u8 type_id;
231 u16 event_size;
232 u32 cpu;
233 u64 timestamp;
234 unsigned long call_site;
235 unsigned long ptr;
236};
237
238struct kmemtrace_user_event_alloc {
239 size_t bytes_req;
240 size_t bytes_alloc;
241 unsigned gfp_flags;
242 int node;
243};
244
245static enum print_line_t
246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
248{
249 struct trace_seq *s = &iter->seq;
250 struct kmemtrace_alloc_entry *entry;
251 int ret;
252
253 trace_assign_type(entry, iter->ent);
254
255 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
256 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
257 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
258 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
259 (unsigned long)entry->gfp_flags, entry->node);
260
261 if (!ret)
262 return TRACE_TYPE_PARTIAL_LINE;
263 return TRACE_TYPE_HANDLED;
264}
265
266static enum print_line_t
267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
269{
270 struct trace_seq *s = &iter->seq;
271 struct kmemtrace_free_entry *entry;
272 int ret;
273
274 trace_assign_type(entry, iter->ent);
275
276 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
277 entry->type_id, (void *)entry->call_site,
278 (unsigned long)entry->ptr);
279
280 if (!ret)
281 return TRACE_TYPE_PARTIAL_LINE;
282 return TRACE_TYPE_HANDLED;
283}
284
285static enum print_line_t
286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
288{
289 struct trace_seq *s = &iter->seq;
290 struct kmemtrace_alloc_entry *entry;
291 struct kmemtrace_user_event *ev;
292 struct kmemtrace_user_event_alloc *ev_alloc;
293
294 trace_assign_type(entry, iter->ent);
295
296 ev = trace_seq_reserve(s, sizeof(*ev));
297 if (!ev)
298 return TRACE_TYPE_PARTIAL_LINE;
299
300 ev->event_id = KMEMTRACE_USER_ALLOC;
301 ev->type_id = entry->type_id;
302 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
303 ev->cpu = iter->cpu;
304 ev->timestamp = iter->ts;
305 ev->call_site = entry->call_site;
306 ev->ptr = (unsigned long)entry->ptr;
307
308 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
309 if (!ev_alloc)
310 return TRACE_TYPE_PARTIAL_LINE;
311
312 ev_alloc->bytes_req = entry->bytes_req;
313 ev_alloc->bytes_alloc = entry->bytes_alloc;
314 ev_alloc->gfp_flags = entry->gfp_flags;
315 ev_alloc->node = entry->node;
316
317 return TRACE_TYPE_HANDLED;
318}
319
320static enum print_line_t
321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
323{
324 struct trace_seq *s = &iter->seq;
325 struct kmemtrace_free_entry *entry;
326 struct kmemtrace_user_event *ev;
327
328 trace_assign_type(entry, iter->ent);
329
330 ev = trace_seq_reserve(s, sizeof(*ev));
331 if (!ev)
332 return TRACE_TYPE_PARTIAL_LINE;
333
334 ev->event_id = KMEMTRACE_USER_FREE;
335 ev->type_id = entry->type_id;
336 ev->event_size = sizeof(*ev);
337 ev->cpu = iter->cpu;
338 ev->timestamp = iter->ts;
339 ev->call_site = entry->call_site;
340 ev->ptr = (unsigned long)entry->ptr;
341
342 return TRACE_TYPE_HANDLED;
343}
344
345/* The two other following provide a more minimalistic output */
346static enum print_line_t
347kmemtrace_print_alloc_compress(struct trace_iterator *iter)
348{
349 struct kmemtrace_alloc_entry *entry;
350 struct trace_seq *s = &iter->seq;
351 int ret;
352
353 trace_assign_type(entry, iter->ent);
354
355 /* Alloc entry */
356 ret = trace_seq_printf(s, " + ");
357 if (!ret)
358 return TRACE_TYPE_PARTIAL_LINE;
359
360 /* Type */
361 switch (entry->type_id) {
362 case KMEMTRACE_TYPE_KMALLOC:
363 ret = trace_seq_printf(s, "K ");
364 break;
365 case KMEMTRACE_TYPE_CACHE:
366 ret = trace_seq_printf(s, "C ");
367 break;
368 case KMEMTRACE_TYPE_PAGES:
369 ret = trace_seq_printf(s, "P ");
370 break;
371 default:
372 ret = trace_seq_printf(s, "? ");
373 }
374
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 /* Requested */
379 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
380 if (!ret)
381 return TRACE_TYPE_PARTIAL_LINE;
382
383 /* Allocated */
384 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
385 if (!ret)
386 return TRACE_TYPE_PARTIAL_LINE;
387
388 /* Flags
389 * TODO: would be better to see the name of the GFP flag names
390 */
391 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400 /* Node and call site*/
401 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
402 (void *)entry->call_site);
403 if (!ret)
404 return TRACE_TYPE_PARTIAL_LINE;
405
406 return TRACE_TYPE_HANDLED;
407}
408
409static enum print_line_t
410kmemtrace_print_free_compress(struct trace_iterator *iter)
411{
412 struct kmemtrace_free_entry *entry;
413 struct trace_seq *s = &iter->seq;
414 int ret;
415
416 trace_assign_type(entry, iter->ent);
417
418 /* Free entry */
419 ret = trace_seq_printf(s, " - ");
420 if (!ret)
421 return TRACE_TYPE_PARTIAL_LINE;
422
423 /* Type */
424 switch (entry->type_id) {
425 case KMEMTRACE_TYPE_KMALLOC:
426 ret = trace_seq_printf(s, "K ");
427 break;
428 case KMEMTRACE_TYPE_CACHE:
429 ret = trace_seq_printf(s, "C ");
430 break;
431 case KMEMTRACE_TYPE_PAGES:
432 ret = trace_seq_printf(s, "P ");
433 break;
434 default:
435 ret = trace_seq_printf(s, "? ");
436 }
437
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441 /* Skip requested/allocated/flags */
442 ret = trace_seq_printf(s, " ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445
446 /* Pointer to allocated */
447 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
448 if (!ret)
449 return TRACE_TYPE_PARTIAL_LINE;
450
451 /* Skip node and print call site*/
452 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
453 if (!ret)
454 return TRACE_TYPE_PARTIAL_LINE;
455
456 return TRACE_TYPE_HANDLED;
457}
458
459static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
460{
461 struct trace_entry *entry = iter->ent;
462
463 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
464 return TRACE_TYPE_UNHANDLED;
465
466 switch (entry->type) {
467 case TRACE_KMEM_ALLOC:
468 return kmemtrace_print_alloc_compress(iter);
469 case TRACE_KMEM_FREE:
470 return kmemtrace_print_free_compress(iter);
471 default:
472 return TRACE_TYPE_UNHANDLED;
473 }
474}
475
476static struct trace_event_functions kmem_trace_alloc_funcs = {
477 .trace = kmemtrace_print_alloc,
478 .binary = kmemtrace_print_alloc_user,
479};
480
481static struct trace_event kmem_trace_alloc = {
482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
487 .trace = kmemtrace_print_free,
488 .binary = kmemtrace_print_free_user,
489};
490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
496static struct tracer kmem_tracer __read_mostly = {
497 .name = "kmemtrace",
498 .init = kmem_trace_init,
499 .reset = kmem_trace_reset,
500 .print_line = kmemtrace_print_line,
501 .print_header = kmemtrace_headers,
502 .flags = &kmem_tracer_flags
503};
504
505void kmemtrace_init(void)
506{
507 /* earliest opportunity to start kmem tracing */
508}
509
510static int __init init_kmem_tracer(void)
511{
512 if (!register_ftrace_event(&kmem_trace_alloc)) {
513 pr_warning("Warning: could not register kmem events\n");
514 return 1;
515 }
516
517 if (!register_ftrace_event(&kmem_trace_free)) {
518 pr_warning("Warning: could not register kmem events\n");
519 return 1;
520 }
521
522 if (register_tracer(&kmem_tracer) != 0) {
523 pr_warning("Warning: could not register the kmem tracer\n");
524 return 1;
525 }
526
527 return 0;
528}
529device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..3632ce87674f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
443 */ 443 */
444struct ring_buffer_per_cpu { 444struct ring_buffer_per_cpu {
445 int cpu; 445 int cpu;
446 atomic_t record_disabled;
446 struct ring_buffer *buffer; 447 struct ring_buffer *buffer;
447 spinlock_t reader_lock; /* serialize readers */ 448 spinlock_t reader_lock; /* serialize readers */
448 arch_spinlock_t lock; 449 arch_spinlock_t lock;
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu {
462 unsigned long read; 463 unsigned long read;
463 u64 write_stamp; 464 u64 write_stamp;
464 u64 read_stamp; 465 u64 read_stamp;
465 atomic_t record_disabled;
466}; 466};
467 467
468struct ring_buffer { 468struct ring_buffer {
@@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void)
2242 2242
2243#endif 2243#endif
2244 2244
2245static DEFINE_PER_CPU(int, rb_need_resched);
2246
2247/** 2245/**
2248 * ring_buffer_lock_reserve - reserve a part of the buffer 2246 * ring_buffer_lock_reserve - reserve a part of the buffer
2249 * @buffer: the ring buffer to reserve from 2247 * @buffer: the ring buffer to reserve from
@@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2264{ 2262{
2265 struct ring_buffer_per_cpu *cpu_buffer; 2263 struct ring_buffer_per_cpu *cpu_buffer;
2266 struct ring_buffer_event *event; 2264 struct ring_buffer_event *event;
2267 int cpu, resched; 2265 int cpu;
2268 2266
2269 if (ring_buffer_flags != RB_BUFFERS_ON) 2267 if (ring_buffer_flags != RB_BUFFERS_ON)
2270 return NULL; 2268 return NULL;
2271 2269
2272 /* If we are tracing schedule, we don't want to recurse */ 2270 /* If we are tracing schedule, we don't want to recurse */
2273 resched = ftrace_preempt_disable(); 2271 preempt_disable_notrace();
2274 2272
2275 if (atomic_read(&buffer->record_disabled)) 2273 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck; 2274 goto out_nocheck;
@@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2295 if (!event) 2293 if (!event)
2296 goto out; 2294 goto out;
2297 2295
2298 /*
2299 * Need to store resched state on this cpu.
2300 * Only the first needs to.
2301 */
2302
2303 if (preempt_count() == 1)
2304 per_cpu(rb_need_resched, cpu) = resched;
2305
2306 return event; 2296 return event;
2307 2297
2308 out: 2298 out:
2309 trace_recursive_unlock(); 2299 trace_recursive_unlock();
2310 2300
2311 out_nocheck: 2301 out_nocheck:
2312 ftrace_preempt_enable(resched); 2302 preempt_enable_notrace();
2313 return NULL; 2303 return NULL;
2314} 2304}
2315EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2305EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2355 2345
2356 trace_recursive_unlock(); 2346 trace_recursive_unlock();
2357 2347
2358 /* 2348 preempt_enable_notrace();
2359 * Only the last preempt count needs to restore preemption.
2360 */
2361 if (preempt_count() == 1)
2362 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2363 else
2364 preempt_enable_no_resched_notrace();
2365 2349
2366 return 0; 2350 return 0;
2367} 2351}
@@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2469 2453
2470 trace_recursive_unlock(); 2454 trace_recursive_unlock();
2471 2455
2472 /* 2456 preempt_enable_notrace();
2473 * Only the last preempt count needs to restore preemption.
2474 */
2475 if (preempt_count() == 1)
2476 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2477 else
2478 preempt_enable_no_resched_notrace();
2479 2457
2480} 2458}
2481EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2459EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2501 struct ring_buffer_event *event; 2479 struct ring_buffer_event *event;
2502 void *body; 2480 void *body;
2503 int ret = -EBUSY; 2481 int ret = -EBUSY;
2504 int cpu, resched; 2482 int cpu;
2505 2483
2506 if (ring_buffer_flags != RB_BUFFERS_ON) 2484 if (ring_buffer_flags != RB_BUFFERS_ON)
2507 return -EBUSY; 2485 return -EBUSY;
2508 2486
2509 resched = ftrace_preempt_disable(); 2487 preempt_disable_notrace();
2510 2488
2511 if (atomic_read(&buffer->record_disabled)) 2489 if (atomic_read(&buffer->record_disabled))
2512 goto out; 2490 goto out;
@@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2536 2514
2537 ret = 0; 2515 ret = 0;
2538 out: 2516 out:
2539 ftrace_preempt_enable(resched); 2517 preempt_enable_notrace();
2540 2518
2541 return ret; 2519 return ret;
2542} 2520}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..ba14a22be4cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
104static cpumask_var_t __read_mostly tracing_buffer_mask; 104cpumask_var_t __read_mostly tracing_buffer_mask;
105
106#define for_each_tracing_cpu(cpu) \
107 for_each_cpu(cpu, tracing_buffer_mask)
108 105
109/* 106/*
110 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 107 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 341/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME; 344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
348 345
349static int trace_stop_count; 346static int trace_stop_count;
350static DEFINE_SPINLOCK(tracing_start_lock); 347static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +425,7 @@ static const char *trace_options[] = {
428 "latency-format", 425 "latency-format",
429 "sleep-time", 426 "sleep-time",
430 "graph-time", 427 "graph-time",
428 "record-cmd",
431 NULL 429 NULL
432}; 430};
433 431
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
659 return; 657 return;
660 658
661 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
660 if (!current_trace->use_max_tr) {
661 WARN_ON_ONCE(1);
662 return;
663 }
662 arch_spin_lock(&ftrace_max_lock); 664 arch_spin_lock(&ftrace_max_lock);
663 665
664 tr->buffer = max_tr.buffer; 666 tr->buffer = max_tr.buffer;
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
685 return; 687 return;
686 688
687 WARN_ON_ONCE(!irqs_disabled()); 689 WARN_ON_ONCE(!irqs_disabled());
690 if (!current_trace->use_max_tr) {
691 WARN_ON_ONCE(1);
692 return;
693 }
694
688 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
689 696
690 ftrace_disable_cpu(); 697 ftrace_disable_cpu();
@@ -729,18 +736,11 @@ __acquires(kernel_lock)
729 return -1; 736 return -1;
730 } 737 }
731 738
732 if (strlen(type->name) > MAX_TRACER_SIZE) { 739 if (strlen(type->name) >= MAX_TRACER_SIZE) {
733 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); 740 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
734 return -1; 741 return -1;
735 } 742 }
736 743
737 /*
738 * When this gets called we hold the BKL which means that
739 * preemption is disabled. Various trace selftests however
740 * need to disable and enable preemption for successful tests.
741 * So we drop the BKL here and grab it after the tests again.
742 */
743 unlock_kernel();
744 mutex_lock(&trace_types_lock); 744 mutex_lock(&trace_types_lock);
745 745
746 tracing_selftest_running = true; 746 tracing_selftest_running = true;
@@ -822,7 +822,6 @@ __acquires(kernel_lock)
822#endif 822#endif
823 823
824 out_unlock: 824 out_unlock:
825 lock_kernel();
826 return ret; 825 return ret;
827} 826}
828 827
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1331 1330
1332#endif /* CONFIG_STACKTRACE */ 1331#endif /* CONFIG_STACKTRACE */
1333 1332
1334static void
1335ftrace_trace_special(void *__tr,
1336 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1337 int pc)
1338{
1339 struct ftrace_event_call *call = &event_special;
1340 struct ring_buffer_event *event;
1341 struct trace_array *tr = __tr;
1342 struct ring_buffer *buffer = tr->buffer;
1343 struct special_entry *entry;
1344
1345 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1346 sizeof(*entry), 0, pc);
1347 if (!event)
1348 return;
1349 entry = ring_buffer_event_data(event);
1350 entry->arg1 = arg1;
1351 entry->arg2 = arg2;
1352 entry->arg3 = arg3;
1353
1354 if (!filter_check_discard(call, entry, buffer, event))
1355 trace_buffer_unlock_commit(buffer, event, 0, pc);
1356}
1357
1358void
1359__trace_special(void *__tr, void *__data,
1360 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1361{
1362 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1363}
1364
1365void
1366ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1367{
1368 struct trace_array *tr = &global_trace;
1369 struct trace_array_cpu *data;
1370 unsigned long flags;
1371 int cpu;
1372 int pc;
1373
1374 if (tracing_disabled)
1375 return;
1376
1377 pc = preempt_count();
1378 local_irq_save(flags);
1379 cpu = raw_smp_processor_id();
1380 data = tr->data[cpu];
1381
1382 if (likely(atomic_inc_return(&data->disabled) == 1))
1383 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1384
1385 atomic_dec(&data->disabled);
1386 local_irq_restore(flags);
1387}
1388
1389/** 1333/**
1390 * trace_vbprintk - write binary msg to tracing buffer 1334 * trace_vbprintk - write binary msg to tracing buffer
1391 * 1335 *
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1404 struct bprint_entry *entry; 1348 struct bprint_entry *entry;
1405 unsigned long flags; 1349 unsigned long flags;
1406 int disable; 1350 int disable;
1407 int resched;
1408 int cpu, len = 0, size, pc; 1351 int cpu, len = 0, size, pc;
1409 1352
1410 if (unlikely(tracing_selftest_running || tracing_disabled)) 1353 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1414 pause_graph_tracing(); 1357 pause_graph_tracing();
1415 1358
1416 pc = preempt_count(); 1359 pc = preempt_count();
1417 resched = ftrace_preempt_disable(); 1360 preempt_disable_notrace();
1418 cpu = raw_smp_processor_id(); 1361 cpu = raw_smp_processor_id();
1419 data = tr->data[cpu]; 1362 data = tr->data[cpu];
1420 1363
@@ -1452,7 +1395,7 @@ out_unlock:
1452 1395
1453out: 1396out:
1454 atomic_dec_return(&data->disabled); 1397 atomic_dec_return(&data->disabled);
1455 ftrace_preempt_enable(resched); 1398 preempt_enable_notrace();
1456 unpause_graph_tracing(); 1399 unpause_graph_tracing();
1457 1400
1458 return len; 1401 return len;
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1539} 1482}
1540EXPORT_SYMBOL_GPL(trace_vprintk); 1483EXPORT_SYMBOL_GPL(trace_vprintk);
1541 1484
1542enum trace_file_type {
1543 TRACE_FILE_LAT_FMT = 1,
1544 TRACE_FILE_ANNOTATE = 2,
1545};
1546
1547static void trace_iterator_increment(struct trace_iterator *iter) 1485static void trace_iterator_increment(struct trace_iterator *iter)
1548{ 1486{
1549 /* Don't allow ftrace to trace into the ring buffers */ 1487 /* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1641} 1579}
1642 1580
1643/* Find the next real entry, and increment the iterator to the next entry */ 1581/* Find the next real entry, and increment the iterator to the next entry */
1644static void *find_next_entry_inc(struct trace_iterator *iter) 1582void *trace_find_next_entry_inc(struct trace_iterator *iter)
1645{ 1583{
1646 iter->ent = __find_next_entry(iter, &iter->cpu, 1584 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts); 1585 &iter->lost_events, &iter->ts);
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1676 return NULL; 1614 return NULL;
1677 1615
1678 if (iter->idx < 0) 1616 if (iter->idx < 0)
1679 ent = find_next_entry_inc(iter); 1617 ent = trace_find_next_entry_inc(iter);
1680 else 1618 else
1681 ent = iter; 1619 ent = iter;
1682 1620
1683 while (ent && iter->idx < i) 1621 while (ent && iter->idx < i)
1684 ent = find_next_entry_inc(iter); 1622 ent = trace_find_next_entry_inc(iter);
1685 1623
1686 iter->pos = *pos; 1624 iter->pos = *pos;
1687 1625
1688 return ent; 1626 return ent;
1689} 1627}
1690 1628
1691static void tracing_iter_reset(struct trace_iterator *iter, int cpu) 1629void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1692{ 1630{
1693 struct trace_array *tr = iter->tr; 1631 struct trace_array *tr = iter->tr;
1694 struct ring_buffer_event *event; 1632 struct ring_buffer_event *event;
@@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter)
2049} 1987}
2050 1988
2051/* Called with trace_event_read_lock() held. */ 1989/* Called with trace_event_read_lock() held. */
2052static enum print_line_t print_trace_line(struct trace_iterator *iter) 1990enum print_line_t print_trace_line(struct trace_iterator *iter)
2053{ 1991{
2054 enum print_line_t ret; 1992 enum print_line_t ret;
2055 1993
@@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = {
2394 .open = show_traces_open, 2332 .open = show_traces_open,
2395 .read = seq_read, 2333 .read = seq_read,
2396 .release = seq_release, 2334 .release = seq_release,
2335 .llseek = seq_lseek,
2397}; 2336};
2398 2337
2399/* 2338/*
@@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = {
2487 .open = tracing_open_generic, 2426 .open = tracing_open_generic,
2488 .read = tracing_cpumask_read, 2427 .read = tracing_cpumask_read,
2489 .write = tracing_cpumask_write, 2428 .write = tracing_cpumask_write,
2429 .llseek = generic_file_llseek,
2490}; 2430};
2491 2431
2492static int tracing_trace_options_show(struct seq_file *m, void *v) 2432static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2562 trace_flags |= mask; 2502 trace_flags |= mask;
2563 else 2503 else
2564 trace_flags &= ~mask; 2504 trace_flags &= ~mask;
2505
2506 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled);
2565} 2508}
2566 2509
2567static ssize_t 2510static ssize_t
@@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2653static const struct file_operations tracing_readme_fops = { 2596static const struct file_operations tracing_readme_fops = {
2654 .open = tracing_open_generic, 2597 .open = tracing_open_generic,
2655 .read = tracing_readme_read, 2598 .read = tracing_readme_read,
2599 .llseek = generic_file_llseek,
2656}; 2600};
2657 2601
2658static ssize_t 2602static ssize_t
@@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2703static const struct file_operations tracing_saved_cmdlines_fops = { 2647static const struct file_operations tracing_saved_cmdlines_fops = {
2704 .open = tracing_open_generic, 2648 .open = tracing_open_generic,
2705 .read = tracing_saved_cmdlines_read, 2649 .read = tracing_saved_cmdlines_read,
2650 .llseek = generic_file_llseek,
2706}; 2651};
2707 2652
2708static ssize_t 2653static ssize_t
@@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
2798 if (ret < 0) 2743 if (ret < 0)
2799 return ret; 2744 return ret;
2800 2745
2746 if (!current_trace->use_max_tr)
2747 goto out;
2748
2801 ret = ring_buffer_resize(max_tr.buffer, size); 2749 ret = ring_buffer_resize(max_tr.buffer, size);
2802 if (ret < 0) { 2750 if (ret < 0) {
2803 int r; 2751 int r;
@@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
2825 return ret; 2773 return ret;
2826 } 2774 }
2827 2775
2776 max_tr.entries = size;
2777 out:
2828 global_trace.entries = size; 2778 global_trace.entries = size;
2829 2779
2830 return ret; 2780 return ret;
2831} 2781}
2832 2782
2783
2833/** 2784/**
2834 * tracing_update_buffers - used by tracing facility to expand ring buffers 2785 * tracing_update_buffers - used by tracing facility to expand ring buffers
2835 * 2786 *
@@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf)
2890 trace_branch_disable(); 2841 trace_branch_disable();
2891 if (current_trace && current_trace->reset) 2842 if (current_trace && current_trace->reset)
2892 current_trace->reset(tr); 2843 current_trace->reset(tr);
2893 2844 if (current_trace && current_trace->use_max_tr) {
2845 /*
2846 * We don't free the ring buffer. instead, resize it because
2847 * The max_tr ring buffer has some state (e.g. ring->clock) and
2848 * we want preserve it.
2849 */
2850 ring_buffer_resize(max_tr.buffer, 1);
2851 max_tr.entries = 1;
2852 }
2894 destroy_trace_option_files(topts); 2853 destroy_trace_option_files(topts);
2895 2854
2896 current_trace = t; 2855 current_trace = t;
2897 2856
2898 topts = create_trace_option_files(current_trace); 2857 topts = create_trace_option_files(current_trace);
2858 if (current_trace->use_max_tr) {
2859 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
2860 if (ret < 0)
2861 goto out;
2862 max_tr.entries = global_trace.entries;
2863 }
2899 2864
2900 if (t->init) { 2865 if (t->init) {
2901 ret = tracer_init(t, tr); 2866 ret = tracer_init(t, tr);
@@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3032 if (iter->trace->pipe_open) 2997 if (iter->trace->pipe_open)
3033 iter->trace->pipe_open(iter); 2998 iter->trace->pipe_open(iter);
3034 2999
3000 nonseekable_open(inode, filp);
3035out: 3001out:
3036 mutex_unlock(&trace_types_lock); 3002 mutex_unlock(&trace_types_lock);
3037 return ret; 3003 return ret;
@@ -3211,7 +3177,7 @@ waitagain:
3211 3177
3212 trace_event_read_lock(); 3178 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file); 3179 trace_access_lock(iter->cpu_file);
3214 while (find_next_entry_inc(iter) != NULL) { 3180 while (trace_find_next_entry_inc(iter) != NULL) {
3215 enum print_line_t ret; 3181 enum print_line_t ret;
3216 int len = iter->seq.len; 3182 int len = iter->seq.len;
3217 3183
@@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3294 if (ret != TRACE_TYPE_NO_CONSUME) 3260 if (ret != TRACE_TYPE_NO_CONSUME)
3295 trace_consume(iter); 3261 trace_consume(iter);
3296 rem -= count; 3262 rem -= count;
3297 if (!find_next_entry_inc(iter)) { 3263 if (!trace_find_next_entry_inc(iter)) {
3298 rem = 0; 3264 rem = 0;
3299 iter->ent = NULL; 3265 iter->ent = NULL;
3300 break; 3266 break;
@@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3350 if (ret <= 0) 3316 if (ret <= 0)
3351 goto out_err; 3317 goto out_err;
3352 3318
3353 if (!iter->ent && !find_next_entry_inc(iter)) { 3319 if (!iter->ent && !trace_find_next_entry_inc(iter)) {
3354 ret = -EFAULT; 3320 ret = -EFAULT;
3355 goto out_err; 3321 goto out_err;
3356 } 3322 }
@@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3477 } 3443 }
3478 3444
3479 tracing_start(); 3445 tracing_start();
3480 max_tr.entries = global_trace.entries;
3481 mutex_unlock(&trace_types_lock); 3446 mutex_unlock(&trace_types_lock);
3482 3447
3483 return cnt; 3448 return cnt;
@@ -3590,18 +3555,21 @@ static const struct file_operations tracing_max_lat_fops = {
3590 .open = tracing_open_generic, 3555 .open = tracing_open_generic,
3591 .read = tracing_max_lat_read, 3556 .read = tracing_max_lat_read,
3592 .write = tracing_max_lat_write, 3557 .write = tracing_max_lat_write,
3558 .llseek = generic_file_llseek,
3593}; 3559};
3594 3560
3595static const struct file_operations tracing_ctrl_fops = { 3561static const struct file_operations tracing_ctrl_fops = {
3596 .open = tracing_open_generic, 3562 .open = tracing_open_generic,
3597 .read = tracing_ctrl_read, 3563 .read = tracing_ctrl_read,
3598 .write = tracing_ctrl_write, 3564 .write = tracing_ctrl_write,
3565 .llseek = generic_file_llseek,
3599}; 3566};
3600 3567
3601static const struct file_operations set_tracer_fops = { 3568static const struct file_operations set_tracer_fops = {
3602 .open = tracing_open_generic, 3569 .open = tracing_open_generic,
3603 .read = tracing_set_trace_read, 3570 .read = tracing_set_trace_read,
3604 .write = tracing_set_trace_write, 3571 .write = tracing_set_trace_write,
3572 .llseek = generic_file_llseek,
3605}; 3573};
3606 3574
3607static const struct file_operations tracing_pipe_fops = { 3575static const struct file_operations tracing_pipe_fops = {
@@ -3610,17 +3578,20 @@ static const struct file_operations tracing_pipe_fops = {
3610 .read = tracing_read_pipe, 3578 .read = tracing_read_pipe,
3611 .splice_read = tracing_splice_read_pipe, 3579 .splice_read = tracing_splice_read_pipe,
3612 .release = tracing_release_pipe, 3580 .release = tracing_release_pipe,
3581 .llseek = no_llseek,
3613}; 3582};
3614 3583
3615static const struct file_operations tracing_entries_fops = { 3584static const struct file_operations tracing_entries_fops = {
3616 .open = tracing_open_generic, 3585 .open = tracing_open_generic,
3617 .read = tracing_entries_read, 3586 .read = tracing_entries_read,
3618 .write = tracing_entries_write, 3587 .write = tracing_entries_write,
3588 .llseek = generic_file_llseek,
3619}; 3589};
3620 3590
3621static const struct file_operations tracing_mark_fops = { 3591static const struct file_operations tracing_mark_fops = {
3622 .open = tracing_open_generic, 3592 .open = tracing_open_generic,
3623 .write = tracing_mark_write, 3593 .write = tracing_mark_write,
3594 .llseek = generic_file_llseek,
3624}; 3595};
3625 3596
3626static const struct file_operations trace_clock_fops = { 3597static const struct file_operations trace_clock_fops = {
@@ -3926,6 +3897,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3926static const struct file_operations tracing_stats_fops = { 3897static const struct file_operations tracing_stats_fops = {
3927 .open = tracing_open_generic, 3898 .open = tracing_open_generic,
3928 .read = tracing_stats_read, 3899 .read = tracing_stats_read,
3900 .llseek = generic_file_llseek,
3929}; 3901};
3930 3902
3931#ifdef CONFIG_DYNAMIC_FTRACE 3903#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3962,6 +3934,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3962static const struct file_operations tracing_dyn_info_fops = { 3934static const struct file_operations tracing_dyn_info_fops = {
3963 .open = tracing_open_generic, 3935 .open = tracing_open_generic,
3964 .read = tracing_read_dyn_info, 3936 .read = tracing_read_dyn_info,
3937 .llseek = generic_file_llseek,
3965}; 3938};
3966#endif 3939#endif
3967 3940
@@ -4115,6 +4088,7 @@ static const struct file_operations trace_options_fops = {
4115 .open = tracing_open_generic, 4088 .open = tracing_open_generic,
4116 .read = trace_options_read, 4089 .read = trace_options_read,
4117 .write = trace_options_write, 4090 .write = trace_options_write,
4091 .llseek = generic_file_llseek,
4118}; 4092};
4119 4093
4120static ssize_t 4094static ssize_t
@@ -4166,6 +4140,7 @@ static const struct file_operations trace_options_core_fops = {
4166 .open = tracing_open_generic, 4140 .open = tracing_open_generic,
4167 .read = trace_options_core_read, 4141 .read = trace_options_core_read,
4168 .write = trace_options_core_write, 4142 .write = trace_options_core_write,
4143 .llseek = generic_file_llseek,
4169}; 4144};
4170 4145
4171struct dentry *trace_create_file(const char *name, 4146struct dentry *trace_create_file(const char *name,
@@ -4355,9 +4330,6 @@ static __init int tracer_init_debugfs(void)
4355 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4330 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4356 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4331 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4357#endif 4332#endif
4358#ifdef CONFIG_SYSPROF_TRACER
4359 init_tracer_sysprof_debugfs(d_tracer);
4360#endif
4361 4333
4362 create_trace_options_dir(); 4334 create_trace_options_dir();
4363 4335
@@ -4414,7 +4386,7 @@ static struct notifier_block trace_die_notifier = {
4414 */ 4386 */
4415#define KERN_TRACE KERN_EMERG 4387#define KERN_TRACE KERN_EMERG
4416 4388
4417static void 4389void
4418trace_printk_seq(struct trace_seq *s) 4390trace_printk_seq(struct trace_seq *s)
4419{ 4391{
4420 /* Probably should print a warning here. */ 4392 /* Probably should print a warning here. */
@@ -4429,6 +4401,13 @@ trace_printk_seq(struct trace_seq *s)
4429 trace_seq_init(s); 4401 trace_seq_init(s);
4430} 4402}
4431 4403
4404void trace_init_global_iter(struct trace_iterator *iter)
4405{
4406 iter->tr = &global_trace;
4407 iter->trace = current_trace;
4408 iter->cpu_file = TRACE_PIPE_ALL_CPU;
4409}
4410
4432static void 4411static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) 4412__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4434{ 4413{
@@ -4454,8 +4433,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4454 if (disable_tracing) 4433 if (disable_tracing)
4455 ftrace_kill(); 4434 ftrace_kill();
4456 4435
4436 trace_init_global_iter(&iter);
4437
4457 for_each_tracing_cpu(cpu) { 4438 for_each_tracing_cpu(cpu) {
4458 atomic_inc(&global_trace.data[cpu]->disabled); 4439 atomic_inc(&iter.tr->data[cpu]->disabled);
4459 } 4440 }
4460 4441
4461 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 4442 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4504,7 +4485,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4504 iter.iter_flags |= TRACE_FILE_LAT_FMT; 4485 iter.iter_flags |= TRACE_FILE_LAT_FMT;
4505 iter.pos = -1; 4486 iter.pos = -1;
4506 4487
4507 if (find_next_entry_inc(&iter) != NULL) { 4488 if (trace_find_next_entry_inc(&iter) != NULL) {
4508 int ret; 4489 int ret;
4509 4490
4510 ret = print_trace_line(&iter); 4491 ret = print_trace_line(&iter);
@@ -4526,7 +4507,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4526 trace_flags |= old_userobj; 4507 trace_flags |= old_userobj;
4527 4508
4528 for_each_tracing_cpu(cpu) { 4509 for_each_tracing_cpu(cpu) {
4529 atomic_dec(&global_trace.data[cpu]->disabled); 4510 atomic_dec(&iter.tr->data[cpu]->disabled);
4530 } 4511 }
4531 tracing_on(); 4512 tracing_on();
4532 } 4513 }
@@ -4575,16 +4556,14 @@ __init static int tracer_alloc_buffers(void)
4575 4556
4576 4557
4577#ifdef CONFIG_TRACER_MAX_TRACE 4558#ifdef CONFIG_TRACER_MAX_TRACE
4578 max_tr.buffer = ring_buffer_alloc(ring_buf_size, 4559 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
4579 TRACE_BUFFER_FLAGS);
4580 if (!max_tr.buffer) { 4560 if (!max_tr.buffer) {
4581 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4561 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4582 WARN_ON(1); 4562 WARN_ON(1);
4583 ring_buffer_free(global_trace.buffer); 4563 ring_buffer_free(global_trace.buffer);
4584 goto out_free_cpumask; 4564 goto out_free_cpumask;
4585 } 4565 }
4586 max_tr.entries = ring_buffer_size(max_tr.buffer); 4566 max_tr.entries = 1;
4587 WARN_ON(max_tr.entries != global_trace.entries);
4588#endif 4567#endif
4589 4568
4590 /* Allocate the first page for all buffers */ 4569 /* Allocate the first page for all buffers */
@@ -4597,9 +4576,6 @@ __init static int tracer_alloc_buffers(void)
4597 4576
4598 register_tracer(&nop_trace); 4577 register_tracer(&nop_trace);
4599 current_trace = &nop_trace; 4578 current_trace = &nop_trace;
4600#ifdef CONFIG_BOOT_TRACER
4601 register_tracer(&boot_tracer);
4602#endif
4603 /* All seems OK, enable tracing */ 4579 /* All seems OK, enable tracing */
4604 tracing_disabled = 0; 4580 tracing_disabled = 0;
4605 4581
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..d39b3c5454a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h>
13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h> 12#include <linux/hw_breakpoint.h>
15
16#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
18 15
@@ -25,30 +22,17 @@ enum trace_type {
25 TRACE_STACK, 22 TRACE_STACK,
26 TRACE_PRINT, 23 TRACE_PRINT,
27 TRACE_BPRINT, 24 TRACE_BPRINT,
28 TRACE_SPECIAL,
29 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
30 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
31 TRACE_BRANCH, 27 TRACE_BRANCH,
32 TRACE_BOOT_CALL,
33 TRACE_BOOT_RET,
34 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 30 TRACE_USER_STACK,
37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE,
39 TRACE_BLK, 31 TRACE_BLK,
40 TRACE_KSYM,
41 32
42 __TRACE_LAST_TYPE, 33 __TRACE_LAST_TYPE,
43}; 34};
44 35
45enum kmemtrace_type_id {
46 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49};
50
51extern struct tracer boot_tracer;
52 36
53#undef __field 37#undef __field
54#define __field(type, item) type item; 38#define __field(type, item) type item;
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void);
204 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 188 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
205 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 189 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
206 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 190 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
207 IF_ASSIGN(var, ent, struct special_entry, 0); \
208 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 191 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
209 TRACE_MMIO_RW); \ 192 TRACE_MMIO_RW); \
210 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 193 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
211 TRACE_MMIO_MAP); \ 194 TRACE_MMIO_MAP); \
212 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
213 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
214 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ 195 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
215 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ 196 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
216 TRACE_GRAPH_ENT); \ 197 TRACE_GRAPH_ENT); \
217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 198 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
218 TRACE_GRAPH_RET); \ 199 TRACE_GRAPH_RET); \
219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
220 TRACE_KMEM_ALLOC); \
221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
222 TRACE_KMEM_FREE); \
223 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
224 __ftrace_bad_type(); \ 200 __ftrace_bad_type(); \
225 } while (0) 201 } while (0)
226 202
@@ -298,6 +274,7 @@ struct tracer {
298 struct tracer *next; 274 struct tracer *next;
299 int print_max; 275 int print_max;
300 struct tracer_flags *flags; 276 struct tracer_flags *flags;
277 int use_max_tr;
301}; 278};
302 279
303 280
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
318 const struct file_operations *fops); 295 const struct file_operations *fops);
319 296
320struct dentry *tracing_init_dentry(void); 297struct dentry *tracing_init_dentry(void);
321void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
322 298
323struct ring_buffer_event; 299struct ring_buffer_event;
324 300
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
338struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 314struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
339 int *ent_cpu, u64 *ent_ts); 315 int *ent_cpu, u64 *ent_ts);
340 316
317int trace_empty(struct trace_iterator *iter);
318
319void *trace_find_next_entry_inc(struct trace_iterator *iter);
320
321void trace_init_global_iter(struct trace_iterator *iter);
322
323void tracing_iter_reset(struct trace_iterator *iter, int cpu);
324
341void default_wait_pipe(struct trace_iterator *iter); 325void default_wait_pipe(struct trace_iterator *iter);
342void poll_wait_pipe(struct trace_iterator *iter); 326void poll_wait_pipe(struct trace_iterator *iter);
343 327
@@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
355 struct task_struct *wakee, 339 struct task_struct *wakee,
356 struct task_struct *cur, 340 struct task_struct *cur,
357 unsigned long flags, int pc); 341 unsigned long flags, int pc);
358void trace_special(struct trace_array *tr,
359 struct trace_array_cpu *data,
360 unsigned long arg1,
361 unsigned long arg2,
362 unsigned long arg3, int pc);
363void trace_function(struct trace_array *tr, 342void trace_function(struct trace_array *tr,
364 unsigned long ip, 343 unsigned long ip,
365 unsigned long parent_ip, 344 unsigned long parent_ip,
@@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void);
380int register_tracer(struct tracer *type); 359int register_tracer(struct tracer *type);
381void unregister_tracer(struct tracer *type); 360void unregister_tracer(struct tracer *type);
382int is_tracing_stopped(void); 361int is_tracing_stopped(void);
362enum trace_file_type {
363 TRACE_FILE_LAT_FMT = 1,
364 TRACE_FILE_ANNOTATE = 2,
365};
366
367extern cpumask_var_t __read_mostly tracing_buffer_mask;
383 368
384extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); 369#define for_each_tracing_cpu(cpu) \
370 for_each_cpu(cpu, tracing_buffer_mask)
385 371
386extern unsigned long nsecs_to_usecs(unsigned long nsecs); 372extern unsigned long nsecs_to_usecs(unsigned long nsecs);
387 373
@@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
452 struct trace_array *tr); 438 struct trace_array *tr);
453extern int trace_selftest_startup_sched_switch(struct tracer *trace, 439extern int trace_selftest_startup_sched_switch(struct tracer *trace,
454 struct trace_array *tr); 440 struct trace_array *tr);
455extern int trace_selftest_startup_sysprof(struct tracer *trace,
456 struct trace_array *tr);
457extern int trace_selftest_startup_branch(struct tracer *trace, 441extern int trace_selftest_startup_branch(struct tracer *trace,
458 struct trace_array *tr); 442 struct trace_array *tr);
459extern int trace_selftest_startup_ksym(struct tracer *trace,
460 struct trace_array *tr);
461#endif /* CONFIG_FTRACE_STARTUP_TEST */ 443#endif /* CONFIG_FTRACE_STARTUP_TEST */
462 444
463extern void *head_page(struct trace_array_cpu *data); 445extern void *head_page(struct trace_array_cpu *data);
@@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr,
471 unsigned long ip, const char *fmt, va_list args); 453 unsigned long ip, const char *fmt, va_list args);
472int trace_array_printk(struct trace_array *tr, 454int trace_array_printk(struct trace_array *tr,
473 unsigned long ip, const char *fmt, ...); 455 unsigned long ip, const char *fmt, ...);
456void trace_printk_seq(struct trace_seq *s);
457enum print_line_t print_trace_line(struct trace_iterator *iter);
474 458
475extern unsigned long trace_flags; 459extern unsigned long trace_flags;
476 460
@@ -617,6 +601,7 @@ enum trace_iterator_flags {
617 TRACE_ITER_LATENCY_FMT = 0x20000, 601 TRACE_ITER_LATENCY_FMT = 0x20000,
618 TRACE_ITER_SLEEP_TIME = 0x40000, 602 TRACE_ITER_SLEEP_TIME = 0x40000,
619 TRACE_ITER_GRAPH_TIME = 0x80000, 603 TRACE_ITER_GRAPH_TIME = 0x80000,
604 TRACE_ITER_RECORD_CMD = 0x100000,
620}; 605};
621 606
622/* 607/*
@@ -628,54 +613,6 @@ enum trace_iterator_flags {
628 613
629extern struct tracer nop_trace; 614extern struct tracer nop_trace;
630 615
631/**
632 * ftrace_preempt_disable - disable preemption scheduler safe
633 *
634 * When tracing can happen inside the scheduler, there exists
635 * cases that the tracing might happen before the need_resched
636 * flag is checked. If this happens and the tracer calls
637 * preempt_enable (after a disable), a schedule might take place
638 * causing an infinite recursion.
639 *
640 * To prevent this, we read the need_resched flag before
641 * disabling preemption. When we want to enable preemption we
642 * check the flag, if it is set, then we call preempt_enable_no_resched.
643 * Otherwise, we call preempt_enable.
644 *
645 * The rational for doing the above is that if need_resched is set
646 * and we have yet to reschedule, we are either in an atomic location
647 * (where we do not need to check for scheduling) or we are inside
648 * the scheduler and do not want to resched.
649 */
650static inline int ftrace_preempt_disable(void)
651{
652 int resched;
653
654 resched = need_resched();
655 preempt_disable_notrace();
656
657 return resched;
658}
659
660/**
661 * ftrace_preempt_enable - enable preemption scheduler safe
662 * @resched: the return value from ftrace_preempt_disable
663 *
664 * This is a scheduler safe way to enable preemption and not miss
665 * any preemption checks. The disabled saved the state of preemption.
666 * If resched is set, then we are either inside an atomic or
667 * are inside the scheduler (we would have already scheduled
668 * otherwise). In this case, we do not want to call normal
669 * preempt_enable, but preempt_enable_no_resched instead.
670 */
671static inline void ftrace_preempt_enable(int resched)
672{
673 if (resched)
674 preempt_enable_no_resched_notrace();
675 else
676 preempt_enable_notrace();
677}
678
679#ifdef CONFIG_BRANCH_TRACER 616#ifdef CONFIG_BRANCH_TRACER
680extern int enable_branch_tracing(struct trace_array *tr); 617extern int enable_branch_tracing(struct trace_array *tr);
681extern void disable_branch_tracing(void); 618extern void disable_branch_tracing(void);
@@ -766,6 +703,8 @@ struct filter_pred {
766 int pop_n; 703 int pop_n;
767}; 704};
768 705
706extern struct list_head ftrace_common_fields;
707
769extern enum regex_type 708extern enum regex_type
770filter_parse_regex(char *buff, int len, char **search, int *not); 709filter_parse_regex(char *buff, int len, char **search, int *not);
771extern void print_event_filter(struct ftrace_event_call *call, 710extern void print_event_filter(struct ftrace_event_call *call,
@@ -795,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
795 return 0; 734 return 0;
796} 735}
797 736
737extern void trace_event_enable_cmd_record(bool enable);
738
798extern struct mutex event_mutex; 739extern struct mutex event_mutex;
799extern struct list_head ftrace_events; 740extern struct list_head ftrace_events;
800 741
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12#include <linux/time.h>
13
14#include "trace.h"
15#include "trace_output.h"
16
17static struct trace_array *boot_trace;
18static bool pre_initcalls_finished;
19
20/* Tells the boot tracer that the pre_smp_initcalls are finished.
21 * So we are ready .
22 * It doesn't enable sched events tracing however.
23 * You have to call enable_boot_trace to do so.
24 */
25void start_boot_trace(void)
26{
27 pre_initcalls_finished = true;
28}
29
30void enable_boot_trace(void)
31{
32 if (boot_trace && pre_initcalls_finished)
33 tracing_start_sched_switch_record();
34}
35
36void disable_boot_trace(void)
37{
38 if (boot_trace && pre_initcalls_finished)
39 tracing_stop_sched_switch_record();
40}
41
42static int boot_trace_init(struct trace_array *tr)
43{
44 boot_trace = tr;
45
46 if (!tr)
47 return 0;
48
49 tracing_reset_online_cpus(tr);
50
51 tracing_sched_switch_assign_trace(tr);
52 return 0;
53}
54
55static enum print_line_t
56initcall_call_print_line(struct trace_iterator *iter)
57{
58 struct trace_entry *entry = iter->ent;
59 struct trace_seq *s = &iter->seq;
60 struct trace_boot_call *field;
61 struct boot_trace_call *call;
62 u64 ts;
63 unsigned long nsec_rem;
64 int ret;
65
66 trace_assign_type(field, entry);
67 call = &field->boot_call;
68 ts = iter->ts;
69 nsec_rem = do_div(ts, NSEC_PER_SEC);
70
71 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
72 (unsigned long)ts, nsec_rem, call->func, call->caller);
73
74 if (!ret)
75 return TRACE_TYPE_PARTIAL_LINE;
76 else
77 return TRACE_TYPE_HANDLED;
78}
79
80static enum print_line_t
81initcall_ret_print_line(struct trace_iterator *iter)
82{
83 struct trace_entry *entry = iter->ent;
84 struct trace_seq *s = &iter->seq;
85 struct trace_boot_ret *field;
86 struct boot_trace_ret *init_ret;
87 u64 ts;
88 unsigned long nsec_rem;
89 int ret;
90
91 trace_assign_type(field, entry);
92 init_ret = &field->boot_ret;
93 ts = iter->ts;
94 nsec_rem = do_div(ts, NSEC_PER_SEC);
95
96 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
97 "returned %d after %llu msecs\n",
98 (unsigned long) ts,
99 nsec_rem,
100 init_ret->func, init_ret->result, init_ret->duration);
101
102 if (!ret)
103 return TRACE_TYPE_PARTIAL_LINE;
104 else
105 return TRACE_TYPE_HANDLED;
106}
107
108static enum print_line_t initcall_print_line(struct trace_iterator *iter)
109{
110 struct trace_entry *entry = iter->ent;
111
112 switch (entry->type) {
113 case TRACE_BOOT_CALL:
114 return initcall_call_print_line(iter);
115 case TRACE_BOOT_RET:
116 return initcall_ret_print_line(iter);
117 default:
118 return TRACE_TYPE_UNHANDLED;
119 }
120}
121
122struct tracer boot_tracer __read_mostly =
123{
124 .name = "initcall",
125 .init = boot_trace_init,
126 .reset = tracing_reset_online_cpus,
127 .print_line = initcall_print_line,
128};
129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{
132 struct ftrace_event_call *call = &event_boot_call;
133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace;
137
138 if (!tr || !pre_initcalls_finished)
139 return;
140
141 /* Get its name now since this function could
142 * disappear because it is in the .init section.
143 */
144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable();
146
147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
149 sizeof(*entry), 0, 0);
150 if (!event)
151 goto out;
152 entry = ring_buffer_event_data(event);
153 entry->boot_call = *bt;
154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
156 out:
157 preempt_enable();
158}
159
160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
161{
162 struct ftrace_event_call *call = &event_boot_ret;
163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
165 struct trace_boot_ret *entry;
166 struct trace_array *tr = boot_trace;
167
168 if (!tr || !pre_initcalls_finished)
169 return;
170
171 sprint_symbol(bt->func, (unsigned long)fn);
172 preempt_disable();
173
174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
176 sizeof(*entry), 0, 0);
177 if (!event)
178 goto out;
179 entry = ring_buffer_event_data(event);
180 entry->boot_ret = *bt;
181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
183 out:
184 preempt_enable();
185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
56 */ 55 */
57u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
58{ 57{
59 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
60} 59}
61 60
62 61
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
151); 151);
152 152
153/* 153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry: 154 * Stack-trace entry:
172 */ 155 */
173 156
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
271 __entry->map_id, __entry->opcode) 254 __entry->map_id, __entry->opcode)
272); 255);
273 256
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301 257
302#define TRACE_FUNC_SIZE 30 258#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20 259#define TRACE_FILE_SIZE 20
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 274 __entry->func, __entry->file, __entry->correct)
319); 275);
320 276
321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
322
323 TRACE_KMEM_ALLOC,
324
325 F_STRUCT(
326 __field( enum kmemtrace_type_id, type_id )
327 __field( unsigned long, call_site )
328 __field( const void *, ptr )
329 __field( size_t, bytes_req )
330 __field( size_t, bytes_alloc )
331 __field( gfp_t, gfp_flags )
332 __field( int, node )
333 ),
334
335 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
336 " flags:%x node:%d",
337 __entry->type_id, __entry->call_site, __entry->ptr,
338 __entry->bytes_req, __entry->bytes_alloc,
339 __entry->gfp_flags, __entry->node)
340);
341
342FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
343
344 TRACE_KMEM_FREE,
345
346 F_STRUCT(
347 __field( enum kmemtrace_type_id, type_id )
348 __field( unsigned long, call_site )
349 __field( const void *, ptr )
350 ),
351
352 F_printk("type:%u call_site:%lx ptr:%p",
353 __entry->type_id, __entry->call_site, __entry->ptr)
354);
355
356FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
357
358 TRACE_KSYM,
359
360 F_STRUCT(
361 __field( unsigned long, ip )
362 __field( unsigned char, type )
363 __array( char , cmd, TASK_COMM_LEN )
364 __field( unsigned long, addr )
365 ),
366
367 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
368 (void *)__entry->ip, (unsigned int)__entry->type,
369 (void *)__entry->addr, __entry->cmd)
370);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 8a2b73f7c068..000e6e85b445 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,8 +9,6 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
13
14static char *perf_trace_buf[4]; 12static char *perf_trace_buf[4];
15 13
16/* 14/*
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
56 } 54 }
57 } 55 }
58 56
59 if (tp_event->class->reg) 57 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret) 58 if (ret)
67 goto fail; 59 goto fail;
68 60
@@ -96,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event)
96 mutex_lock(&event_mutex); 88 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) { 89 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id && 90 if (tp_event->event.type == event_id &&
99 tp_event->class && 91 tp_event->class && tp_event->class->reg &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
104 break; 94 break;
@@ -138,18 +128,13 @@ void perf_trace_destroy(struct perf_event *p_event)
138 if (--tp_event->perf_refcount > 0) 128 if (--tp_event->perf_refcount > 0)
139 goto out; 129 goto out;
140 130
141 if (tp_event->class->reg) 131 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147 132
148 /* 133 /*
149 * Ensure our callback won't be called anymore. See 134 * Ensure our callback won't be called anymore. The buffers
150 * tracepoint_probe_unregister() and __DO_TRACE(). 135 * will be freed after that.
151 */ 136 */
152 synchronize_sched(); 137 tracepoint_synchronize_unregister();
153 138
154 free_percpu(tp_event->perf_events); 139 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL; 140 tp_event->perf_events = NULL;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..09b4fa6e4d3b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,6 +28,7 @@
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields);
31 32
32struct list_head * 33struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call) 34trace_get_fields(struct ftrace_event_call *event_call)
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call)
37 return event_call->class->get_fields(event_call); 38 return event_call->class->get_fields(event_call);
38} 39}
39 40
40int trace_define_field(struct ftrace_event_call *call, const char *type, 41static int __trace_define_field(struct list_head *head, const char *type,
41 const char *name, int offset, int size, int is_signed, 42 const char *name, int offset, int size,
42 int filter_type) 43 int is_signed, int filter_type)
43{ 44{
44 struct ftrace_event_field *field; 45 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
49 46
50 field = kzalloc(sizeof(*field), GFP_KERNEL); 47 field = kzalloc(sizeof(*field), GFP_KERNEL);
51 if (!field) 48 if (!field)
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
68 field->size = size; 65 field->size = size;
69 field->is_signed = is_signed; 66 field->is_signed = is_signed;
70 67
71 head = trace_get_fields(call);
72 list_add(&field->link, head); 68 list_add(&field->link, head);
73 69
74 return 0; 70 return 0;
@@ -80,17 +76,32 @@ err:
80 76
81 return -ENOMEM; 77 return -ENOMEM;
82} 78}
79
80int trace_define_field(struct ftrace_event_call *call, const char *type,
81 const char *name, int offset, int size, int is_signed,
82 int filter_type)
83{
84 struct list_head *head;
85
86 if (WARN_ON(!call->class))
87 return 0;
88
89 head = trace_get_fields(call);
90 return __trace_define_field(head, type, name, offset, size,
91 is_signed, filter_type);
92}
83EXPORT_SYMBOL_GPL(trace_define_field); 93EXPORT_SYMBOL_GPL(trace_define_field);
84 94
85#define __common_field(type, item) \ 95#define __common_field(type, item) \
86 ret = trace_define_field(call, #type, "common_" #item, \ 96 ret = __trace_define_field(&ftrace_common_fields, #type, \
87 offsetof(typeof(ent), item), \ 97 "common_" #item, \
88 sizeof(ent.item), \ 98 offsetof(typeof(ent), item), \
89 is_signed_type(type), FILTER_OTHER); \ 99 sizeof(ent.item), \
100 is_signed_type(type), FILTER_OTHER); \
90 if (ret) \ 101 if (ret) \
91 return ret; 102 return ret;
92 103
93static int trace_define_common_fields(struct ftrace_event_call *call) 104static int trace_define_common_fields(void)
94{ 105{
95 int ret; 106 int ret;
96 struct trace_entry ent; 107 struct trace_entry ent;
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call)
130} 141}
131EXPORT_SYMBOL_GPL(trace_event_raw_init); 142EXPORT_SYMBOL_GPL(trace_event_raw_init);
132 143
144int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
145{
146 switch (type) {
147 case TRACE_REG_REGISTER:
148 return tracepoint_probe_register(call->name,
149 call->class->probe,
150 call);
151 case TRACE_REG_UNREGISTER:
152 tracepoint_probe_unregister(call->name,
153 call->class->probe,
154 call);
155 return 0;
156
157#ifdef CONFIG_PERF_EVENTS
158 case TRACE_REG_PERF_REGISTER:
159 return tracepoint_probe_register(call->name,
160 call->class->perf_probe,
161 call);
162 case TRACE_REG_PERF_UNREGISTER:
163 tracepoint_probe_unregister(call->name,
164 call->class->perf_probe,
165 call);
166 return 0;
167#endif
168 }
169 return 0;
170}
171EXPORT_SYMBOL_GPL(ftrace_event_reg);
172
173void trace_event_enable_cmd_record(bool enable)
174{
175 struct ftrace_event_call *call;
176
177 mutex_lock(&event_mutex);
178 list_for_each_entry(call, &ftrace_events, list) {
179 if (!(call->flags & TRACE_EVENT_FL_ENABLED))
180 continue;
181
182 if (enable) {
183 tracing_start_cmdline_record();
184 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
185 } else {
186 tracing_stop_cmdline_record();
187 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
188 }
189 }
190 mutex_unlock(&event_mutex);
191}
192
133static int ftrace_event_enable_disable(struct ftrace_event_call *call, 193static int ftrace_event_enable_disable(struct ftrace_event_call *call,
134 int enable) 194 int enable)
135{ 195{
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
139 case 0: 199 case 0:
140 if (call->flags & TRACE_EVENT_FL_ENABLED) { 200 if (call->flags & TRACE_EVENT_FL_ENABLED) {
141 call->flags &= ~TRACE_EVENT_FL_ENABLED; 201 call->flags &= ~TRACE_EVENT_FL_ENABLED;
142 tracing_stop_cmdline_record(); 202 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
143 if (call->class->reg) 203 tracing_stop_cmdline_record();
144 call->class->reg(call, TRACE_REG_UNREGISTER); 204 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
145 else 205 }
146 tracepoint_probe_unregister(call->name, 206 call->class->reg(call, TRACE_REG_UNREGISTER);
147 call->class->probe,
148 call);
149 } 207 }
150 break; 208 break;
151 case 1: 209 case 1:
152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 210 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
153 tracing_start_cmdline_record(); 211 if (trace_flags & TRACE_ITER_RECORD_CMD) {
154 if (call->class->reg) 212 tracing_start_cmdline_record();
155 ret = call->class->reg(call, TRACE_REG_REGISTER); 213 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
156 else 214 }
157 ret = tracepoint_probe_register(call->name, 215 ret = call->class->reg(call, TRACE_REG_REGISTER);
158 call->class->probe,
159 call);
160 if (ret) { 216 if (ret) {
161 tracing_stop_cmdline_record(); 217 tracing_stop_cmdline_record();
162 pr_info("event trace: Could not enable event " 218 pr_info("event trace: Could not enable event "
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
194 mutex_lock(&event_mutex); 250 mutex_lock(&event_mutex);
195 list_for_each_entry(call, &ftrace_events, list) { 251 list_for_each_entry(call, &ftrace_events, list) {
196 252
197 if (!call->name || !call->class || 253 if (!call->name || !call->class || !call->class->reg)
198 (!call->class->probe && !call->class->reg))
199 continue; 254 continue;
200 255
201 if (match && 256 if (match &&
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
321 * The ftrace subsystem is for showing formats only. 376 * The ftrace subsystem is for showing formats only.
322 * They can not be enabled or disabled via the event files. 377 * They can not be enabled or disabled via the event files.
323 */ 378 */
324 if (call->class && (call->class->probe || call->class->reg)) 379 if (call->class && call->class->reg)
325 return call; 380 return call;
326 } 381 }
327 382
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
474 529
475 mutex_lock(&event_mutex); 530 mutex_lock(&event_mutex);
476 list_for_each_entry(call, &ftrace_events, list) { 531 list_for_each_entry(call, &ftrace_events, list) {
477 if (!call->name || !call->class || 532 if (!call->name || !call->class || !call->class->reg)
478 (!call->class->probe && !call->class->reg))
479 continue; 533 continue;
480 534
481 if (system && strcmp(call->class->system, system) != 0) 535 if (system && strcmp(call->class->system, system) != 0)
@@ -544,32 +598,10 @@ out:
544 return ret; 598 return ret;
545} 599}
546 600
547static ssize_t 601static void print_event_fields(struct trace_seq *s, struct list_head *head)
548event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
549 loff_t *ppos)
550{ 602{
551 struct ftrace_event_call *call = filp->private_data;
552 struct ftrace_event_field *field; 603 struct ftrace_event_field *field;
553 struct list_head *head;
554 struct trace_seq *s;
555 int common_field_count = 5;
556 char *buf;
557 int r = 0;
558
559 if (*ppos)
560 return 0;
561
562 s = kmalloc(sizeof(*s), GFP_KERNEL);
563 if (!s)
564 return -ENOMEM;
565
566 trace_seq_init(s);
567
568 trace_seq_printf(s, "name: %s\n", call->name);
569 trace_seq_printf(s, "ID: %d\n", call->event.type);
570 trace_seq_printf(s, "format:\n");
571 604
572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) { 605 list_for_each_entry_reverse(field, head, link) {
574 /* 606 /*
575 * Smartly shows the array type(except dynamic array). 607 * Smartly shows the array type(except dynamic array).
@@ -584,29 +616,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
584 array_descriptor = NULL; 616 array_descriptor = NULL;
585 617
586 if (!array_descriptor) { 618 if (!array_descriptor) {
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" 619 trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
588 "\tsize:%u;\tsigned:%d;\n", 620 "\tsize:%u;\tsigned:%d;\n",
589 field->type, field->name, field->offset, 621 field->type, field->name, field->offset,
590 field->size, !!field->is_signed); 622 field->size, !!field->is_signed);
591 } else { 623 } else {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" 624 trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
593 "\tsize:%u;\tsigned:%d;\n", 625 "\tsize:%u;\tsigned:%d;\n",
594 (int)(array_descriptor - field->type), 626 (int)(array_descriptor - field->type),
595 field->type, field->name, 627 field->type, field->name,
596 array_descriptor, field->offset, 628 array_descriptor, field->offset,
597 field->size, !!field->is_signed); 629 field->size, !!field->is_signed);
598 } 630 }
631 }
632}
599 633
600 if (--common_field_count == 0) 634static ssize_t
601 r = trace_seq_printf(s, "\n"); 635event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
636 loff_t *ppos)
637{
638 struct ftrace_event_call *call = filp->private_data;
639 struct list_head *head;
640 struct trace_seq *s;
641 char *buf;
642 int r;
602 643
603 if (!r) 644 if (*ppos)
604 break; 645 return 0;
605 } 646
647 s = kmalloc(sizeof(*s), GFP_KERNEL);
648 if (!s)
649 return -ENOMEM;
650
651 trace_seq_init(s);
652
653 trace_seq_printf(s, "name: %s\n", call->name);
654 trace_seq_printf(s, "ID: %d\n", call->event.type);
655 trace_seq_printf(s, "format:\n");
656
657 /* print common fields */
658 print_event_fields(s, &ftrace_common_fields);
606 659
607 if (r) 660 trace_seq_putc(s, '\n');
608 r = trace_seq_printf(s, "\nprint fmt: %s\n", 661
609 call->print_fmt); 662 /* print event specific fields */
663 head = trace_get_fields(call);
664 print_event_fields(s, head);
665
666 r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt);
610 667
611 if (!r) { 668 if (!r) {
612 /* 669 /*
@@ -963,35 +1020,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 return -1; 1020 return -1;
964 } 1021 }
965 1022
966 if (call->class->probe || call->class->reg) 1023 if (call->class->reg)
967 trace_create_file("enable", 0644, call->dir, call, 1024 trace_create_file("enable", 0644, call->dir, call,
968 enable); 1025 enable);
969 1026
970#ifdef CONFIG_PERF_EVENTS 1027#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg)) 1028 if (call->event.type && call->class->reg)
972 trace_create_file("id", 0444, call->dir, call, 1029 trace_create_file("id", 0444, call->dir, call,
973 id); 1030 id);
974#endif 1031#endif
975 1032
976 if (call->class->define_fields) { 1033 /*
977 /* 1034 * Other events may have the same class. Only update
978 * Other events may have the same class. Only update 1035 * the fields if they are not already defined.
979 * the fields if they are not already defined. 1036 */
980 */ 1037 head = trace_get_fields(call);
981 head = trace_get_fields(call); 1038 if (list_empty(head)) {
982 if (list_empty(head)) { 1039 ret = call->class->define_fields(call);
983 ret = trace_define_common_fields(call); 1040 if (ret < 0) {
984 if (!ret) 1041 pr_warning("Could not initialize trace point"
985 ret = call->class->define_fields(call); 1042 " events/%s\n", call->name);
986 if (ret < 0) { 1043 return ret;
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
991 } 1044 }
992 trace_create_file("filter", 0644, call->dir, call,
993 filter);
994 } 1045 }
1046 trace_create_file("filter", 0644, call->dir, call,
1047 filter);
995 1048
996 trace_create_file("format", 0444, call->dir, call, 1049 trace_create_file("format", 0444, call->dir, call,
997 format); 1050 format);
@@ -999,11 +1052,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
999 return 0; 1052 return 0;
1000} 1053}
1001 1054
1002static int __trace_add_event_call(struct ftrace_event_call *call) 1055static int
1056__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1057 const struct file_operations *id,
1058 const struct file_operations *enable,
1059 const struct file_operations *filter,
1060 const struct file_operations *format)
1003{ 1061{
1004 struct dentry *d_events; 1062 struct dentry *d_events;
1005 int ret; 1063 int ret;
1006 1064
1065 /* The linker may leave blanks */
1007 if (!call->name) 1066 if (!call->name)
1008 return -EINVAL; 1067 return -EINVAL;
1009 1068
@@ -1011,8 +1070,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1011 ret = call->class->raw_init(call); 1070 ret = call->class->raw_init(call);
1012 if (ret < 0) { 1071 if (ret < 0) {
1013 if (ret != -ENOSYS) 1072 if (ret != -ENOSYS)
1014 pr_warning("Could not initialize trace " 1073 pr_warning("Could not initialize trace events/%s\n",
1015 "events/%s\n", call->name); 1074 call->name);
1016 return ret; 1075 return ret;
1017 } 1076 }
1018 } 1077 }
@@ -1021,11 +1080,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1021 if (!d_events) 1080 if (!d_events)
1022 return -ENOENT; 1081 return -ENOENT;
1023 1082
1024 ret = event_create_dir(call, d_events, &ftrace_event_id_fops, 1083 ret = event_create_dir(call, d_events, id, enable, filter, format);
1025 &ftrace_enable_fops, &ftrace_event_filter_fops,
1026 &ftrace_event_format_fops);
1027 if (!ret) 1084 if (!ret)
1028 list_add(&call->list, &ftrace_events); 1085 list_add(&call->list, &ftrace_events);
1086 call->mod = mod;
1029 1087
1030 return ret; 1088 return ret;
1031} 1089}
@@ -1035,7 +1093,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
1035{ 1093{
1036 int ret; 1094 int ret;
1037 mutex_lock(&event_mutex); 1095 mutex_lock(&event_mutex);
1038 ret = __trace_add_event_call(call); 1096 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1097 &ftrace_enable_fops,
1098 &ftrace_event_filter_fops,
1099 &ftrace_event_format_fops);
1039 mutex_unlock(&event_mutex); 1100 mutex_unlock(&event_mutex);
1040 return ret; 1101 return ret;
1041} 1102}
@@ -1152,8 +1213,6 @@ static void trace_module_add_events(struct module *mod)
1152{ 1213{
1153 struct ftrace_module_file_ops *file_ops = NULL; 1214 struct ftrace_module_file_ops *file_ops = NULL;
1154 struct ftrace_event_call *call, *start, *end; 1215 struct ftrace_event_call *call, *start, *end;
1155 struct dentry *d_events;
1156 int ret;
1157 1216
1158 start = mod->trace_events; 1217 start = mod->trace_events;
1159 end = mod->trace_events + mod->num_trace_events; 1218 end = mod->trace_events + mod->num_trace_events;
@@ -1161,38 +1220,14 @@ static void trace_module_add_events(struct module *mod)
1161 if (start == end) 1220 if (start == end)
1162 return; 1221 return;
1163 1222
1164 d_events = event_trace_events_dir(); 1223 file_ops = trace_create_file_ops(mod);
1165 if (!d_events) 1224 if (!file_ops)
1166 return; 1225 return;
1167 1226
1168 for_each_event(call, start, end) { 1227 for_each_event(call, start, end) {
1169 /* The linker may leave blanks */ 1228 __trace_add_event_call(call, mod,
1170 if (!call->name)
1171 continue;
1172 if (call->class->raw_init) {
1173 ret = call->class->raw_init(call);
1174 if (ret < 0) {
1175 if (ret != -ENOSYS)
1176 pr_warning("Could not initialize trace "
1177 "point events/%s\n", call->name);
1178 continue;
1179 }
1180 }
1181 /*
1182 * This module has events, create file ops for this module
1183 * if not already done.
1184 */
1185 if (!file_ops) {
1186 file_ops = trace_create_file_ops(mod);
1187 if (!file_ops)
1188 return;
1189 }
1190 call->mod = mod;
1191 ret = event_create_dir(call, d_events,
1192 &file_ops->id, &file_ops->enable, 1229 &file_ops->id, &file_ops->enable,
1193 &file_ops->filter, &file_ops->format); 1230 &file_ops->filter, &file_ops->format);
1194 if (!ret)
1195 list_add(&call->list, &ftrace_events);
1196 } 1231 }
1197} 1232}
1198 1233
@@ -1319,25 +1354,14 @@ static __init int event_trace_init(void)
1319 trace_create_file("enable", 0644, d_events, 1354 trace_create_file("enable", 0644, d_events,
1320 NULL, &ftrace_system_enable_fops); 1355 NULL, &ftrace_system_enable_fops);
1321 1356
1357 if (trace_define_common_fields())
1358 pr_warning("tracing: Failed to allocate common fields");
1359
1322 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1360 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1323 /* The linker may leave blanks */ 1361 __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1324 if (!call->name)
1325 continue;
1326 if (call->class->raw_init) {
1327 ret = call->class->raw_init(call);
1328 if (ret < 0) {
1329 if (ret != -ENOSYS)
1330 pr_warning("Could not initialize trace "
1331 "point events/%s\n", call->name);
1332 continue;
1333 }
1334 }
1335 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1336 &ftrace_enable_fops, 1362 &ftrace_enable_fops,
1337 &ftrace_event_filter_fops, 1363 &ftrace_event_filter_fops,
1338 &ftrace_event_format_fops); 1364 &ftrace_event_format_fops);
1339 if (!ret)
1340 list_add(&call->list, &ftrace_events);
1341 } 1365 }
1342 1366
1343 while (true) { 1367 while (true) {
@@ -1524,12 +1548,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1524 struct ftrace_entry *entry; 1548 struct ftrace_entry *entry;
1525 unsigned long flags; 1549 unsigned long flags;
1526 long disabled; 1550 long disabled;
1527 int resched;
1528 int cpu; 1551 int cpu;
1529 int pc; 1552 int pc;
1530 1553
1531 pc = preempt_count(); 1554 pc = preempt_count();
1532 resched = ftrace_preempt_disable(); 1555 preempt_disable_notrace();
1533 cpu = raw_smp_processor_id(); 1556 cpu = raw_smp_processor_id();
1534 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1557 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1535 1558
@@ -1551,7 +1574,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1551 1574
1552 out: 1575 out:
1553 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1576 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1554 ftrace_preempt_enable(resched); 1577 preempt_enable_notrace();
1555} 1578}
1556 1579
1557static struct ftrace_ops trace_ops __initdata = 1580static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system,
497} 497}
498 498
499static struct ftrace_event_field * 499static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500__find_event_field(struct list_head *head, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
504 503
505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) { 504 list_for_each_entry(field, head, link) {
507 if (!strcmp(field->name, name)) 505 if (!strcmp(field->name, name))
508 return field; 506 return field;
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
511 return NULL; 509 return NULL;
512} 510}
513 511
512static struct ftrace_event_field *
513find_event_field(struct ftrace_event_call *call, char *name)
514{
515 struct ftrace_event_field *field;
516 struct list_head *head;
517
518 field = __find_event_field(&ftrace_common_fields, name);
519 if (field)
520 return field;
521
522 head = trace_get_fields(call);
523 return __find_event_field(head, name);
524}
525
514static void filter_free_pred(struct filter_pred *pred) 526static void filter_free_pred(struct filter_pred *pred)
515{ 527{
516 if (!pred) 528 if (!pred)
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system)
627 int err; 639 int err;
628 640
629 list_for_each_entry(call, &ftrace_events, list) { 641 list_for_each_entry(call, &ftrace_events, list) {
630 if (!call->class || !call->class->define_fields)
631 continue;
632
633 if (strcmp(call->class->system, system->name) != 0) 642 if (strcmp(call->class->system, system->name) != 0)
634 continue; 643 continue;
635 644
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
646 struct ftrace_event_call *call; 655 struct ftrace_event_call *call;
647 656
648 list_for_each_entry(call, &ftrace_events, list) { 657 list_for_each_entry(call, &ftrace_events, list) {
649 if (!call->class || !call->class->define_fields)
650 continue;
651
652 if (strcmp(call->class->system, system->name) != 0) 658 if (strcmp(call->class->system, system->name) != 0)
653 continue; 659 continue;
654 660
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system,
1251 list_for_each_entry(call, &ftrace_events, list) { 1257 list_for_each_entry(call, &ftrace_events, list) {
1252 struct event_filter *filter = call->filter; 1258 struct event_filter *filter = call->filter;
1253 1259
1254 if (!call->class || !call->class->define_fields)
1255 continue;
1256
1257 if (strcmp(call->class->system, system->name) != 0) 1260 if (strcmp(call->class->system, system->name) != 0)
1258 continue; 1261 continue;
1259 1262
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 8536e2a65969..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
125 125
126#include "trace_entries.h" 126#include "trace_entries.h"
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->class->fields);
131 return 0;
132}
133
134#undef __entry 128#undef __entry
135#define __entry REC 129#define __entry REC
136 130
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
158struct ftrace_event_class event_class_ftrace_##call = { \ 152struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 153 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 154 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \ 155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 156}; \
163 \ 157 \
164struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..6bff23625781 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
641 641
642 /* Print nsecs (we don't want to exceed 7 numbers) */ 642 /* Print nsecs (we don't want to exceed 7 numbers) */
643 if (len < 7) { 643 if (len < 7) {
644 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); 644 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
645 nsecs_rem);
645 ret = trace_seq_printf(s, ".%s", nsecs_str); 646 ret = trace_seq_printf(s, ".%s", nsecs_str);
646 if (!ret) 647 if (!ret)
647 return TRACE_TYPE_PARTIAL_LINE; 648 return TRACE_TYPE_PARTIAL_LINE;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..73a6b0601f2e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly =
649#endif 649#endif
650 .open = irqsoff_trace_open, 650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close, 651 .close = irqsoff_trace_close,
652 .use_max_tr = 1,
652}; 653};
653# define register_irqsoff(trace) register_tracer(&trace) 654# define register_irqsoff(trace) register_tracer(&trace)
654#else 655#else
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly =
681#endif 682#endif
682 .open = irqsoff_trace_open, 683 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close, 684 .close = irqsoff_trace_close,
685 .use_max_tr = 1,
684}; 686};
685# define register_preemptoff(trace) register_tracer(&trace) 687# define register_preemptoff(trace) register_tracer(&trace)
686#else 688#else
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
715#endif 717#endif
716 .open = irqsoff_trace_open, 718 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close, 719 .close = irqsoff_trace_close,
720 .use_max_tr = 1,
718}; 721};
719 722
720# define register_preemptirqsoff(trace) register_tracer(&trace) 723# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..7b8ecd751d93
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,136 @@
1/*
2 * kdb helper for dumping the ftrace buffer
3 *
4 * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
5 *
6 * ftrace_dump_buf based on ftrace_dump:
7 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
8 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
9 *
10 */
11#include <linux/init.h>
12#include <linux/kgdb.h>
13#include <linux/kdb.h>
14#include <linux/ftrace.h>
15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h"
18#include "trace_output.h"
19
20static void ftrace_dump_buf(int skip_lines, long cpu_file)
21{
22 /* use static because iter can be a bit big for the stack */
23 static struct trace_iterator iter;
24 unsigned int old_userobj;
25 int cnt = 0, cpu;
26
27 trace_init_global_iter(&iter);
28
29 for_each_tracing_cpu(cpu) {
30 atomic_inc(&iter.tr->data[cpu]->disabled);
31 }
32
33 old_userobj = trace_flags;
34
35 /* don't look at user memory in panic mode */
36 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
37
38 kdb_printf("Dumping ftrace buffer:\n");
39
40 /* reset all but tr, trace, and overruns */
41 memset(&iter.seq, 0,
42 sizeof(struct trace_iterator) -
43 offsetof(struct trace_iterator, seq));
44 iter.iter_flags |= TRACE_FILE_LAT_FMT;
45 iter.pos = -1;
46
47 if (cpu_file == TRACE_PIPE_ALL_CPU) {
48 for_each_tracing_cpu(cpu) {
49 iter.buffer_iter[cpu] =
50 ring_buffer_read_prepare(iter.tr->buffer, cpu);
51 ring_buffer_read_start(iter.buffer_iter[cpu]);
52 tracing_iter_reset(&iter, cpu);
53 }
54 } else {
55 iter.cpu_file = cpu_file;
56 iter.buffer_iter[cpu_file] =
57 ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
58 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
59 tracing_iter_reset(&iter, cpu_file);
60 }
61 if (!trace_empty(&iter))
62 trace_find_next_entry_inc(&iter);
63 while (!trace_empty(&iter)) {
64 if (!cnt)
65 kdb_printf("---------------------------------\n");
66 cnt++;
67
68 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
69 print_trace_line(&iter);
70 if (!skip_lines)
71 trace_printk_seq(&iter.seq);
72 else
73 skip_lines--;
74 if (KDB_FLAG(CMD_INTERRUPT))
75 goto out;
76 }
77
78 if (!cnt)
79 kdb_printf(" (ftrace buffer empty)\n");
80 else
81 kdb_printf("---------------------------------\n");
82
83out:
84 trace_flags = old_userobj;
85
86 for_each_tracing_cpu(cpu) {
87 atomic_dec(&iter.tr->data[cpu]->disabled);
88 }
89
90 for_each_tracing_cpu(cpu)
91 if (iter.buffer_iter[cpu])
92 ring_buffer_read_finish(iter.buffer_iter[cpu]);
93}
94
95/*
96 * kdb_ftdump - Dump the ftrace log buffer
97 */
98static int kdb_ftdump(int argc, const char **argv)
99{
100 int skip_lines = 0;
101 long cpu_file;
102 char *cp;
103
104 if (argc > 2)
105 return KDB_ARGCOUNT;
106
107 if (argc) {
108 skip_lines = simple_strtol(argv[1], &cp, 0);
109 if (*cp)
110 skip_lines = 0;
111 }
112
113 if (argc == 2) {
114 cpu_file = simple_strtol(argv[2], &cp, 0);
115 if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
116 !cpu_online(cpu_file))
117 return KDB_BADINT;
118 } else {
119 cpu_file = TRACE_PIPE_ALL_CPU;
120 }
121
122 kdb_trap_printk++;
123 ftrace_dump_buf(skip_lines, cpu_file);
124 kdb_trap_printk--;
125
126 return 0;
127}
128
129static __init int kdb_ftrace_register(void)
130{
131 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
132 "Dump ftrace log", 0, KDB_REPEAT_NONE);
133 return 0;
134}
135
136late_initcall(kdb_ftrace_register);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..8b27c9849b42 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,8 @@
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <linux/uaccess.h>
33#include <asm/bitsperlong.h> 35#include <asm/bitsperlong.h>
34 36
35#include "trace.h" 37#include "trace.h"
@@ -38,6 +40,7 @@
38#define MAX_TRACE_ARGS 128 40#define MAX_TRACE_ARGS 128
39#define MAX_ARGSTR_LEN 63 41#define MAX_ARGSTR_LEN 63
40#define MAX_EVENT_NAME_LEN 64 42#define MAX_EVENT_NAME_LEN 64
43#define MAX_STRING_SIZE PATH_MAX
41#define KPROBE_EVENT_SYSTEM "kprobes" 44#define KPROBE_EVENT_SYSTEM "kprobes"
42 45
43/* Reserved field names */ 46/* Reserved field names */
@@ -58,14 +61,16 @@ const char *reserved_field_names[] = {
58}; 61};
59 62
60/* Printing function type */ 63/* Printing function type */
61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); 64typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
65 void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type 66#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type 67#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64 68
65/* Printing in basic type function template */ 69/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 70#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 71static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\ 72 const char *name, \
73 void *data, void *ent)\
69{ \ 74{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 75 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \ 76} \
@@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) 85DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) 86DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82 87
88/* data_rloc: data relative location, compatible with u32 */
89#define make_data_rloc(len, roffs) \
90 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
91#define get_rloc_len(dl) ((u32)(dl) >> 16)
92#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
93
94static inline void *get_rloc_data(u32 *dl)
95{
96 return (u8 *)dl + get_rloc_offs(*dl);
97}
98
99/* For data_loc conversion */
100static inline void *get_loc_data(u32 *dl, void *ent)
101{
102 return (u8 *)ent + get_rloc_offs(*dl);
103}
104
105/*
106 * Convert data_rloc to data_loc:
107 * data_rloc stores the offset from data_rloc itself, but data_loc
108 * stores the offset from event entry.
109 */
110#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
111
112/* For defining macros, define string/string_size types */
113typedef u32 string;
114typedef u32 string_size;
115
116/* Print type function for string type */
117static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
118 const char *name,
119 void *data, void *ent)
120{
121 int len = *(u32 *)data >> 16;
122
123 if (!len)
124 return trace_seq_printf(s, " %s=(fault)", name);
125 else
126 return trace_seq_printf(s, " %s=\"%s\"", name,
127 (const char *)get_loc_data(data, ent));
128}
129static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
130
83/* Data fetch function type */ 131/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 132typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85 133
@@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
94 return fprm->fn(regs, fprm->data, dest); 142 return fprm->fn(regs, fprm->data, dest);
95} 143}
96 144
97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type 145#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
98/* 146/*
99 * Define macro for basic types - we don't need to define s* types, because 147 * Define macro for basic types - we don't need to define s* types, because
100 * we have to care only about bitwidth at recording time. 148 * we have to care only about bitwidth at recording time.
101 */ 149 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \ 150#define DEFINE_BASIC_FETCH_FUNCS(method) \
103DEFINE_FETCH_##kind(u8) \ 151DEFINE_FETCH_##method(u8) \
104DEFINE_FETCH_##kind(u16) \ 152DEFINE_FETCH_##method(u16) \
105DEFINE_FETCH_##kind(u32) \ 153DEFINE_FETCH_##method(u32) \
106DEFINE_FETCH_##kind(u64) 154DEFINE_FETCH_##method(u64)
107 155
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ 156#define CHECK_FETCH_FUNCS(method, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \ 157 (((FETCH_FUNC_NAME(method, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \ 158 (FETCH_FUNC_NAME(method, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \ 159 (FETCH_FUNC_NAME(method, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn)) 160 (FETCH_FUNC_NAME(method, u64) == fn) || \
161 (FETCH_FUNC_NAME(method, string) == fn) || \
162 (FETCH_FUNC_NAME(method, string_size) == fn)) \
163 && (fn != NULL))
113 164
114/* Data fetch function templates */ 165/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \ 166#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 167static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \ 168 void *offset, void *dest) \
118{ \ 169{ \
119 *(type *)dest = (type)regs_get_register(regs, \ 170 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \ 171 (unsigned int)((unsigned long)offset)); \
121} 172}
122DEFINE_BASIC_FETCH_FUNCS(reg) 173DEFINE_BASIC_FETCH_FUNCS(reg)
174/* No string on the register */
175#define fetch_reg_string NULL
176#define fetch_reg_string_size NULL
123 177
124#define DEFINE_FETCH_stack(type) \ 178#define DEFINE_FETCH_stack(type) \
125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 179static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
129 (unsigned int)((unsigned long)offset)); \ 183 (unsigned int)((unsigned long)offset)); \
130} 184}
131DEFINE_BASIC_FETCH_FUNCS(stack) 185DEFINE_BASIC_FETCH_FUNCS(stack)
186/* No string on the stack entry */
187#define fetch_stack_string NULL
188#define fetch_stack_string_size NULL
132 189
133#define DEFINE_FETCH_retval(type) \ 190#define DEFINE_FETCH_retval(type) \
134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 191static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
137 *(type *)dest = (type)regs_return_value(regs); \ 194 *(type *)dest = (type)regs_return_value(regs); \
138} 195}
139DEFINE_BASIC_FETCH_FUNCS(retval) 196DEFINE_BASIC_FETCH_FUNCS(retval)
197/* No string on the retval */
198#define fetch_retval_string NULL
199#define fetch_retval_string_size NULL
140 200
141#define DEFINE_FETCH_memory(type) \ 201#define DEFINE_FETCH_memory(type) \
142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 202static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
149 *(type *)dest = retval; \ 209 *(type *)dest = retval; \
150} 210}
151DEFINE_BASIC_FETCH_FUNCS(memory) 211DEFINE_BASIC_FETCH_FUNCS(memory)
212/*
213 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
214 * length and relative data location.
215 */
216static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
217 void *addr, void *dest)
218{
219 long ret;
220 int maxlen = get_rloc_len(*(u32 *)dest);
221 u8 *dst = get_rloc_data(dest);
222 u8 *src = addr;
223 mm_segment_t old_fs = get_fs();
224 if (!maxlen)
225 return;
226 /*
227 * Try to get string again, since the string can be changed while
228 * probing.
229 */
230 set_fs(KERNEL_DS);
231 pagefault_disable();
232 do
233 ret = __copy_from_user_inatomic(dst++, src++, 1);
234 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
235 dst[-1] = '\0';
236 pagefault_enable();
237 set_fs(old_fs);
238
239 if (ret < 0) { /* Failed to fetch string */
240 ((u8 *)get_rloc_data(dest))[0] = '\0';
241 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
242 } else
243 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
244 get_rloc_offs(*(u32 *)dest));
245}
246/* Return the length of string -- including null terminal byte */
247static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
248 void *addr, void *dest)
249{
250 int ret, len = 0;
251 u8 c;
252 mm_segment_t old_fs = get_fs();
253
254 set_fs(KERNEL_DS);
255 pagefault_disable();
256 do {
257 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
258 len++;
259 } while (c && ret == 0 && len < MAX_STRING_SIZE);
260 pagefault_enable();
261 set_fs(old_fs);
262
263 if (ret < 0) /* Failed to check the length */
264 *(u32 *)dest = 0;
265 else
266 *(u32 *)dest = len;
267}
152 268
153/* Memory fetching by symbol */ 269/* Memory fetching by symbol */
154struct symbol_cache { 270struct symbol_cache {
@@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
203 *(type *)dest = 0; \ 319 *(type *)dest = 0; \
204} 320}
205DEFINE_BASIC_FETCH_FUNCS(symbol) 321DEFINE_BASIC_FETCH_FUNCS(symbol)
322DEFINE_FETCH_symbol(string)
323DEFINE_FETCH_symbol(string_size)
206 324
207/* Dereference memory access function */ 325/* Dereference memory access function */
208struct deref_fetch_param { 326struct deref_fetch_param {
@@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
224 *(type *)dest = 0; \ 342 *(type *)dest = 0; \
225} 343}
226DEFINE_BASIC_FETCH_FUNCS(deref) 344DEFINE_BASIC_FETCH_FUNCS(deref)
345DEFINE_FETCH_deref(string)
346DEFINE_FETCH_deref(string_size)
227 347
228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 348static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
229{ 349{
230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) 350 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
231 free_deref_fetch_param(data->orig.data); 351 free_deref_fetch_param(data->orig.data);
232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) 352 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
233 free_symbol_cache(data->orig.data); 353 free_symbol_cache(data->orig.data);
234 kfree(data); 354 kfree(data);
235} 355}
@@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) 360#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) 361#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242 362
243#define ASSIGN_FETCH_FUNC(kind, type) \ 363/* Fetch types */
244 .kind = FETCH_FUNC_NAME(kind, type) 364enum {
245 365 FETCH_MTD_reg = 0,
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ 366 FETCH_MTD_stack,
247 {.name = #ptype, \ 367 FETCH_MTD_retval,
248 .size = sizeof(ftype), \ 368 FETCH_MTD_memory,
249 .is_signed = sign, \ 369 FETCH_MTD_symbol,
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \ 370 FETCH_MTD_deref,
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \ 371 FETCH_MTD_END,
252ASSIGN_FETCH_FUNC(reg, ftype), \ 372};
253ASSIGN_FETCH_FUNC(stack, ftype), \ 373
254ASSIGN_FETCH_FUNC(retval, ftype), \ 374#define ASSIGN_FETCH_FUNC(method, type) \
255ASSIGN_FETCH_FUNC(memory, ftype), \ 375 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
256ASSIGN_FETCH_FUNC(symbol, ftype), \ 376
257ASSIGN_FETCH_FUNC(deref, ftype), \ 377#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
378 {.name = _name, \
379 .size = _size, \
380 .is_signed = sign, \
381 .print = PRINT_TYPE_FUNC_NAME(ptype), \
382 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
383 .fmttype = _fmttype, \
384 .fetch = { \
385ASSIGN_FETCH_FUNC(reg, ftype), \
386ASSIGN_FETCH_FUNC(stack, ftype), \
387ASSIGN_FETCH_FUNC(retval, ftype), \
388ASSIGN_FETCH_FUNC(memory, ftype), \
389ASSIGN_FETCH_FUNC(symbol, ftype), \
390ASSIGN_FETCH_FUNC(deref, ftype), \
391 } \
258 } 392 }
259 393
394#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
395 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
396
397#define FETCH_TYPE_STRING 0
398#define FETCH_TYPE_STRSIZE 1
399
260/* Fetch type information table */ 400/* Fetch type information table */
261static const struct fetch_type { 401static const struct fetch_type {
262 const char *name; /* Name of type */ 402 const char *name; /* Name of type */
@@ -264,14 +404,16 @@ static const struct fetch_type {
264 int is_signed; /* Signed flag */ 404 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */ 405 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Fromat string */ 406 const char *fmt; /* Fromat string */
407 const char *fmttype; /* Name in format file */
267 /* Fetch functions */ 408 /* Fetch functions */
268 fetch_func_t reg; 409 fetch_func_t fetch[FETCH_MTD_END];
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = { 410} fetch_type_table[] = {
411 /* Special types */
412 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
413 sizeof(u32), 1, "__data_loc char[]"),
414 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
415 string_size, sizeof(u32), 0, "u32"),
416 /* Basic types */
275 ASSIGN_FETCH_TYPE(u8, u8, 0), 417 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0), 418 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0), 419 ASSIGN_FETCH_TYPE(u32, u32, 0),
@@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
302 *(unsigned long *)dest = kernel_stack_pointer(regs); 444 *(unsigned long *)dest = kernel_stack_pointer(regs);
303} 445}
304 446
447static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
448 fetch_func_t orig_fn)
449{
450 int i;
451
452 if (type != &fetch_type_table[FETCH_TYPE_STRING])
453 return NULL; /* Only string type needs size function */
454 for (i = 0; i < FETCH_MTD_END; i++)
455 if (type->fetch[i] == orig_fn)
456 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
457
458 WARN_ON(1); /* This should not happen */
459 return NULL;
460}
461
305/** 462/**
306 * Kprobe event core functions 463 * Kprobe event core functions
307 */ 464 */
308 465
309struct probe_arg { 466struct probe_arg {
310 struct fetch_param fetch; 467 struct fetch_param fetch;
468 struct fetch_param fetch_size;
311 unsigned int offset; /* Offset from argument entry */ 469 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */ 470 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */ 471 const char *comm; /* Command of this argument */
@@ -429,9 +587,9 @@ error:
429 587
430static void free_probe_arg(struct probe_arg *arg) 588static void free_probe_arg(struct probe_arg *arg)
431{ 589{
432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) 590 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data); 591 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) 592 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
435 free_symbol_cache(arg->fetch.data); 593 free_symbol_cache(arg->fetch.data);
436 kfree(arg->name); 594 kfree(arg->name);
437 kfree(arg->comm); 595 kfree(arg->comm);
@@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
548 706
549 if (strcmp(arg, "retval") == 0) { 707 if (strcmp(arg, "retval") == 0) {
550 if (is_return) 708 if (is_return)
551 f->fn = t->retval; 709 f->fn = t->fetch[FETCH_MTD_retval];
552 else 710 else
553 ret = -EINVAL; 711 ret = -EINVAL;
554 } else if (strncmp(arg, "stack", 5) == 0) { 712 } else if (strncmp(arg, "stack", 5) == 0) {
@@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
562 if (ret || param > PARAM_MAX_STACK) 720 if (ret || param > PARAM_MAX_STACK)
563 ret = -EINVAL; 721 ret = -EINVAL;
564 else { 722 else {
565 f->fn = t->stack; 723 f->fn = t->fetch[FETCH_MTD_stack];
566 f->data = (void *)param; 724 f->data = (void *)param;
567 } 725 }
568 } else 726 } else
@@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
588 case '%': /* named register */ 746 case '%': /* named register */
589 ret = regs_query_register_offset(arg + 1); 747 ret = regs_query_register_offset(arg + 1);
590 if (ret >= 0) { 748 if (ret >= 0) {
591 f->fn = t->reg; 749 f->fn = t->fetch[FETCH_MTD_reg];
592 f->data = (void *)(unsigned long)ret; 750 f->data = (void *)(unsigned long)ret;
593 ret = 0; 751 ret = 0;
594 } 752 }
@@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
598 ret = strict_strtoul(arg + 1, 0, &param); 756 ret = strict_strtoul(arg + 1, 0, &param);
599 if (ret) 757 if (ret)
600 break; 758 break;
601 f->fn = t->memory; 759 f->fn = t->fetch[FETCH_MTD_memory];
602 f->data = (void *)param; 760 f->data = (void *)param;
603 } else { 761 } else {
604 ret = split_symbol_offset(arg + 1, &offset); 762 ret = split_symbol_offset(arg + 1, &offset);
@@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
606 break; 764 break;
607 f->data = alloc_symbol_cache(arg + 1, offset); 765 f->data = alloc_symbol_cache(arg + 1, offset);
608 if (f->data) 766 if (f->data)
609 f->fn = t->symbol; 767 f->fn = t->fetch[FETCH_MTD_symbol];
610 } 768 }
611 break; 769 break;
612 case '+': /* deref memory */ 770 case '+': /* deref memory */
@@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
636 if (ret) 794 if (ret)
637 kfree(dprm); 795 kfree(dprm);
638 else { 796 else {
639 f->fn = t->deref; 797 f->fn = t->fetch[FETCH_MTD_deref];
640 f->data = (void *)dprm; 798 f->data = (void *)dprm;
641 } 799 }
642 } 800 }
643 break; 801 break;
644 } 802 }
645 if (!ret && !f->fn) 803 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
804 pr_info("%s type has no corresponding fetch method.\n",
805 t->name);
646 ret = -EINVAL; 806 ret = -EINVAL;
807 }
647 return ret; 808 return ret;
648} 809}
649 810
@@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return) 813 struct probe_arg *parg, int is_return)
653{ 814{
654 const char *t; 815 const char *t;
816 int ret;
655 817
656 if (strlen(arg) > MAX_ARGSTR_LEN) { 818 if (strlen(arg) > MAX_ARGSTR_LEN) {
657 pr_info("Argument is too long.: %s\n", arg); 819 pr_info("Argument is too long.: %s\n", arg);
@@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
674 } 836 }
675 parg->offset = tp->size; 837 parg->offset = tp->size;
676 tp->size += parg->type->size; 838 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 839 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
840 if (ret >= 0) {
841 parg->fetch_size.fn = get_fetch_size_function(parg->type,
842 parg->fetch.fn);
843 parg->fetch_size.data = parg->fetch.data;
844 }
845 return ret;
678} 846}
679 847
680/* Return 1 if name is reserved or already used by another argument */ 848/* Return 1 if name is reserved or already used by another argument */
@@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv)
757 pr_info("Delete command needs an event name.\n"); 925 pr_info("Delete command needs an event name.\n");
758 return -EINVAL; 926 return -EINVAL;
759 } 927 }
928 mutex_lock(&probe_lock);
760 tp = find_probe_event(event, group); 929 tp = find_probe_event(event, group);
761 if (!tp) { 930 if (!tp) {
931 mutex_unlock(&probe_lock);
762 pr_info("Event %s/%s doesn't exist.\n", group, event); 932 pr_info("Event %s/%s doesn't exist.\n", group, event);
763 return -ENOENT; 933 return -ENOENT;
764 } 934 }
765 /* delete an event */ 935 /* delete an event */
766 unregister_trace_probe(tp); 936 unregister_trace_probe(tp);
767 free_trace_probe(tp); 937 free_trace_probe(tp);
938 mutex_unlock(&probe_lock);
768 return 0; 939 return 0;
769 } 940 }
770 941
@@ -1043,6 +1214,54 @@ static const struct file_operations kprobe_profile_ops = {
1043 .release = seq_release, 1214 .release = seq_release,
1044}; 1215};
1045 1216
1217/* Sum up total data length for dynamic arraies (strings) */
1218static __kprobes int __get_data_size(struct trace_probe *tp,
1219 struct pt_regs *regs)
1220{
1221 int i, ret = 0;
1222 u32 len;
1223
1224 for (i = 0; i < tp->nr_args; i++)
1225 if (unlikely(tp->args[i].fetch_size.fn)) {
1226 call_fetch(&tp->args[i].fetch_size, regs, &len);
1227 ret += len;
1228 }
1229
1230 return ret;
1231}
1232
1233/* Store the value of each argument */
1234static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
1235 struct pt_regs *regs,
1236 u8 *data, int maxlen)
1237{
1238 int i;
1239 u32 end = tp->size;
1240 u32 *dl; /* Data (relative) location */
1241
1242 for (i = 0; i < tp->nr_args; i++) {
1243 if (unlikely(tp->args[i].fetch_size.fn)) {
1244 /*
1245 * First, we set the relative location and
1246 * maximum data length to *dl
1247 */
1248 dl = (u32 *)(data + tp->args[i].offset);
1249 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
1250 /* Then try to fetch string or dynamic array data */
1251 call_fetch(&tp->args[i].fetch, regs, dl);
1252 /* Reduce maximum length */
1253 end += get_rloc_len(*dl);
1254 maxlen -= get_rloc_len(*dl);
1255 /* Trick here, convert data_rloc to data_loc */
1256 *dl = convert_rloc_to_loc(*dl,
1257 ent_size + tp->args[i].offset);
1258 } else
1259 /* Just fetching data normally */
1260 call_fetch(&tp->args[i].fetch, regs,
1261 data + tp->args[i].offset);
1262 }
1263}
1264
1046/* Kprobe handler */ 1265/* Kprobe handler */
1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1266static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1048{ 1267{
@@ -1050,8 +1269,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1050 struct kprobe_trace_entry_head *entry; 1269 struct kprobe_trace_entry_head *entry;
1051 struct ring_buffer_event *event; 1270 struct ring_buffer_event *event;
1052 struct ring_buffer *buffer; 1271 struct ring_buffer *buffer;
1053 u8 *data; 1272 int size, dsize, pc;
1054 int size, i, pc;
1055 unsigned long irq_flags; 1273 unsigned long irq_flags;
1056 struct ftrace_event_call *call = &tp->call; 1274 struct ftrace_event_call *call = &tp->call;
1057 1275
@@ -1060,7 +1278,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1060 local_save_flags(irq_flags); 1278 local_save_flags(irq_flags);
1061 pc = preempt_count(); 1279 pc = preempt_count();
1062 1280
1063 size = sizeof(*entry) + tp->size; 1281 dsize = __get_data_size(tp, regs);
1282 size = sizeof(*entry) + tp->size + dsize;
1064 1283
1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1284 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1066 size, irq_flags, pc); 1285 size, irq_flags, pc);
@@ -1069,9 +1288,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1069 1288
1070 entry = ring_buffer_event_data(event); 1289 entry = ring_buffer_event_data(event);
1071 entry->ip = (unsigned long)kp->addr; 1290 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1]; 1291 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1073 for (i = 0; i < tp->nr_args; i++)
1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1075 1292
1076 if (!filter_current_check_discard(buffer, call, entry, event)) 1293 if (!filter_current_check_discard(buffer, call, entry, event))
1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1294 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1085,15 +1302,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1085 struct kretprobe_trace_entry_head *entry; 1302 struct kretprobe_trace_entry_head *entry;
1086 struct ring_buffer_event *event; 1303 struct ring_buffer_event *event;
1087 struct ring_buffer *buffer; 1304 struct ring_buffer *buffer;
1088 u8 *data; 1305 int size, pc, dsize;
1089 int size, i, pc;
1090 unsigned long irq_flags; 1306 unsigned long irq_flags;
1091 struct ftrace_event_call *call = &tp->call; 1307 struct ftrace_event_call *call = &tp->call;
1092 1308
1093 local_save_flags(irq_flags); 1309 local_save_flags(irq_flags);
1094 pc = preempt_count(); 1310 pc = preempt_count();
1095 1311
1096 size = sizeof(*entry) + tp->size; 1312 dsize = __get_data_size(tp, regs);
1313 size = sizeof(*entry) + tp->size + dsize;
1097 1314
1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1315 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1099 size, irq_flags, pc); 1316 size, irq_flags, pc);
@@ -1103,9 +1320,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1103 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1104 entry->func = (unsigned long)tp->rp.kp.addr; 1321 entry->func = (unsigned long)tp->rp.kp.addr;
1105 entry->ret_ip = (unsigned long)ri->ret_addr; 1322 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1]; 1323 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1107 for (i = 0; i < tp->nr_args; i++)
1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1109 1324
1110 if (!filter_current_check_discard(buffer, call, entry, event)) 1325 if (!filter_current_check_discard(buffer, call, entry, event))
1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1326 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1137,7 +1352,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1137 data = (u8 *)&field[1]; 1352 data = (u8 *)&field[1];
1138 for (i = 0; i < tp->nr_args; i++) 1353 for (i = 0; i < tp->nr_args; i++)
1139 if (!tp->args[i].type->print(s, tp->args[i].name, 1354 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset)) 1355 data + tp->args[i].offset, field))
1141 goto partial; 1356 goto partial;
1142 1357
1143 if (!trace_seq_puts(s, "\n")) 1358 if (!trace_seq_puts(s, "\n"))
@@ -1179,7 +1394,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1179 data = (u8 *)&field[1]; 1394 data = (u8 *)&field[1];
1180 for (i = 0; i < tp->nr_args; i++) 1395 for (i = 0; i < tp->nr_args; i++)
1181 if (!tp->args[i].type->print(s, tp->args[i].name, 1396 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset)) 1397 data + tp->args[i].offset, field))
1183 goto partial; 1398 goto partial;
1184 1399
1185 if (!trace_seq_puts(s, "\n")) 1400 if (!trace_seq_puts(s, "\n"))
@@ -1214,11 +1429,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1214 } 1429 }
1215} 1430}
1216 1431
1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1218{
1219 return 0;
1220}
1221
1222#undef DEFINE_FIELD 1432#undef DEFINE_FIELD
1223#define DEFINE_FIELD(type, item, name, is_signed) \ 1433#define DEFINE_FIELD(type, item, name, is_signed) \
1224 do { \ 1434 do { \
@@ -1239,7 +1449,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1449 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1240 /* Set argument names as fields */ 1450 /* Set argument names as fields */
1241 for (i = 0; i < tp->nr_args; i++) { 1451 for (i = 0; i < tp->nr_args; i++) {
1242 ret = trace_define_field(event_call, tp->args[i].type->name, 1452 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1243 tp->args[i].name, 1453 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset, 1454 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size, 1455 tp->args[i].type->size,
@@ -1261,7 +1471,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1471 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1262 /* Set argument names as fields */ 1472 /* Set argument names as fields */
1263 for (i = 0; i < tp->nr_args; i++) { 1473 for (i = 0; i < tp->nr_args; i++) {
1264 ret = trace_define_field(event_call, tp->args[i].type->name, 1474 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1265 tp->args[i].name, 1475 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset, 1476 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size, 1477 tp->args[i].type->size,
@@ -1301,8 +1511,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1511 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1302 1512
1303 for (i = 0; i < tp->nr_args; i++) { 1513 for (i = 0; i < tp->nr_args; i++) {
1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", 1514 if (strcmp(tp->args[i].type->name, "string") == 0)
1305 tp->args[i].name); 1515 pos += snprintf(buf + pos, LEN_OR_ZERO,
1516 ", __get_str(%s)",
1517 tp->args[i].name);
1518 else
1519 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1520 tp->args[i].name);
1306 } 1521 }
1307 1522
1308#undef LEN_OR_ZERO 1523#undef LEN_OR_ZERO
@@ -1339,11 +1554,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1339 struct ftrace_event_call *call = &tp->call; 1554 struct ftrace_event_call *call = &tp->call;
1340 struct kprobe_trace_entry_head *entry; 1555 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head; 1556 struct hlist_head *head;
1342 u8 *data; 1557 int size, __size, dsize;
1343 int size, __size, i;
1344 int rctx; 1558 int rctx;
1345 1559
1346 __size = sizeof(*entry) + tp->size; 1560 dsize = __get_data_size(tp, regs);
1561 __size = sizeof(*entry) + tp->size + dsize;
1347 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1562 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1348 size -= sizeof(u32); 1563 size -= sizeof(u32);
1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1564 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1355,9 +1570,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1355 return; 1570 return;
1356 1571
1357 entry->ip = (unsigned long)kp->addr; 1572 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1]; 1573 memset(&entry[1], 0, dsize);
1359 for (i = 0; i < tp->nr_args; i++) 1574 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1361 1575
1362 head = this_cpu_ptr(call->perf_events); 1576 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1577 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
@@ -1371,11 +1585,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1371 struct ftrace_event_call *call = &tp->call; 1585 struct ftrace_event_call *call = &tp->call;
1372 struct kretprobe_trace_entry_head *entry; 1586 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head; 1587 struct hlist_head *head;
1374 u8 *data; 1588 int size, __size, dsize;
1375 int size, __size, i;
1376 int rctx; 1589 int rctx;
1377 1590
1378 __size = sizeof(*entry) + tp->size; 1591 dsize = __get_data_size(tp, regs);
1592 __size = sizeof(*entry) + tp->size + dsize;
1379 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1593 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1380 size -= sizeof(u32); 1594 size -= sizeof(u32);
1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1595 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1388,9 +1602,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1388 1602
1389 entry->func = (unsigned long)tp->rp.kp.addr; 1603 entry->func = (unsigned long)tp->rp.kp.addr;
1390 entry->ret_ip = (unsigned long)ri->ret_addr; 1604 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1]; 1605 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1392 for (i = 0; i < tp->nr_args; i++)
1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1394 1606
1395 head = this_cpu_ptr(call->perf_events); 1607 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1608 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
@@ -1486,15 +1698,12 @@ static int register_probe_event(struct trace_probe *tp)
1486 int ret; 1698 int ret;
1487 1699
1488 /* Initialize ftrace_event_call */ 1700 /* Initialize ftrace_event_call */
1701 INIT_LIST_HEAD(&call->class->fields);
1489 if (probe_is_return(tp)) { 1702 if (probe_is_return(tp)) {
1490 INIT_LIST_HEAD(&call->class->fields);
1491 call->event.funcs = &kretprobe_funcs; 1703 call->event.funcs = &kretprobe_funcs;
1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields; 1704 call->class->define_fields = kretprobe_event_define_fields;
1494 } else { 1705 } else {
1495 INIT_LIST_HEAD(&call->class->fields);
1496 call->event.funcs = &kprobe_funcs; 1706 call->event.funcs = &kprobe_funcs;
1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields; 1707 call->class->define_fields = kprobe_event_define_fields;
1499 } 1708 }
1500 if (set_print_fmt(tp) < 0) 1709 if (set_print_fmt(tp) < 0)
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37#define KSYM_TRACER_OP_LEN 3 /* rw- */
38
39struct trace_ksym {
40 struct perf_event **ksym_hbp;
41 struct perf_event_attr attr;
42#ifdef CONFIG_PROFILE_KSYM_TRACER
43 atomic64_t counter;
44#endif
45 struct hlist_node ksym_hlist;
46};
47
48static struct trace_array *ksym_trace_array;
49
50static unsigned int ksym_tracing_enabled;
51
52static HLIST_HEAD(ksym_filter_head);
53
54static DEFINE_MUTEX(ksym_tracer_mutex);
55
56#ifdef CONFIG_PROFILE_KSYM_TRACER
57
58#define MAX_UL_INT 0xffffffff
59
60void ksym_collect_stats(unsigned long hbp_hit_addr)
61{
62 struct hlist_node *node;
63 struct trace_ksym *entry;
64
65 rcu_read_lock();
66 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
67 if (entry->attr.bp_addr == hbp_hit_addr) {
68 atomic64_inc(&entry->counter);
69 break;
70 }
71 }
72 rcu_read_unlock();
73}
74#endif /* CONFIG_PROFILE_KSYM_TRACER */
75
76void ksym_hbp_handler(struct perf_event *hbp, int nmi,
77 struct perf_sample_data *data,
78 struct pt_regs *regs)
79{
80 struct ring_buffer_event *event;
81 struct ksym_trace_entry *entry;
82 struct ring_buffer *buffer;
83 int pc;
84
85 if (!ksym_tracing_enabled)
86 return;
87
88 buffer = ksym_trace_array->buffer;
89
90 pc = preempt_count();
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
93 sizeof(*entry), 0, pc);
94 if (!event)
95 return;
96
97 entry = ring_buffer_event_data(event);
98 entry->ip = instruction_pointer(regs);
99 entry->type = hw_breakpoint_type(hbp);
100 entry->addr = hw_breakpoint_addr(hbp);
101 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
102
103#ifdef CONFIG_PROFILE_KSYM_TRACER
104 ksym_collect_stats(hw_breakpoint_addr(hbp));
105#endif /* CONFIG_PROFILE_KSYM_TRACER */
106
107 trace_buffer_unlock_commit(buffer, event, 0, pc);
108}
109
110/* Valid access types are represented as
111 *
112 * rw- : Set Read/Write Access Breakpoint
113 * -w- : Set Write Access Breakpoint
114 * --- : Clear Breakpoints
115 * --x : Set Execution Break points (Not available yet)
116 *
117 */
118static int ksym_trace_get_access_type(char *str)
119{
120 int access = 0;
121
122 if (str[0] == 'r')
123 access |= HW_BREAKPOINT_R;
124
125 if (str[1] == 'w')
126 access |= HW_BREAKPOINT_W;
127
128 if (str[2] == 'x')
129 access |= HW_BREAKPOINT_X;
130
131 switch (access) {
132 case HW_BREAKPOINT_R:
133 case HW_BREAKPOINT_W:
134 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
135 return access;
136 default:
137 return -EINVAL;
138 }
139}
140
141/*
142 * There can be several possible malformed requests and we attempt to capture
143 * all of them. We enumerate some of the rules
144 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
145 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
146 * <module>:<ksym_name>:<op>.
147 * 2. No delimiter symbol ':' in the input string
148 * 3. Spurious operator symbols or symbols not in their respective positions
149 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
150 * 5. Kernel symbol not a part of /proc/kallsyms
151 * 6. Duplicate requests
152 */
153static int parse_ksym_trace_str(char *input_string, char **ksymname,
154 unsigned long *addr)
155{
156 int ret;
157
158 *ksymname = strsep(&input_string, ":");
159 *addr = kallsyms_lookup_name(*ksymname);
160
161 /* Check for malformed request: (2), (1) and (5) */
162 if ((!input_string) ||
163 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
164 (*addr == 0))
165 return -EINVAL;;
166
167 ret = ksym_trace_get_access_type(input_string);
168
169 return ret;
170}
171
172int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
173{
174 struct trace_ksym *entry;
175 int ret = -ENOMEM;
176
177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
178 if (!entry)
179 return -ENOMEM;
180
181 hw_breakpoint_init(&entry->attr);
182
183 entry->attr.bp_type = op;
184 entry->attr.bp_addr = addr;
185 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
186
187 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
188 ksym_hbp_handler);
189
190 if (IS_ERR(entry->ksym_hbp)) {
191 ret = PTR_ERR(entry->ksym_hbp);
192 if (ret == -ENOSPC) {
193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
199 goto err;
200 }
201
202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
203
204 return 0;
205
206err:
207 kfree(entry);
208
209 return ret;
210}
211
212static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
213 size_t count, loff_t *ppos)
214{
215 struct trace_ksym *entry;
216 struct hlist_node *node;
217 struct trace_seq *s;
218 ssize_t cnt = 0;
219 int ret;
220
221 s = kmalloc(sizeof(*s), GFP_KERNEL);
222 if (!s)
223 return -ENOMEM;
224 trace_seq_init(s);
225
226 mutex_lock(&ksym_tracer_mutex);
227
228 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
229 ret = trace_seq_printf(s, "%pS:",
230 (void *)(unsigned long)entry->attr.bp_addr);
231 if (entry->attr.bp_type == HW_BREAKPOINT_R)
232 ret = trace_seq_puts(s, "r--\n");
233 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
234 ret = trace_seq_puts(s, "-w-\n");
235 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
236 ret = trace_seq_puts(s, "rw-\n");
237 WARN_ON_ONCE(!ret);
238 }
239
240 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
241
242 mutex_unlock(&ksym_tracer_mutex);
243
244 kfree(s);
245
246 return cnt;
247}
248
249static void __ksym_trace_reset(void)
250{
251 struct trace_ksym *entry;
252 struct hlist_node *node, *node1;
253
254 mutex_lock(&ksym_tracer_mutex);
255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
256 ksym_hlist) {
257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
258 hlist_del_rcu(&(entry->ksym_hlist));
259 synchronize_rcu();
260 kfree(entry);
261 }
262 mutex_unlock(&ksym_tracer_mutex);
263}
264
265static ssize_t ksym_trace_filter_write(struct file *file,
266 const char __user *buffer,
267 size_t count, loff_t *ppos)
268{
269 struct trace_ksym *entry;
270 struct hlist_node *node;
271 char *buf, *input_string, *ksymname = NULL;
272 unsigned long ksym_addr = 0;
273 int ret, op, changed = 0;
274
275 buf = kzalloc(count + 1, GFP_KERNEL);
276 if (!buf)
277 return -ENOMEM;
278
279 ret = -EFAULT;
280 if (copy_from_user(buf, buffer, count))
281 goto out;
282
283 buf[count] = '\0';
284 input_string = strstrip(buf);
285
286 /*
287 * Clear all breakpoints if:
288 * 1: echo > ksym_trace_filter
289 * 2: echo 0 > ksym_trace_filter
290 * 3: echo "*:---" > ksym_trace_filter
291 */
292 if (!input_string[0] || !strcmp(input_string, "0") ||
293 !strcmp(input_string, "*:---")) {
294 __ksym_trace_reset();
295 ret = 0;
296 goto out;
297 }
298
299 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
300 if (ret < 0)
301 goto out;
302
303 mutex_lock(&ksym_tracer_mutex);
304
305 ret = -EINVAL;
306 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
307 if (entry->attr.bp_addr == ksym_addr) {
308 /* Check for malformed request: (6) */
309 if (entry->attr.bp_type != op)
310 changed = 1;
311 else
312 goto out_unlock;
313 break;
314 }
315 }
316 if (changed) {
317 unregister_wide_hw_breakpoint(entry->ksym_hbp);
318 entry->attr.bp_type = op;
319 ret = 0;
320 if (op > 0) {
321 entry->ksym_hbp =
322 register_wide_hw_breakpoint(&entry->attr,
323 ksym_hbp_handler);
324 if (IS_ERR(entry->ksym_hbp))
325 ret = PTR_ERR(entry->ksym_hbp);
326 else
327 goto out_unlock;
328 }
329 /* Error or "symbol:---" case: drop it */
330 hlist_del_rcu(&(entry->ksym_hlist));
331 synchronize_rcu();
332 kfree(entry);
333 goto out_unlock;
334 } else {
335 /* Check for malformed request: (4) */
336 if (op)
337 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
338 }
339out_unlock:
340 mutex_unlock(&ksym_tracer_mutex);
341out:
342 kfree(buf);
343 return !ret ? count : ret;
344}
345
346static const struct file_operations ksym_tracing_fops = {
347 .open = tracing_open_generic,
348 .read = ksym_trace_filter_read,
349 .write = ksym_trace_filter_write,
350};
351
352static void ksym_trace_reset(struct trace_array *tr)
353{
354 ksym_tracing_enabled = 0;
355 __ksym_trace_reset();
356}
357
358static int ksym_trace_init(struct trace_array *tr)
359{
360 int cpu, ret = 0;
361
362 for_each_online_cpu(cpu)
363 tracing_reset(tr, cpu);
364 ksym_tracing_enabled = 1;
365 ksym_trace_array = tr;
366
367 return ret;
368}
369
370static void ksym_trace_print_header(struct seq_file *m)
371{
372 seq_puts(m,
373 "# TASK-PID CPU# Symbol "
374 "Type Function\n");
375 seq_puts(m,
376 "# | | | "
377 " | |\n");
378}
379
380static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
381{
382 struct trace_entry *entry = iter->ent;
383 struct trace_seq *s = &iter->seq;
384 struct ksym_trace_entry *field;
385 char str[KSYM_SYMBOL_LEN];
386 int ret;
387
388 if (entry->type != TRACE_KSYM)
389 return TRACE_TYPE_UNHANDLED;
390
391 trace_assign_type(field, entry);
392
393 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
394 entry->pid, iter->cpu, (char *)field->addr);
395 if (!ret)
396 return TRACE_TYPE_PARTIAL_LINE;
397
398 switch (field->type) {
399 case HW_BREAKPOINT_R:
400 ret = trace_seq_printf(s, " R ");
401 break;
402 case HW_BREAKPOINT_W:
403 ret = trace_seq_printf(s, " W ");
404 break;
405 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
406 ret = trace_seq_printf(s, " RW ");
407 break;
408 default:
409 return TRACE_TYPE_PARTIAL_LINE;
410 }
411
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 sprint_symbol(str, field->ip);
416 ret = trace_seq_printf(s, "%s\n", str);
417 if (!ret)
418 return TRACE_TYPE_PARTIAL_LINE;
419
420 return TRACE_TYPE_HANDLED;
421}
422
423struct tracer ksym_tracer __read_mostly =
424{
425 .name = "ksym_tracer",
426 .init = ksym_trace_init,
427 .reset = ksym_trace_reset,
428#ifdef CONFIG_FTRACE_SELFTEST
429 .selftest = trace_selftest_startup_ksym,
430#endif
431 .print_header = ksym_trace_print_header,
432 .print_line = ksym_trace_output
433};
434
435#ifdef CONFIG_PROFILE_KSYM_TRACER
436static int ksym_profile_show(struct seq_file *m, void *v)
437{
438 struct hlist_node *node;
439 struct trace_ksym *entry;
440 int access_type = 0;
441 char fn_name[KSYM_NAME_LEN];
442
443 seq_puts(m, " Access Type ");
444 seq_puts(m, " Symbol Counter\n");
445 seq_puts(m, " ----------- ");
446 seq_puts(m, " ------ -------\n");
447
448 rcu_read_lock();
449 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
450
451 access_type = entry->attr.bp_type;
452
453 switch (access_type) {
454 case HW_BREAKPOINT_R:
455 seq_puts(m, " R ");
456 break;
457 case HW_BREAKPOINT_W:
458 seq_puts(m, " W ");
459 break;
460 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
461 seq_puts(m, " RW ");
462 break;
463 default:
464 seq_puts(m, " NA ");
465 }
466
467 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
468 seq_printf(m, " %-36s", fn_name);
469 else
470 seq_printf(m, " %-36s", "<NA>");
471 seq_printf(m, " %15llu\n",
472 (unsigned long long)atomic64_read(&entry->counter));
473 }
474 rcu_read_unlock();
475
476 return 0;
477}
478
479static int ksym_profile_open(struct inode *node, struct file *file)
480{
481 return single_open(file, ksym_profile_show, NULL);
482}
483
484static const struct file_operations ksym_profile_fops = {
485 .open = ksym_profile_open,
486 .read = seq_read,
487 .llseek = seq_lseek,
488 .release = single_release,
489};
490#endif /* CONFIG_PROFILE_KSYM_TRACER */
491
492__init static int init_ksym_trace(void)
493{
494 struct dentry *d_tracer;
495
496 d_tracer = tracing_init_dentry();
497
498 trace_create_file("ksym_trace_filter", 0644, d_tracer,
499 NULL, &ksym_tracing_fops);
500
501#ifdef CONFIG_PROFILE_KSYM_TRACER
502 trace_create_file("ksym_profile", 0444, d_tracer,
503 NULL, &ksym_profile_fops);
504#endif
505
506 return register_tracer(&ksym_tracer);
507}
508device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 20
24static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = {
1069 .funcs = &trace_wake_funcs, 1066 .funcs = &trace_wake_funcs,
1070}; 1067};
1071 1068
1072/* TRACE_SPECIAL */
1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1074 int flags, struct trace_event *event)
1075{
1076 struct special_entry *field;
1077
1078 trace_assign_type(field, iter->ent);
1079
1080 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
1081 field->arg1,
1082 field->arg2,
1083 field->arg3))
1084 return TRACE_TYPE_PARTIAL_LINE;
1085
1086 return TRACE_TYPE_HANDLED;
1087}
1088
1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1090 int flags, struct trace_event *event)
1091{
1092 struct special_entry *field;
1093 struct trace_seq *s = &iter->seq;
1094
1095 trace_assign_type(field, iter->ent);
1096
1097 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1098 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1099 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1100
1101 return TRACE_TYPE_HANDLED;
1102}
1103
1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1105 int flags, struct trace_event *event)
1106{
1107 struct special_entry *field;
1108 struct trace_seq *s = &iter->seq;
1109
1110 trace_assign_type(field, iter->ent);
1111
1112 SEQ_PUT_FIELD_RET(s, field->arg1);
1113 SEQ_PUT_FIELD_RET(s, field->arg2);
1114 SEQ_PUT_FIELD_RET(s, field->arg3);
1115
1116 return TRACE_TYPE_HANDLED;
1117}
1118
1119static struct trace_event_functions trace_special_funcs = {
1120 .trace = trace_special_print,
1121 .raw = trace_special_print,
1122 .hex = trace_special_hex,
1123 .binary = trace_special_bin,
1124};
1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1131/* TRACE_STACK */ 1069/* TRACE_STACK */
1132 1070
1133static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1071static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1161 1099
1162static struct trace_event_functions trace_stack_funcs = { 1100static struct trace_event_functions trace_stack_funcs = {
1163 .trace = trace_stack_print, 1101 .trace = trace_stack_print,
1164 .raw = trace_special_print,
1165 .hex = trace_special_hex,
1166 .binary = trace_special_bin,
1167}; 1102};
1168 1103
1169static struct trace_event trace_stack_event = { 1104static struct trace_event trace_stack_event = {
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1194 1129
1195static struct trace_event_functions trace_user_stack_funcs = { 1130static struct trace_event_functions trace_user_stack_funcs = {
1196 .trace = trace_user_stack_print, 1131 .trace = trace_user_stack_print,
1197 .raw = trace_special_print,
1198 .hex = trace_special_hex,
1199 .binary = trace_special_bin,
1200}; 1132};
1201 1133
1202static struct trace_event trace_user_stack_event = { 1134static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = {
1314 &trace_fn_event, 1246 &trace_fn_event,
1315 &trace_ctx_event, 1247 &trace_ctx_event,
1316 &trace_wake_event, 1248 &trace_wake_event,
1317 &trace_special_event,
1318 &trace_stack_event, 1249 &trace_stack_event,
1319 &trace_user_stack_event, 1250 &trace_user_stack_event,
1320 &trace_bprint_event, 1251 &trace_bprint_event,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..4086eae6e81b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
46 struct trace_array_cpu *data; 46 struct trace_array_cpu *data;
47 unsigned long flags; 47 unsigned long flags;
48 long disabled; 48 long disabled;
49 int resched;
50 int cpu; 49 int cpu;
51 int pc; 50 int pc;
52 51
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
54 return; 53 return;
55 54
56 pc = preempt_count(); 55 pc = preempt_count();
57 resched = ftrace_preempt_disable(); 56 preempt_disable_notrace();
58 57
59 cpu = raw_smp_processor_id(); 58 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 59 if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
74 out: 73 out:
75 atomic_dec(&data->disabled); 74 atomic_dec(&data->disabled);
76 out_enable: 75 out_enable:
77 ftrace_preempt_enable(resched); 76 preempt_enable_notrace();
78} 77}
79 78
80static struct ftrace_ops trace_ops __read_mostly = 79static struct ftrace_ops trace_ops __read_mostly =
@@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
383#ifdef CONFIG_FTRACE_SELFTEST 382#ifdef CONFIG_FTRACE_SELFTEST
384 .selftest = trace_selftest_startup_wakeup, 383 .selftest = trace_selftest_startup_wakeup,
385#endif 384#endif
385 .use_max_tr = 1,
386}; 386};
387 387
388static struct tracer wakeup_rt_tracer __read_mostly = 388static struct tracer wakeup_rt_tracer __read_mostly =
@@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
397#ifdef CONFIG_FTRACE_SELFTEST 397#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 398 .selftest = trace_selftest_startup_wakeup,
399#endif 399#endif
400 .use_max_tr = 1,
400}; 401};
401 402
402__init static int init_wakeup_tracer(void) 403__init static int init_wakeup_tracer(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_WAKE: 13 case TRACE_WAKE:
14 case TRACE_STACK: 14 case TRACE_STACK:
15 case TRACE_PRINT: 15 case TRACE_PRINT:
16 case TRACE_SPECIAL:
17 case TRACE_BRANCH: 16 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
20 case TRACE_KSYM:
21 return 1; 19 return 1;
22 } 20 }
23 return 0; 21 return 0;
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
691} 689}
692#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 690#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
693 691
694#ifdef CONFIG_SYSPROF_TRACER
695int
696trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
697{
698 unsigned long count;
699 int ret;
700
701 /* start the tracing */
702 ret = tracer_init(trace, tr);
703 if (ret) {
704 warn_failed_init_tracer(trace, ret);
705 return ret;
706 }
707
708 /* Sleep for a 1/10 of a second */
709 msleep(100);
710 /* stop the tracing. */
711 tracing_stop();
712 /* check the trace buffer */
713 ret = trace_test_buffer(tr, &count);
714 trace->reset(tr);
715 tracing_start();
716
717 if (!ret && !count) {
718 printk(KERN_CONT ".. no entries found ..");
719 ret = -1;
720 }
721
722 return ret;
723}
724#endif /* CONFIG_SYSPROF_TRACER */
725
726#ifdef CONFIG_BRANCH_TRACER 692#ifdef CONFIG_BRANCH_TRACER
727int 693int
728trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) 694trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 721}
756#endif /* CONFIG_BRANCH_TRACER */ 722#endif /* CONFIG_BRANCH_TRACER */
757 723
758#ifdef CONFIG_KSYM_TRACER
759static int ksym_selftest_dummy;
760
761int
762trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
763{
764 unsigned long count;
765 int ret;
766
767 /* start the tracing */
768 ret = tracer_init(trace, tr);
769 if (ret) {
770 warn_failed_init_tracer(trace, ret);
771 return ret;
772 }
773
774 ksym_selftest_dummy = 0;
775 /* Register the read-write tracing request */
776
777 ret = process_new_ksym_entry("ksym_selftest_dummy",
778 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
779 (unsigned long)(&ksym_selftest_dummy));
780
781 if (ret < 0) {
782 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
783 goto ret_path;
784 }
785 /* Perform a read and a write operation over the dummy variable to
786 * trigger the tracer
787 */
788 if (ksym_selftest_dummy == 0)
789 ksym_selftest_dummy++;
790
791 /* stop the tracing. */
792 tracing_stop();
793 /* check the trace buffer */
794 ret = trace_test_buffer(tr, &count);
795 trace->reset(tr);
796 tracing_start();
797
798 /* read & write operations - one each is performed on the dummy variable
799 * triggering two entries in the trace buffer
800 */
801 if (!ret && count != 2) {
802 printk(KERN_CONT "Ksym tracer startup test failed");
803 ret = -1;
804 }
805
806ret_path:
807 return ret;
808}
809#endif /* CONFIG_KSYM_TRACER */
810
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..056468eae7cf 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
26static struct list_head * 29static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call) 30syscall_get_enter_fields(struct ftrace_event_call *call)
28{ 31{
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34static struct list_head * 37static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call) 38syscall_get_exit_fields(struct ftrace_event_call *call)
36{ 39{
37 struct syscall_metadata *entry = call->data; 40 return &syscall_exit_fields;
38
39 return &entry->exit_fields;
40} 41}
41 42
42struct trace_event_functions enter_syscall_print_funcs = { 43struct trace_event_functions enter_syscall_print_funcs = {
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
97};
98
99static int
100trace_kernel(struct pt_regs *regs, struct trace_array *tr,
101 struct trace_array_cpu *data)
102{
103 struct backtrace_info info;
104 unsigned long bp;
105 char *stack;
106
107 info.tr = tr;
108 info.data = data;
109 info.pos = 1;
110
111 __trace_special(info.tr, info.data, 1, regs->ip, 0);
112
113 stack = ((char *)regs + sizeof(struct pt_regs));
114#ifdef CONFIG_FRAME_POINTER
115 bp = regs->bp;
116#else
117 bp = 0;
118#endif
119
120 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
121
122 return info.pos;
123}
124
125static void timer_notify(struct pt_regs *regs, int cpu)
126{
127 struct trace_array_cpu *data;
128 struct stack_frame frame;
129 struct trace_array *tr;
130 const void __user *fp;
131 int is_user;
132 int i;
133
134 if (!regs)
135 return;
136
137 tr = sysprof_trace;
138 data = tr->data[cpu];
139 is_user = user_mode(regs);
140
141 if (!current || current->pid == 0)
142 return;
143
144 if (is_user && current->state != TASK_RUNNING)
145 return;
146
147 __trace_special(tr, data, 0, 0, current->pid);
148
149 if (!is_user)
150 i = trace_kernel(regs, tr, data);
151 else
152 i = 0;
153
154 /*
155 * Trace user stack if we are not a kernel thread
156 */
157 if (current->mm && i < sample_max_depth) {
158 regs = (struct pt_regs *)current->thread.sp0 - 1;
159
160 fp = (void __user *)regs->bp;
161
162 __trace_special(tr, data, 2, regs->ip, 0);
163
164 while (i < sample_max_depth) {
165 frame.next_fp = NULL;
166 frame.return_address = 0;
167 if (!copy_stack_frame(fp, &frame))
168 break;
169 if ((unsigned long)fp < regs->sp)
170 break;
171
172 __trace_special(tr, data, 2, frame.return_address,
173 (unsigned long)fp);
174 fp = frame.next_fp;
175
176 i++;
177 }
178
179 }
180
181 /*
182 * Special trace entry if we overflow the max depth:
183 */
184 if (i == sample_max_depth)
185 __trace_special(tr, data, -1, -1, -1);
186
187 __trace_special(tr, data, 3, current->pid, i);
188}
189
190static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
191{
192 /* trace here */
193 timer_notify(get_irq_regs(), smp_processor_id());
194
195 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
196
197 return HRTIMER_RESTART;
198}
199
200static void start_stack_timer(void *unused)
201{
202 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
203
204 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 hrtimer->function = stack_trace_timer_fn;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
208 HRTIMER_MODE_REL_PINNED);
209}
210
211static void start_stack_timers(void)
212{
213 on_each_cpu(start_stack_timer, NULL, 1);
214}
215
216static void stop_stack_timer(int cpu)
217{
218 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
219
220 hrtimer_cancel(hrtimer);
221}
222
223static void stop_stack_timers(void)
224{
225 int cpu;
226
227 for_each_online_cpu(cpu)
228 stop_stack_timer(cpu);
229}
230
231static void stop_stack_trace(struct trace_array *tr)
232{
233 mutex_lock(&sample_timer_lock);
234 stop_stack_timers();
235 tracer_enabled = 0;
236 mutex_unlock(&sample_timer_lock);
237}
238
239static int stack_trace_init(struct trace_array *tr)
240{
241 sysprof_trace = tr;
242
243 tracing_start_cmdline_record();
244
245 mutex_lock(&sample_timer_lock);
246 start_stack_timers();
247 tracer_enabled = 1;
248 mutex_unlock(&sample_timer_lock);
249 return 0;
250}
251
252static void stack_trace_reset(struct trace_array *tr)
253{
254 tracing_stop_cmdline_record();
255 stop_stack_trace(tr);
256}
257
258static struct tracer stack_trace __read_mostly =
259{
260 .name = "sysprof",
261 .init = stack_trace_init,
262 .reset = stack_trace_reset,
263#ifdef CONFIG_FTRACE_SELFTEST
264 .selftest = trace_selftest_startup_sysprof,
265#endif
266};
267
268__init static int init_stack_trace(void)
269{
270 return register_tracer(&stack_trace);
271}
272device_initcall(init_stack_trace);
273
274#define MAX_LONG_DIGITS 22
275
276static ssize_t
277sysprof_sample_read(struct file *filp, char __user *ubuf,
278 size_t cnt, loff_t *ppos)
279{
280 char buf[MAX_LONG_DIGITS];
281 int r;
282
283 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
284
285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
286}
287
288static ssize_t
289sysprof_sample_write(struct file *filp, const char __user *ubuf,
290 size_t cnt, loff_t *ppos)
291{
292 char buf[MAX_LONG_DIGITS];
293 unsigned long val;
294
295 if (cnt > MAX_LONG_DIGITS-1)
296 cnt = MAX_LONG_DIGITS-1;
297
298 if (copy_from_user(&buf, ubuf, cnt))
299 return -EFAULT;
300
301 buf[cnt] = 0;
302
303 val = simple_strtoul(buf, NULL, 10);
304 /*
305 * Enforce a minimum sample period of 100 usecs:
306 */
307 if (val < 100)
308 val = 100;
309
310 mutex_lock(&sample_timer_lock);
311 stop_stack_timers();
312 sample_period = val * 1000;
313 start_stack_timers();
314 mutex_unlock(&sample_timer_lock);
315
316 return cnt;
317}
318
319static const struct file_operations sysprof_sample_fops = {
320 .read = sysprof_sample_read,
321 .write = sysprof_sample_write,
322};
323
324void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
325{
326
327 trace_create_file("sysprof_sample_period", 0644,
328 d_tracer, NULL, &sysprof_sample_fops);
329}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
13 14
14/* 15/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
82 schedule_work(&ns->destroyer); 83 schedule_work(&ns->destroyer);
83} 84}
84EXPORT_SYMBOL(free_user_ns); 85EXPORT_SYMBOL(free_user_ns);
86
87uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
88{
89 struct user_namespace *tmp;
90
91 if (likely(to == cred->user->user_ns))
92 return uid;
93
94
95 /* Is cred->user the creator of the target user_ns
96 * or the creator of one of it's parents?
97 */
98 for ( tmp = to; tmp != &init_user_ns;
99 tmp = tmp->creator->user_ns ) {
100 if (cred->user == tmp->creator) {
101 return (uid_t)0;
102 }
103 }
104
105 /* No useful relationship so no mapping */
106 return overflowuid;
107}
108
109gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
110{
111 struct user_namespace *tmp;
112
113 if (likely(to == cred->user->user_ns))
114 return gid;
115
116 /* Is cred->user the creator of the target user_ns
117 * or the creator of one of it's parents?
118 */
119 for ( tmp = to; tmp != &init_user_ns;
120 tmp = tmp->creator->user_ns ) {
121 if (cred->user == tmp->creator) {
122 return (gid_t)0;
123 }
124 }
125
126 /* No useful relationship so no mapping */
127 return overflowgid;
128}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..613bc1f04610
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,567 @@
1/*
2 * Detect hard and soft lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * this code detects hard lockups: incidents where, on a CPU,
7 * the kernel does not respond to anything except NMI.
8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well.
13 */
14
15#include <linux/mm.h>
16#include <linux/cpu.h>
17#include <linux/nmi.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/freezer.h>
21#include <linux/kthread.h>
22#include <linux/lockdep.h>
23#include <linux/notifier.h>
24#include <linux/module.h>
25#include <linux/sysctl.h>
26
27#include <asm/irq_regs.h>
28#include <linux/perf_event.h>
29
30int watchdog_enabled;
31int __read_mostly softlockup_thresh = 60;
32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif
45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog;
48
49
50/* boot commands */
51/*
52 * Should we panic when a soft-lockup or hard-lockup occurs:
53 */
54#ifdef CONFIG_HARDLOCKUP_DETECTOR
55static int hardlockup_panic;
56
57static int __init hardlockup_panic_setup(char *str)
58{
59 if (!strncmp(str, "panic", 5))
60 hardlockup_panic = 1;
61 return 1;
62}
63__setup("nmi_watchdog=", hardlockup_panic_setup);
64#endif
65
66unsigned int __read_mostly softlockup_panic =
67 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
68
69static int __init softlockup_panic_setup(char *str)
70{
71 softlockup_panic = simple_strtoul(str, NULL, 0);
72
73 return 1;
74}
75__setup("softlockup_panic=", softlockup_panic_setup);
76
77static int __init nowatchdog_setup(char *str)
78{
79 no_watchdog = 1;
80 return 1;
81}
82__setup("nowatchdog", nowatchdog_setup);
83
84/* deprecated */
85static int __init nosoftlockup_setup(char *str)
86{
87 no_watchdog = 1;
88 return 1;
89}
90__setup("nosoftlockup", nosoftlockup_setup);
91/* */
92
93
/*
 * Coarse per-cpu timestamp in (approximate) seconds.
 *
 * cpu_clock() is shifted right by 30 bits instead of being divided by
 * 10^9: nanosecond resolution is not needed here and 2^30ns == 1.074s
 * is close enough to avoid a big divide.
 */
static unsigned long get_timestamp(int this_cpu)
{
	unsigned long approx_secs;

	approx_secs = cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */

	return approx_secs;
}
103
104static unsigned long get_sample_period(void)
105{
106 /*
107 * convert softlockup_thresh from seconds to ns
108 * the divide by 5 is to give hrtimer 5 chances to
109 * increment before the hardlockup detector generates
110 * a warning
111 */
112 return softlockup_thresh / 5 * NSEC_PER_SEC;
113}
114
115/* Commands for resetting the watchdog */
116static void __touch_watchdog(void)
117{
118 int this_cpu = smp_processor_id();
119
120 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
121}
122
123void touch_softlockup_watchdog(void)
124{
125 __get_cpu_var(watchdog_touch_ts) = 0;
126}
127EXPORT_SYMBOL(touch_softlockup_watchdog);
128
129void touch_all_softlockup_watchdogs(void)
130{
131 int cpu;
132
133 /*
134 * this is done lockless
135 * do we care if a 0 races with a timestamp?
136 * all it means is the softlock check starts one cycle later
137 */
138 for_each_online_cpu(cpu)
139 per_cpu(watchdog_touch_ts, cpu) = 0;
140}
141
142#ifdef CONFIG_HARDLOCKUP_DETECTOR
143void touch_nmi_watchdog(void)
144{
145 __get_cpu_var(watchdog_nmi_touch) = true;
146 touch_softlockup_watchdog();
147}
148EXPORT_SYMBOL(touch_nmi_watchdog);
149
150#endif
151
152void touch_softlockup_watchdog_sync(void)
153{
154 __raw_get_cpu_var(softlockup_touch_sync) = true;
155 __raw_get_cpu_var(watchdog_touch_ts) = 0;
156}
157
158#ifdef CONFIG_HARDLOCKUP_DETECTOR
159/* watchdog detector functions */
160static int is_hardlockup(void)
161{
162 unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
163
164 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
165 return 1;
166
167 __get_cpu_var(hrtimer_interrupts_saved) = hrint;
168 return 0;
169}
170#endif
171
172static int is_softlockup(unsigned long touch_ts)
173{
174 unsigned long now = get_timestamp(smp_processor_id());
175
176 /* Warn about unreasonable delays: */
177 if (time_after(now, touch_ts + softlockup_thresh))
178 return now - touch_ts;
179
180 return 0;
181}
182
183static int
184watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
185{
186 did_panic = 1;
187
188 return NOTIFY_DONE;
189}
190
191static struct notifier_block panic_block = {
192 .notifier_call = watchdog_panic,
193};
194
195#ifdef CONFIG_HARDLOCKUP_DETECTOR
196static struct perf_event_attr wd_hw_attr = {
197 .type = PERF_TYPE_HARDWARE,
198 .config = PERF_COUNT_HW_CPU_CYCLES,
199 .size = sizeof(struct perf_event_attr),
200 .pinned = 1,
201 .disabled = 1,
202};
203
204/* Callback function for perf event subsystem */
205void watchdog_overflow_callback(struct perf_event *event, int nmi,
206 struct perf_sample_data *data,
207 struct pt_regs *regs)
208{
209 if (__get_cpu_var(watchdog_nmi_touch) == true) {
210 __get_cpu_var(watchdog_nmi_touch) = false;
211 return;
212 }
213
214 /* check for a hardlockup
215 * This is done by making sure our timer interrupt
216 * is incrementing. The timer interrupt should have
217 * fired multiple times before we overflow'd. If it hasn't
218 * then this is a good indication the cpu is stuck
219 */
220 if (is_hardlockup()) {
221 int this_cpu = smp_processor_id();
222
223 /* only print hardlockups once */
224 if (__get_cpu_var(hard_watchdog_warn) == true)
225 return;
226
227 if (hardlockup_panic)
228 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
229 else
230 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
231
232 __get_cpu_var(hard_watchdog_warn) = true;
233 return;
234 }
235
236 __get_cpu_var(hard_watchdog_warn) = false;
237 return;
238}
239static void watchdog_interrupt_count(void)
240{
241 __get_cpu_var(hrtimer_interrupts)++;
242}
243#else
/* No hard-lockup detector configured: nothing to count. */
static inline void watchdog_interrupt_count(void)
{
}
245#endif /* CONFIG_HARDLOCKUP_DETECTOR */
246
247/* watchdog kicker functions */
248static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
249{
250 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
251 struct pt_regs *regs = get_irq_regs();
252 int duration;
253
254 /* kick the hardlockup detector */
255 watchdog_interrupt_count();
256
257 /* kick the softlockup detector */
258 wake_up_process(__get_cpu_var(softlockup_watchdog));
259
260 /* .. and repeat */
261 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
262
263 if (touch_ts == 0) {
264 if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
265 /*
266 * If the time stamp was touched atomically
267 * make sure the scheduler tick is up to date.
268 */
269 __get_cpu_var(softlockup_touch_sync) = false;
270 sched_clock_tick();
271 }
272 __touch_watchdog();
273 return HRTIMER_RESTART;
274 }
275
276 /* check for a softlockup
277 * This is done by making sure a high priority task is
278 * being scheduled. The task touches the watchdog to
279 * indicate it is getting cpu time. If it hasn't then
280 * this is a good indication some task is hogging the cpu
281 */
282 duration = is_softlockup(touch_ts);
283 if (unlikely(duration)) {
284 /* only warn once */
285 if (__get_cpu_var(soft_watchdog_warn) == true)
286 return HRTIMER_RESTART;
287
288 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
289 smp_processor_id(), duration,
290 current->comm, task_pid_nr(current));
291 print_modules();
292 print_irqtrace_events(current);
293 if (regs)
294 show_regs(regs);
295 else
296 dump_stack();
297
298 if (softlockup_panic)
299 panic("softlockup: hung tasks");
300 __get_cpu_var(soft_watchdog_warn) = true;
301 } else
302 __get_cpu_var(soft_watchdog_warn) = false;
303
304 return HRTIMER_RESTART;
305}
306
307
308/*
309 * The watchdog thread - touches the timestamp.
310 */
311static int watchdog(void *unused)
312{
313 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
314 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
315
316 sched_setscheduler(current, SCHED_FIFO, &param);
317
318 /* initialize timestamp */
319 __touch_watchdog();
320
321 /* kick off the timer for the hardlockup detector */
322 /* done here because hrtimer_start can only pin to smp_processor_id() */
323 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
324 HRTIMER_MODE_REL_PINNED);
325
326 set_current_state(TASK_INTERRUPTIBLE);
327 /*
328 * Run briefly once per second to reset the softlockup timestamp.
329 * If this gets delayed for more than 60 seconds then the
330 * debug-printout triggers in watchdog_timer_fn().
331 */
332 while (!kthread_should_stop()) {
333 __touch_watchdog();
334 schedule();
335
336 if (kthread_should_stop())
337 break;
338
339 set_current_state(TASK_INTERRUPTIBLE);
340 }
341 __set_current_state(TASK_RUNNING);
342
343 return 0;
344}
345
346
347#ifdef CONFIG_HARDLOCKUP_DETECTOR
348static int watchdog_nmi_enable(int cpu)
349{
350 struct perf_event_attr *wd_attr;
351 struct perf_event *event = per_cpu(watchdog_ev, cpu);
352
353 /* is it already setup and enabled? */
354 if (event && event->state > PERF_EVENT_STATE_OFF)
355 goto out;
356
357 /* it is setup but not enabled */
358 if (event != NULL)
359 goto out_enable;
360
361 /* Try to register using hardware perf events */
362 wd_attr = &wd_hw_attr;
363 wd_attr->sample_period = hw_nmi_get_sample_period();
364 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
365 if (!IS_ERR(event)) {
366 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
367 goto out_save;
368 }
369
370 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
371 return -1;
372
373 /* success path */
374out_save:
375 per_cpu(watchdog_ev, cpu) = event;
376out_enable:
377 perf_event_enable(per_cpu(watchdog_ev, cpu));
378out:
379 return 0;
380}
381
382static void watchdog_nmi_disable(int cpu)
383{
384 struct perf_event *event = per_cpu(watchdog_ev, cpu);
385
386 if (event) {
387 perf_event_disable(event);
388 per_cpu(watchdog_ev, cpu) = NULL;
389
390 /* should be in cleanup, but blocks oprofile */
391 perf_event_release_kernel(event);
392 }
393 return;
394}
395#else
/* Hardlockup detector not configured: enabling trivially succeeds. */
static int watchdog_nmi_enable(int cpu)
{
	return 0;
}

/* Hardlockup detector not configured: nothing to tear down. */
static void watchdog_nmi_disable(int cpu)
{
}
398#endif /* CONFIG_HARDLOCKUP_DETECTOR */
399
400/* prepare/enable/disable routines */
401static int watchdog_prepare_cpu(int cpu)
402{
403 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
404
405 WARN_ON(per_cpu(softlockup_watchdog, cpu));
406 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
407 hrtimer->function = watchdog_timer_fn;
408
409 return 0;
410}
411
412static int watchdog_enable(int cpu)
413{
414 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
415
416 /* enable the perf event */
417 if (watchdog_nmi_enable(cpu) != 0)
418 return -1;
419
420 /* create the watchdog thread */
421 if (!p) {
422 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
423 if (IS_ERR(p)) {
424 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
425 return -1;
426 }
427 kthread_bind(p, cpu);
428 per_cpu(watchdog_touch_ts, cpu) = 0;
429 per_cpu(softlockup_watchdog, cpu) = p;
430 wake_up_process(p);
431 }
432
433 return 0;
434}
435
436static void watchdog_disable(int cpu)
437{
438 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
439 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
440
441 /*
442 * cancel the timer first to stop incrementing the stats
443 * and waking up the kthread
444 */
445 hrtimer_cancel(hrtimer);
446
447 /* disable the perf event */
448 watchdog_nmi_disable(cpu);
449
450 /* stop the watchdog thread */
451 if (p) {
452 per_cpu(softlockup_watchdog, cpu) = NULL;
453 kthread_stop(p);
454 }
455
456 /* if any cpu succeeds, watchdog is considered enabled for the system */
457 watchdog_enabled = 1;
458}
459
460static void watchdog_enable_all_cpus(void)
461{
462 int cpu;
463 int result = 0;
464
465 for_each_online_cpu(cpu)
466 result += watchdog_enable(cpu);
467
468 if (result)
469 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
470
471}
472
473static void watchdog_disable_all_cpus(void)
474{
475 int cpu;
476
477 for_each_online_cpu(cpu)
478 watchdog_disable(cpu);
479
480 /* if all watchdogs are disabled, then they are disabled for the system */
481 watchdog_enabled = 0;
482}
483
484
485/* sysctl functions */
486#ifdef CONFIG_SYSCTL
487/*
488 * proc handler for /proc/sys/kernel/nmi_watchdog
489 */
490
491int proc_dowatchdog_enabled(struct ctl_table *table, int write,
492 void __user *buffer, size_t *length, loff_t *ppos)
493{
494 proc_dointvec(table, write, buffer, length, ppos);
495
496 if (watchdog_enabled)
497 watchdog_enable_all_cpus();
498 else
499 watchdog_disable_all_cpus();
500 return 0;
501}
502
503int proc_dowatchdog_thresh(struct ctl_table *table, int write,
504 void __user *buffer,
505 size_t *lenp, loff_t *ppos)
506{
507 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
508}
509#endif /* CONFIG_SYSCTL */
510
511
512/*
513 * Create/destroy watchdog threads as CPUs come and go:
514 */
515static int __cpuinit
516cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
517{
518 int hotcpu = (unsigned long)hcpu;
519
520 switch (action) {
521 case CPU_UP_PREPARE:
522 case CPU_UP_PREPARE_FROZEN:
523 if (watchdog_prepare_cpu(hotcpu))
524 return NOTIFY_BAD;
525 break;
526 case CPU_ONLINE:
527 case CPU_ONLINE_FROZEN:
528 if (watchdog_enable(hotcpu))
529 return NOTIFY_BAD;
530 break;
531#ifdef CONFIG_HOTPLUG_CPU
532 case CPU_UP_CANCELED:
533 case CPU_UP_CANCELED_FROZEN:
534 watchdog_disable(hotcpu);
535 break;
536 case CPU_DEAD:
537 case CPU_DEAD_FROZEN:
538 watchdog_disable(hotcpu);
539 break;
540#endif /* CONFIG_HOTPLUG_CPU */
541 }
542 return NOTIFY_OK;
543}
544
545static struct notifier_block __cpuinitdata cpu_nfb = {
546 .notifier_call = cpu_callback
547};
548
549static int __init spawn_watchdog_task(void)
550{
551 void *cpu = (void *)(long)smp_processor_id();
552 int err;
553
554 if (no_watchdog)
555 return 0;
556
557 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
558 WARN_ON(err == NOTIFY_BAD);
559
560 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
561 register_cpu_notifier(&cpu_nfb);
562
563 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
564
565 return 0;
566}
567early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..2994a0e3a61c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,41 +33,287 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#define CREATE_TRACE_POINTS 36#include <linux/idr.h>
37#include <trace/events/workqueue.h> 37
38#include "workqueue_sched.h"
39
40enum {
41 /* global_cwq flags */
42 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
43 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
44 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
45 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
46 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
47
48 /* worker flags */
49 WORKER_STARTED = 1 << 0, /* started */
50 WORKER_DIE = 1 << 1, /* die die die */
51 WORKER_IDLE = 1 << 2, /* is idle */
52 WORKER_PREP = 1 << 3, /* preparing to run works */
53 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
54 WORKER_REBIND = 1 << 5, /* mom is home, come back */
55 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
56 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
57
58 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
59 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
60
61 /* gcwq->trustee_state */
62 TRUSTEE_START = 0, /* start */
63 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
64 TRUSTEE_BUTCHER = 2, /* butcher workers */
65 TRUSTEE_RELEASE = 3, /* release workers */
66 TRUSTEE_DONE = 4, /* trustee is done */
67
68 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
69 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
70 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
71
72 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
73 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
74
75 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
76 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
77 CREATE_COOLDOWN = HZ, /* time to breath after fail */
78 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
79
80 /*
81 * Rescue workers are used only on emergencies and shared by
82 * all cpus. Give -20.
83 */
84 RESCUER_NICE_LEVEL = -20,
85};
38 86
39/* 87/*
40 * The per-CPU workqueue (if single thread, we always use the first 88 * Structure fields follow one of the following exclusion rules.
41 * possible cpu). 89 *
90 * I: Set during initialization and read-only afterwards.
91 *
92 * P: Preemption protected. Disabling preemption is enough and should
93 * only be modified and accessed from the local cpu.
94 *
95 * L: gcwq->lock protected. Access with gcwq->lock held.
96 *
97 * X: During normal operation, modification requires gcwq->lock and
98 * should be done only from local cpu. Either disabling preemption
99 * on local cpu or grabbing gcwq->lock is enough for read access.
100 * If GCWQ_DISASSOCIATED is set, it's identical to L.
101 *
102 * F: wq->flush_mutex protected.
103 *
104 * W: workqueue_lock protected.
42 */ 105 */
43struct cpu_workqueue_struct {
44 106
45 spinlock_t lock; 107struct global_cwq;
46 108
47 struct list_head worklist; 109/*
48 wait_queue_head_t more_work; 110 * The poor guys doing the actual heavy lifting. All on-duty workers
49 struct work_struct *current_work; 111 * are either serving the manager role, on idle list or on busy hash.
112 */
113struct worker {
114 /* on idle list while idle, on busy hash table while busy */
115 union {
116 struct list_head entry; /* L: while idle */
117 struct hlist_node hentry; /* L: while busy */
118 };
50 119
51 struct workqueue_struct *wq; 120 struct work_struct *current_work; /* L: work being processed */
52 struct task_struct *thread; 121 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
53} ____cacheline_aligned; 122 struct list_head scheduled; /* L: scheduled works */
123 struct task_struct *task; /* I: worker task */
124 struct global_cwq *gcwq; /* I: the associated gcwq */
125 /* 64 bytes boundary on 64bit, 32 on 32bit */
126 unsigned long last_active; /* L: last active timestamp */
127 unsigned int flags; /* X: flags */
128 int id; /* I: worker id */
129 struct work_struct rebind_work; /* L: rebind worker to cpu */
130};
131
132/*
133 * Global per-cpu workqueue. There's one and only one for each cpu
134 * and all works are queued and processed here regardless of their
135 * target workqueues.
136 */
137struct global_cwq {
138 spinlock_t lock; /* the gcwq lock */
139 struct list_head worklist; /* L: list of pending works */
140 unsigned int cpu; /* I: the associated cpu */
141 unsigned int flags; /* L: GCWQ_* flags */
142
143 int nr_workers; /* L: total number of workers */
144 int nr_idle; /* L: currently idle ones */
145
146 /* workers are chained either in the idle_list or busy_hash */
147 struct list_head idle_list; /* X: list of idle workers */
148 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
149 /* L: hash of busy workers */
150
151 struct timer_list idle_timer; /* L: worker idle timeout */
152 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
153
154 struct ida worker_ida; /* L: for worker IDs */
155
156 struct task_struct *trustee; /* L: for gcwq shutdown */
157 unsigned int trustee_state; /* L: trustee state */
158 wait_queue_head_t trustee_wait; /* trustee wait */
159 struct worker *first_idle; /* L: first idle worker */
160} ____cacheline_aligned_in_smp;
161
162/*
163 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
164 * work_struct->data are used for flags and thus cwqs need to be
165 * aligned at two's power of the number of flag bits.
166 */
167struct cpu_workqueue_struct {
168 struct global_cwq *gcwq; /* I: the associated gcwq */
169 struct workqueue_struct *wq; /* I: the owning workqueue */
170 int work_color; /* L: current color */
171 int flush_color; /* L: flushing color */
172 int nr_in_flight[WORK_NR_COLORS];
173 /* L: nr of in_flight works */
174 int nr_active; /* L: nr of active works */
175 int max_active; /* L: max active works */
176 struct list_head delayed_works; /* L: delayed works */
177};
178
179/*
180 * Structure used to wait for workqueue flush.
181 */
182struct wq_flusher {
183 struct list_head list; /* F: list of flushers */
184 int flush_color; /* F: flush color waiting for */
185 struct completion done; /* flush completion */
186};
187
188/*
189 * All cpumasks are assumed to be always set on UP and thus can't be
190 * used to determine whether there's something to be done.
191 */
192#ifdef CONFIG_SMP
193typedef cpumask_var_t mayday_mask_t;
194#define mayday_test_and_set_cpu(cpu, mask) \
195 cpumask_test_and_set_cpu((cpu), (mask))
196#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
197#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
198#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp))
199#define free_mayday_mask(mask) free_cpumask_var((mask))
200#else
201typedef unsigned long mayday_mask_t;
202#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
203#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
204#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
205#define alloc_mayday_mask(maskp, gfp) true
206#define free_mayday_mask(mask) do { } while (0)
207#endif
54 208
55/* 209/*
56 * The externally visible workqueue abstraction is an array of 210 * The externally visible workqueue abstraction is an array of
57 * per-CPU workqueues: 211 * per-CPU workqueues:
58 */ 212 */
59struct workqueue_struct { 213struct workqueue_struct {
60 struct cpu_workqueue_struct *cpu_wq; 214 unsigned int flags; /* I: WQ_* flags */
61 struct list_head list; 215 union {
62 const char *name; 216 struct cpu_workqueue_struct __percpu *pcpu;
63 int singlethread; 217 struct cpu_workqueue_struct *single;
64 int freezeable; /* Freeze threads during suspend */ 218 unsigned long v;
65 int rt; 219 } cpu_wq; /* I: cwq's */
220 struct list_head list; /* W: list of all workqueues */
221
222 struct mutex flush_mutex; /* protects wq flushing */
223 int work_color; /* F: current work color */
224 int flush_color; /* F: current flush color */
225 atomic_t nr_cwqs_to_flush; /* flush in progress */
226 struct wq_flusher *first_flusher; /* F: first flusher */
227 struct list_head flusher_queue; /* F: flush waiters */
228 struct list_head flusher_overflow; /* F: flush overflow list */
229
230 mayday_mask_t mayday_mask; /* cpus requesting rescue */
231 struct worker *rescuer; /* I: rescue worker */
232
233 int saved_max_active; /* W: saved cwq max_active */
234 const char *name; /* I: workqueue name */
66#ifdef CONFIG_LOCKDEP 235#ifdef CONFIG_LOCKDEP
67 struct lockdep_map lockdep_map; 236 struct lockdep_map lockdep_map;
68#endif 237#endif
69}; 238};
70 239
240struct workqueue_struct *system_wq __read_mostly;
241struct workqueue_struct *system_long_wq __read_mostly;
242struct workqueue_struct *system_nrt_wq __read_mostly;
243struct workqueue_struct *system_unbound_wq __read_mostly;
244EXPORT_SYMBOL_GPL(system_wq);
245EXPORT_SYMBOL_GPL(system_long_wq);
246EXPORT_SYMBOL_GPL(system_nrt_wq);
247EXPORT_SYMBOL_GPL(system_unbound_wq);
248
249#define for_each_busy_worker(worker, i, pos, gcwq) \
250 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
251 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
252
253static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
254 unsigned int sw)
255{
256 if (cpu < nr_cpu_ids) {
257 if (sw & 1) {
258 cpu = cpumask_next(cpu, mask);
259 if (cpu < nr_cpu_ids)
260 return cpu;
261 }
262 if (sw & 2)
263 return WORK_CPU_UNBOUND;
264 }
265 return WORK_CPU_NONE;
266}
267
268static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
269 struct workqueue_struct *wq)
270{
271 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
272}
273
274/*
275 * CPU iterators
276 *
277 * An extra gcwq is defined for an invalid cpu number
278 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
279 * specific CPU. The following iterators are similar to
280 * for_each_*_cpu() iterators but also considers the unbound gcwq.
281 *
282 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
283 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
284 * for_each_cwq_cpu() : possible CPUs for bound workqueues,
285 * WORK_CPU_UNBOUND for unbound workqueues
286 */
287#define for_each_gcwq_cpu(cpu) \
288 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
289 (cpu) < WORK_CPU_NONE; \
290 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
291
292#define for_each_online_gcwq_cpu(cpu) \
293 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
294 (cpu) < WORK_CPU_NONE; \
295 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
296
297#define for_each_cwq_cpu(cpu, wq) \
298 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
299 (cpu) < WORK_CPU_NONE; \
300 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
301
302#ifdef CONFIG_LOCKDEP
303/**
304 * in_workqueue_context() - in context of specified workqueue?
305 * @wq: the workqueue of interest
306 *
307 * Checks lockdep state to see if the current task is executing from
308 * within a workqueue item. This function exists only if lockdep is
309 * enabled.
310 */
311int in_workqueue_context(struct workqueue_struct *wq)
312{
313 return lock_is_held(&wq->lockdep_map);
314}
315#endif
316
71#ifdef CONFIG_DEBUG_OBJECTS_WORK 317#ifdef CONFIG_DEBUG_OBJECTS_WORK
72 318
73static struct debug_obj_descr work_debug_descr; 319static struct debug_obj_descr work_debug_descr;
@@ -107,7 +353,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
107 * statically initialized. We just make sure that it 353 * statically initialized. We just make sure that it
108 * is tracked in the object tracker. 354 * is tracked in the object tracker.
109 */ 355 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { 356 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr); 357 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr); 358 debug_object_activate(work, &work_debug_descr);
113 return 0; 359 return 0;
@@ -181,94 +427,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
181/* Serializes the accesses to the list of workqueues. */ 427/* Serializes the accesses to the list of workqueues. */
182static DEFINE_SPINLOCK(workqueue_lock); 428static DEFINE_SPINLOCK(workqueue_lock);
183static LIST_HEAD(workqueues); 429static LIST_HEAD(workqueues);
430static bool workqueue_freezing; /* W: have wqs started freezing? */
431
432/*
433 * The almighty global cpu workqueues. nr_running is the only field
434 * which is expected to be used frequently by other cpus via
435 * try_to_wake_up(). Put it in a separate cacheline.
436 */
437static DEFINE_PER_CPU(struct global_cwq, global_cwq);
438static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
184 439
185static int singlethread_cpu __read_mostly;
186static const struct cpumask *cpu_singlethread_map __read_mostly;
187/* 440/*
188 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 441 * Global cpu workqueue and nr_running counter for unbound gcwq. The
189 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 442 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
190 * which comes in between can't use for_each_online_cpu(). We could 443 * workers have WORKER_UNBOUND set.
191 * use cpu_possible_map, the cpumask below is more a documentation
192 * than optimization.
193 */ 444 */
194static cpumask_var_t cpu_populated_map __read_mostly; 445static struct global_cwq unbound_global_cwq;
446static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
195 447
196/* If it's single threaded, it isn't in the list of workqueues. */ 448static int worker_thread(void *__worker);
197static inline int is_wq_single_threaded(struct workqueue_struct *wq) 449
450static struct global_cwq *get_gcwq(unsigned int cpu)
198{ 451{
199 return wq->singlethread; 452 if (cpu != WORK_CPU_UNBOUND)
453 return &per_cpu(global_cwq, cpu);
454 else
455 return &unbound_global_cwq;
200} 456}
201 457
202static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) 458static atomic_t *get_gcwq_nr_running(unsigned int cpu)
203{ 459{
204 return is_wq_single_threaded(wq) 460 if (cpu != WORK_CPU_UNBOUND)
205 ? cpu_singlethread_map : cpu_populated_map; 461 return &per_cpu(gcwq_nr_running, cpu);
462 else
463 return &unbound_gcwq_nr_running;
206} 464}
207 465
208static 466static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
209struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) 467 struct workqueue_struct *wq)
210{ 468{
211 if (unlikely(is_wq_single_threaded(wq))) 469 if (!(wq->flags & WQ_UNBOUND)) {
212 cpu = singlethread_cpu; 470 if (likely(cpu < nr_cpu_ids)) {
213 return per_cpu_ptr(wq->cpu_wq, cpu); 471#ifdef CONFIG_SMP
472 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
473#else
474 return wq->cpu_wq.single;
475#endif
476 }
477 } else if (likely(cpu == WORK_CPU_UNBOUND))
478 return wq->cpu_wq.single;
479 return NULL;
480}
481
482static unsigned int work_color_to_flags(int color)
483{
484 return color << WORK_STRUCT_COLOR_SHIFT;
485}
486
487static int get_work_color(struct work_struct *work)
488{
489 return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
490 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
491}
492
493static int work_next_color(int color)
494{
495 return (color + 1) % WORK_NR_COLORS;
214} 496}
215 497
216/* 498/*
217 * Set the workqueue on which a work item is to be run 499 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
218 * - Must *only* be called if the pending flag is set 500 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
501 * cleared and the work data contains the cpu number it was last on.
502 *
503 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
504 * cwq, cpu or clear work->data. These functions should only be
505 * called while the work is owned - ie. while the PENDING bit is set.
506 *
507 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
508 * corresponding to a work. gcwq is available once the work has been
509 * queued anywhere after initialization. cwq is available only from
510 * queueing until execution starts.
219 */ 511 */
220static inline void set_wq_data(struct work_struct *work, 512static inline void set_work_data(struct work_struct *work, unsigned long data,
221 struct cpu_workqueue_struct *cwq) 513 unsigned long flags)
222{ 514{
223 unsigned long new;
224
225 BUG_ON(!work_pending(work)); 515 BUG_ON(!work_pending(work));
516 atomic_long_set(&work->data, data | flags | work_static(work));
517}
518
519static void set_work_cwq(struct work_struct *work,
520 struct cpu_workqueue_struct *cwq,
521 unsigned long extra_flags)
522{
523 set_work_data(work, (unsigned long)cwq,
524 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
525}
526
527static void set_work_cpu(struct work_struct *work, unsigned int cpu)
528{
529 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
530}
531
532static void clear_work_data(struct work_struct *work)
533{
534 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
535}
536
537static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
538{
539 unsigned long data = atomic_long_read(&work->data);
540
541 if (data & WORK_STRUCT_CWQ)
542 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
543 else
544 return NULL;
545}
546
547static struct global_cwq *get_work_gcwq(struct work_struct *work)
548{
549 unsigned long data = atomic_long_read(&work->data);
550 unsigned int cpu;
551
552 if (data & WORK_STRUCT_CWQ)
553 return ((struct cpu_workqueue_struct *)
554 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
555
556 cpu = data >> WORK_STRUCT_FLAG_BITS;
557 if (cpu == WORK_CPU_NONE)
558 return NULL;
559
560 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
561 return get_gcwq(cpu);
562}
563
564/*
565 * Policy functions. These define the policies on how the global
566 * worker pool is managed. Unless noted otherwise, these functions
567 * assume that they're being called with gcwq->lock held.
568 */
569
570static bool __need_more_worker(struct global_cwq *gcwq)
571{
572 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
573 gcwq->flags & GCWQ_HIGHPRI_PENDING;
574}
575
576/*
577 * Need to wake up a worker? Called from anything but currently
578 * running workers.
579 */
580static bool need_more_worker(struct global_cwq *gcwq)
581{
582 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
583}
584
585/* Can I start working? Called from busy but !running workers. */
586static bool may_start_working(struct global_cwq *gcwq)
587{
588 return gcwq->nr_idle;
589}
590
591/* Do I need to keep working? Called from currently running workers. */
592static bool keep_working(struct global_cwq *gcwq)
593{
594 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
595
596 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
597}
598
599/* Do we need a new worker? Called from manager. */
600static bool need_to_create_worker(struct global_cwq *gcwq)
601{
602 return need_more_worker(gcwq) && !may_start_working(gcwq);
603}
604
605/* Do I need to be the manager? */
606static bool need_to_manage_workers(struct global_cwq *gcwq)
607{
608 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
609}
610
611/* Do we have too many workers and should some go away? */
612static bool too_many_workers(struct global_cwq *gcwq)
613{
614 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
615 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
616 int nr_busy = gcwq->nr_workers - nr_idle;
226 617
227 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); 618 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
228 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
229 atomic_long_set(&work->data, new);
230} 619}
231 620
232/* 621/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. 622 * Wake up functions.
234 */ 623 */
235static inline void clear_wq_data(struct work_struct *work) 624
625/* Return the first worker. Safe with preemption disabled */
626static struct worker *first_worker(struct global_cwq *gcwq)
236{ 627{
237 unsigned long flags = *work_data_bits(work) & 628 if (unlikely(list_empty(&gcwq->idle_list)))
238 (1UL << WORK_STRUCT_STATIC); 629 return NULL;
239 atomic_long_set(&work->data, flags); 630
631 return list_first_entry(&gcwq->idle_list, struct worker, entry);
240} 632}
241 633
242static inline 634/**
243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 635 * wake_up_worker - wake up an idle worker
636 * @gcwq: gcwq to wake worker for
637 *
638 * Wake up the first idle worker of @gcwq.
639 *
640 * CONTEXT:
641 * spin_lock_irq(gcwq->lock).
642 */
643static void wake_up_worker(struct global_cwq *gcwq)
244{ 644{
245 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 645 struct worker *worker = first_worker(gcwq);
646
647 if (likely(worker))
648 wake_up_process(worker->task);
649}
650
651/**
652 * wq_worker_waking_up - a worker is waking up
653 * @task: task waking up
654 * @cpu: CPU @task is waking up to
655 *
656 * This function is called during try_to_wake_up() when a worker is
657 * being awoken.
658 *
659 * CONTEXT:
660 * spin_lock_irq(rq->lock)
661 */
662void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
663{
664 struct worker *worker = kthread_data(task);
665
666 if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
667 atomic_inc(get_gcwq_nr_running(cpu));
668}
669
670/**
671 * wq_worker_sleeping - a worker is going to sleep
672 * @task: task going to sleep
673 * @cpu: CPU in question, must be the current CPU number
674 *
675 * This function is called during schedule() when a busy worker is
676 * going to sleep. Worker on the same cpu can be woken up by
677 * returning pointer to its task.
678 *
679 * CONTEXT:
680 * spin_lock_irq(rq->lock)
681 *
682 * RETURNS:
683 * Worker task on @cpu to wake up, %NULL if none.
684 */
685struct task_struct *wq_worker_sleeping(struct task_struct *task,
686 unsigned int cpu)
687{
688 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
689 struct global_cwq *gcwq = get_gcwq(cpu);
690 atomic_t *nr_running = get_gcwq_nr_running(cpu);
691
692 if (unlikely(worker->flags & WORKER_NOT_RUNNING))
693 return NULL;
694
695 /* this can only happen on the local cpu */
696 BUG_ON(cpu != raw_smp_processor_id());
697
698 /*
699 * The counterpart of the following dec_and_test, implied mb,
700 * worklist not empty test sequence is in insert_work().
701 * Please read comment there.
702 *
703 * NOT_RUNNING is clear. This means that trustee is not in
704 * charge and we're running on the local cpu w/ rq lock held
705 * and preemption disabled, which in turn means that none else
706 * could be manipulating idle_list, so dereferencing idle_list
707 * without gcwq lock is safe.
708 */
709 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
710 to_wakeup = first_worker(gcwq);
711 return to_wakeup ? to_wakeup->task : NULL;
712}
713
714/**
715 * worker_set_flags - set worker flags and adjust nr_running accordingly
716 * @worker: self
717 * @flags: flags to set
718 * @wakeup: wakeup an idle worker if necessary
719 *
720 * Set @flags in @worker->flags and adjust nr_running accordingly. If
721 * nr_running becomes zero and @wakeup is %true, an idle worker is
722 * woken up.
723 *
724 * CONTEXT:
725 * spin_lock_irq(gcwq->lock)
726 */
727static inline void worker_set_flags(struct worker *worker, unsigned int flags,
728 bool wakeup)
729{
730 struct global_cwq *gcwq = worker->gcwq;
731
732 WARN_ON_ONCE(worker->task != current);
733
734 /*
735 * If transitioning into NOT_RUNNING, adjust nr_running and
736 * wake up an idle worker as necessary if requested by
737 * @wakeup.
738 */
739 if ((flags & WORKER_NOT_RUNNING) &&
740 !(worker->flags & WORKER_NOT_RUNNING)) {
741 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
742
743 if (wakeup) {
744 if (atomic_dec_and_test(nr_running) &&
745 !list_empty(&gcwq->worklist))
746 wake_up_worker(gcwq);
747 } else
748 atomic_dec(nr_running);
749 }
750
751 worker->flags |= flags;
246} 752}
247 753
754/**
755 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
756 * @worker: self
757 * @flags: flags to clear
758 *
759 * Clear @flags in @worker->flags and adjust nr_running accordingly.
760 *
761 * CONTEXT:
762 * spin_lock_irq(gcwq->lock)
763 */
764static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
765{
766 struct global_cwq *gcwq = worker->gcwq;
767 unsigned int oflags = worker->flags;
768
769 WARN_ON_ONCE(worker->task != current);
770
771 worker->flags &= ~flags;
772
773 /* if transitioning out of NOT_RUNNING, increment nr_running */
774 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
775 if (!(worker->flags & WORKER_NOT_RUNNING))
776 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
777}
778
779/**
780 * busy_worker_head - return the busy hash head for a work
781 * @gcwq: gcwq of interest
782 * @work: work to be hashed
783 *
784 * Return hash head of @gcwq for @work.
785 *
786 * CONTEXT:
787 * spin_lock_irq(gcwq->lock).
788 *
789 * RETURNS:
790 * Pointer to the hash head.
791 */
792static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
793 struct work_struct *work)
794{
795 const int base_shift = ilog2(sizeof(struct work_struct));
796 unsigned long v = (unsigned long)work;
797
798 /* simple shift and fold hash, do we need something better? */
799 v >>= base_shift;
800 v += v >> BUSY_WORKER_HASH_ORDER;
801 v &= BUSY_WORKER_HASH_MASK;
802
803 return &gcwq->busy_hash[v];
804}
805
806/**
807 * __find_worker_executing_work - find worker which is executing a work
808 * @gcwq: gcwq of interest
809 * @bwh: hash head as returned by busy_worker_head()
810 * @work: work to find worker for
811 *
812 * Find a worker which is executing @work on @gcwq. @bwh should be
813 * the hash head obtained by calling busy_worker_head() with the same
814 * work.
815 *
816 * CONTEXT:
817 * spin_lock_irq(gcwq->lock).
818 *
819 * RETURNS:
820 * Pointer to worker which is executing @work if found, NULL
821 * otherwise.
822 */
823static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
824 struct hlist_head *bwh,
825 struct work_struct *work)
826{
827 struct worker *worker;
828 struct hlist_node *tmp;
829
830 hlist_for_each_entry(worker, tmp, bwh, hentry)
831 if (worker->current_work == work)
832 return worker;
833 return NULL;
834}
835
/**
 * find_worker_executing_work - find worker which is executing a work
 * @gcwq: gcwq of interest
 * @work: work to find worker for
 *
 * Same lookup as __find_worker_executing_work(), with the hash head
 * computed here from @work.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 *
 * RETURNS:
 * Pointer to worker which is executing @work if found, NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
						 struct work_struct *work)
{
	struct hlist_head *bwh = busy_worker_head(gcwq, work);

	return __find_worker_executing_work(gcwq, bwh, work);
}
858
859/**
860 * gcwq_determine_ins_pos - find insertion position
861 * @gcwq: gcwq of interest
862 * @cwq: cwq a work is being queued for
863 *
864 * A work for @cwq is about to be queued on @gcwq, determine insertion
865 * position for the work. If @cwq is for HIGHPRI wq, the work is
866 * queued at the head of the queue but in FIFO order with respect to
867 * other HIGHPRI works; otherwise, at the end of the queue. This
868 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
869 * there are HIGHPRI works pending.
870 *
871 * CONTEXT:
872 * spin_lock_irq(gcwq->lock).
873 *
874 * RETURNS:
875 * Pointer to inserstion position.
876 */
877static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
878 struct cpu_workqueue_struct *cwq)
879{
880 struct work_struct *twork;
881
882 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
883 return &gcwq->worklist;
884
885 list_for_each_entry(twork, &gcwq->worklist, entry) {
886 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
887
888 if (!(tcwq->wq->flags & WQ_HIGHPRI))
889 break;
890 }
891
892 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
893 return &twork->entry;
894}
895
896/**
897 * insert_work - insert a work into gcwq
898 * @cwq: cwq @work belongs to
899 * @work: work to insert
900 * @head: insertion point
901 * @extra_flags: extra WORK_STRUCT_* flags to set
902 *
903 * Insert @work which belongs to @cwq into @gcwq after @head.
904 * @extra_flags is or'd to work_struct flags.
905 *
906 * CONTEXT:
907 * spin_lock_irq(gcwq->lock).
908 */
248static void insert_work(struct cpu_workqueue_struct *cwq, 909static void insert_work(struct cpu_workqueue_struct *cwq,
249 struct work_struct *work, struct list_head *head) 910 struct work_struct *work, struct list_head *head,
911 unsigned int extra_flags)
250{ 912{
251 trace_workqueue_insertion(cwq->thread, work); 913 struct global_cwq *gcwq = cwq->gcwq;
914
915 /* we own @work, set data and link */
916 set_work_cwq(work, cwq, extra_flags);
252 917
253 set_wq_data(work, cwq);
254 /* 918 /*
255 * Ensure that we get the right work->data if we see the 919 * Ensure that we get the right work->data if we see the
256 * result of list_add() below, see try_to_grab_pending(). 920 * result of list_add() below, see try_to_grab_pending().
257 */ 921 */
258 smp_wmb(); 922 smp_wmb();
923
259 list_add_tail(&work->entry, head); 924 list_add_tail(&work->entry, head);
260 wake_up(&cwq->more_work); 925
926 /*
927 * Ensure either worker_sched_deactivated() sees the above
928 * list_add_tail() or we see zero nr_running to avoid workers
929 * lying around lazily while there are works to be processed.
930 */
931 smp_mb();
932
933 if (__need_more_worker(gcwq))
934 wake_up_worker(gcwq);
261} 935}
262 936
263static void __queue_work(struct cpu_workqueue_struct *cwq, 937static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
264 struct work_struct *work) 938 struct work_struct *work)
265{ 939{
940 struct global_cwq *gcwq;
941 struct cpu_workqueue_struct *cwq;
942 struct list_head *worklist;
266 unsigned long flags; 943 unsigned long flags;
267 944
268 debug_work_activate(work); 945 debug_work_activate(work);
269 spin_lock_irqsave(&cwq->lock, flags); 946
270 insert_work(cwq, work, &cwq->worklist); 947 /* determine gcwq to use */
271 spin_unlock_irqrestore(&cwq->lock, flags); 948 if (!(wq->flags & WQ_UNBOUND)) {
949 struct global_cwq *last_gcwq;
950
951 if (unlikely(cpu == WORK_CPU_UNBOUND))
952 cpu = raw_smp_processor_id();
953
954 /*
955 * It's multi cpu. If @wq is non-reentrant and @work
956 * was previously on a different cpu, it might still
957 * be running there, in which case the work needs to
958 * be queued on that cpu to guarantee non-reentrance.
959 */
960 gcwq = get_gcwq(cpu);
961 if (wq->flags & WQ_NON_REENTRANT &&
962 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
963 struct worker *worker;
964
965 spin_lock_irqsave(&last_gcwq->lock, flags);
966
967 worker = find_worker_executing_work(last_gcwq, work);
968
969 if (worker && worker->current_cwq->wq == wq)
970 gcwq = last_gcwq;
971 else {
972 /* meh... not running there, queue here */
973 spin_unlock_irqrestore(&last_gcwq->lock, flags);
974 spin_lock_irqsave(&gcwq->lock, flags);
975 }
976 } else
977 spin_lock_irqsave(&gcwq->lock, flags);
978 } else {
979 gcwq = get_gcwq(WORK_CPU_UNBOUND);
980 spin_lock_irqsave(&gcwq->lock, flags);
981 }
982
983 /* gcwq determined, get cwq and queue */
984 cwq = get_cwq(gcwq->cpu, wq);
985
986 BUG_ON(!list_empty(&work->entry));
987
988 cwq->nr_in_flight[cwq->work_color]++;
989
990 if (likely(cwq->nr_active < cwq->max_active)) {
991 cwq->nr_active++;
992 worklist = gcwq_determine_ins_pos(gcwq, cwq);
993 } else
994 worklist = &cwq->delayed_works;
995
996 insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));
997
998 spin_unlock_irqrestore(&gcwq->lock, flags);
272} 999}
273 1000
274/** 1001/**
@@ -308,9 +1035,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
308{ 1035{
309 int ret = 0; 1036 int ret = 0;
310 1037
311 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1038 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
312 BUG_ON(!list_empty(&work->entry)); 1039 __queue_work(cpu, wq, work);
313 __queue_work(wq_per_cpu(wq, cpu), work);
314 ret = 1; 1040 ret = 1;
315 } 1041 }
316 return ret; 1042 return ret;
@@ -320,10 +1046,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
320static void delayed_work_timer_fn(unsigned long __data) 1046static void delayed_work_timer_fn(unsigned long __data)
321{ 1047{
322 struct delayed_work *dwork = (struct delayed_work *)__data; 1048 struct delayed_work *dwork = (struct delayed_work *)__data;
323 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 1049 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
324 struct workqueue_struct *wq = cwq->wq;
325 1050
326 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); 1051 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
327} 1052}
328 1053
329/** 1054/**
@@ -360,14 +1085,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
360 struct timer_list *timer = &dwork->timer; 1085 struct timer_list *timer = &dwork->timer;
361 struct work_struct *work = &dwork->work; 1086 struct work_struct *work = &dwork->work;
362 1087
363 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1088 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1089 unsigned int lcpu;
1090
364 BUG_ON(timer_pending(timer)); 1091 BUG_ON(timer_pending(timer));
365 BUG_ON(!list_empty(&work->entry)); 1092 BUG_ON(!list_empty(&work->entry));
366 1093
367 timer_stats_timer_set_start_info(&dwork->timer); 1094 timer_stats_timer_set_start_info(&dwork->timer);
368 1095
369 /* This stores cwq for the moment, for the timer_fn */ 1096 /*
370 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 1097 * This stores cwq for the moment, for the timer_fn.
1098 * Note that the work's gcwq is preserved to allow
1099 * reentrance detection for delayed works.
1100 */
1101 if (!(wq->flags & WQ_UNBOUND)) {
1102 struct global_cwq *gcwq = get_work_gcwq(work);
1103
1104 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1105 lcpu = gcwq->cpu;
1106 else
1107 lcpu = raw_smp_processor_id();
1108 } else
1109 lcpu = WORK_CPU_UNBOUND;
1110
1111 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1112
371 timer->expires = jiffies + delay; 1113 timer->expires = jiffies + delay;
372 timer->data = (unsigned long)dwork; 1114 timer->data = (unsigned long)dwork;
373 timer->function = delayed_work_timer_fn; 1115 timer->function = delayed_work_timer_fn;
@@ -382,80 +1124,872 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
382} 1124}
383EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1125EXPORT_SYMBOL_GPL(queue_delayed_work_on);
384 1126
385static void run_workqueue(struct cpu_workqueue_struct *cwq) 1127/**
1128 * worker_enter_idle - enter idle state
1129 * @worker: worker which is entering idle state
1130 *
1131 * @worker is entering idle state. Update stats and idle timer if
1132 * necessary.
1133 *
1134 * LOCKING:
1135 * spin_lock_irq(gcwq->lock).
1136 */
1137static void worker_enter_idle(struct worker *worker)
386{ 1138{
387 spin_lock_irq(&cwq->lock); 1139 struct global_cwq *gcwq = worker->gcwq;
388 while (!list_empty(&cwq->worklist)) { 1140
389 struct work_struct *work = list_entry(cwq->worklist.next, 1141 BUG_ON(worker->flags & WORKER_IDLE);
390 struct work_struct, entry); 1142 BUG_ON(!list_empty(&worker->entry) &&
391 work_func_t f = work->func; 1143 (worker->hentry.next || worker->hentry.pprev));
392#ifdef CONFIG_LOCKDEP 1144
1145 /* can't use worker_set_flags(), also called from start_worker() */
1146 worker->flags |= WORKER_IDLE;
1147 gcwq->nr_idle++;
1148 worker->last_active = jiffies;
1149
1150 /* idle_list is LIFO */
1151 list_add(&worker->entry, &gcwq->idle_list);
1152
1153 if (likely(!(worker->flags & WORKER_ROGUE))) {
1154 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1155 mod_timer(&gcwq->idle_timer,
1156 jiffies + IDLE_WORKER_TIMEOUT);
1157 } else
1158 wake_up_all(&gcwq->trustee_wait);
1159
1160 /* sanity check nr_running */
1161 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1162 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1163}
1164
1165/**
1166 * worker_leave_idle - leave idle state
1167 * @worker: worker which is leaving idle state
1168 *
1169 * @worker is leaving idle state. Update stats.
1170 *
1171 * LOCKING:
1172 * spin_lock_irq(gcwq->lock).
1173 */
1174static void worker_leave_idle(struct worker *worker)
1175{
1176 struct global_cwq *gcwq = worker->gcwq;
1177
1178 BUG_ON(!(worker->flags & WORKER_IDLE));
1179 worker_clr_flags(worker, WORKER_IDLE);
1180 gcwq->nr_idle--;
1181 list_del_init(&worker->entry);
1182}
1183
1184/**
1185 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1186 * @worker: self
1187 *
1188 * Works which are scheduled while the cpu is online must at least be
1189 * scheduled to a worker which is bound to the cpu so that if they are
1190 * flushed from cpu callbacks while cpu is going down, they are
1191 * guaranteed to execute on the cpu.
1192 *
1193 * This function is to be used by rogue workers and rescuers to bind
1194 * themselves to the target cpu and may race with cpu going down or
1195 * coming online. kthread_bind() can't be used because it may put the
1196 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1197 * verbatim as it's best effort and blocking and gcwq may be
1198 * [dis]associated in the meantime.
1199 *
1200 * This function tries set_cpus_allowed() and locks gcwq and verifies
1201 * the binding against GCWQ_DISASSOCIATED which is set during
1202 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1203 * idle state or fetches works without dropping lock, it can guarantee
1204 * the scheduling requirement described in the first paragraph.
1205 *
1206 * CONTEXT:
1207 * Might sleep. Called without any lock but returns with gcwq->lock
1208 * held.
1209 *
1210 * RETURNS:
1211 * %true if the associated gcwq is online (@worker is successfully
1212 * bound), %false if offline.
1213 */
1214static bool worker_maybe_bind_and_lock(struct worker *worker)
1215{
1216 struct global_cwq *gcwq = worker->gcwq;
1217 struct task_struct *task = worker->task;
1218
1219 while (true) {
393 /* 1220 /*
394 * It is permissible to free the struct work_struct 1221 * The following call may fail, succeed or succeed
395 * from inside the function that is called from it, 1222 * without actually migrating the task to the cpu if
396 * this we need to take into account for lockdep too. 1223 * it races with cpu hotunplug operation. Verify
397 * To avoid bogus "held lock freed" warnings as well 1224 * against GCWQ_DISASSOCIATED.
398 * as problems when looking into work->lockdep_map,
399 * make a copy and use that here.
400 */ 1225 */
401 struct lockdep_map lockdep_map = work->lockdep_map; 1226 if (!(gcwq->flags & GCWQ_DISASSOCIATED))
402#endif 1227 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
403 trace_workqueue_execution(cwq->thread, work); 1228
404 debug_work_deactivate(work); 1229 spin_lock_irq(&gcwq->lock);
405 cwq->current_work = work; 1230 if (gcwq->flags & GCWQ_DISASSOCIATED)
406 list_del_init(cwq->worklist.next); 1231 return false;
407 spin_unlock_irq(&cwq->lock); 1232 if (task_cpu(task) == gcwq->cpu &&
408 1233 cpumask_equal(&current->cpus_allowed,
409 BUG_ON(get_wq_data(work) != cwq); 1234 get_cpu_mask(gcwq->cpu)))
410 work_clear_pending(work); 1235 return true;
411 lock_map_acquire(&cwq->wq->lockdep_map); 1236 spin_unlock_irq(&gcwq->lock);
412 lock_map_acquire(&lockdep_map); 1237
413 f(work); 1238 /* CPU has come up inbetween, retry migration */
414 lock_map_release(&lockdep_map); 1239 cpu_relax();
415 lock_map_release(&cwq->wq->lockdep_map); 1240 }
416 1241}
417 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1242
418 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 1243/*
419 "%s/0x%08x/%d\n", 1244 * Function for worker->rebind_work used to rebind rogue busy workers
420 current->comm, preempt_count(), 1245 * to the associated cpu which is coming back online. This is
421 task_pid_nr(current)); 1246 * scheduled by cpu up but can race with other cpu hotplug operations
422 printk(KERN_ERR " last function: "); 1247 * and may be executed twice without intervening cpu down.
423 print_symbol("%s\n", (unsigned long)f); 1248 */
424 debug_show_held_locks(current); 1249static void worker_rebind_fn(struct work_struct *work)
425 dump_stack(); 1250{
1251 struct worker *worker = container_of(work, struct worker, rebind_work);
1252 struct global_cwq *gcwq = worker->gcwq;
1253
1254 if (worker_maybe_bind_and_lock(worker))
1255 worker_clr_flags(worker, WORKER_REBIND);
1256
1257 spin_unlock_irq(&gcwq->lock);
1258}
1259
1260static struct worker *alloc_worker(void)
1261{
1262 struct worker *worker;
1263
1264 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1265 if (worker) {
1266 INIT_LIST_HEAD(&worker->entry);
1267 INIT_LIST_HEAD(&worker->scheduled);
1268 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1269 /* on creation a worker is in !idle && prep state */
1270 worker->flags = WORKER_PREP;
1271 }
1272 return worker;
1273}
1274
1275/**
1276 * create_worker - create a new workqueue worker
1277 * @gcwq: gcwq the new worker will belong to
1278 * @bind: whether to set affinity to @cpu or not
1279 *
1280 * Create a new worker which is bound to @gcwq. The returned worker
1281 * can be started by calling start_worker() or destroyed using
1282 * destroy_worker().
1283 *
1284 * CONTEXT:
1285 * Might sleep. Does GFP_KERNEL allocations.
1286 *
1287 * RETURNS:
1288 * Pointer to the newly created worker.
1289 */
1290static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1291{
1292 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1293 struct worker *worker = NULL;
1294 int id = -1;
1295
1296 spin_lock_irq(&gcwq->lock);
1297 while (ida_get_new(&gcwq->worker_ida, &id)) {
1298 spin_unlock_irq(&gcwq->lock);
1299 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1300 goto fail;
1301 spin_lock_irq(&gcwq->lock);
1302 }
1303 spin_unlock_irq(&gcwq->lock);
1304
1305 worker = alloc_worker();
1306 if (!worker)
1307 goto fail;
1308
1309 worker->gcwq = gcwq;
1310 worker->id = id;
1311
1312 if (!on_unbound_cpu)
1313 worker->task = kthread_create(worker_thread, worker,
1314 "kworker/%u:%d", gcwq->cpu, id);
1315 else
1316 worker->task = kthread_create(worker_thread, worker,
1317 "kworker/u:%d", id);
1318 if (IS_ERR(worker->task))
1319 goto fail;
1320
1321 /*
1322 * A rogue worker will become a regular one if CPU comes
1323 * online later on. Make sure every worker has
1324 * PF_THREAD_BOUND set.
1325 */
1326 if (bind && !on_unbound_cpu)
1327 kthread_bind(worker->task, gcwq->cpu);
1328 else {
1329 worker->task->flags |= PF_THREAD_BOUND;
1330 if (on_unbound_cpu)
1331 worker->flags |= WORKER_UNBOUND;
1332 }
1333
1334 return worker;
1335fail:
1336 if (id >= 0) {
1337 spin_lock_irq(&gcwq->lock);
1338 ida_remove(&gcwq->worker_ida, id);
1339 spin_unlock_irq(&gcwq->lock);
1340 }
1341 kfree(worker);
1342 return NULL;
1343}
1344
1345/**
1346 * start_worker - start a newly created worker
1347 * @worker: worker to start
1348 *
1349 * Make the gcwq aware of @worker and start it.
1350 *
1351 * CONTEXT:
1352 * spin_lock_irq(gcwq->lock).
1353 */
1354static void start_worker(struct worker *worker)
1355{
1356 worker->flags |= WORKER_STARTED;
1357 worker->gcwq->nr_workers++;
1358 worker_enter_idle(worker);
1359 wake_up_process(worker->task);
1360}
1361
1362/**
1363 * destroy_worker - destroy a workqueue worker
1364 * @worker: worker to be destroyed
1365 *
1366 * Destroy @worker and adjust @gcwq stats accordingly.
1367 *
1368 * CONTEXT:
1369 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1370 */
1371static void destroy_worker(struct worker *worker)
1372{
1373 struct global_cwq *gcwq = worker->gcwq;
1374 int id = worker->id;
1375
1376 /* sanity check frenzy */
1377 BUG_ON(worker->current_work);
1378 BUG_ON(!list_empty(&worker->scheduled));
1379
1380 if (worker->flags & WORKER_STARTED)
1381 gcwq->nr_workers--;
1382 if (worker->flags & WORKER_IDLE)
1383 gcwq->nr_idle--;
1384
1385 list_del_init(&worker->entry);
1386 worker->flags |= WORKER_DIE;
1387
1388 spin_unlock_irq(&gcwq->lock);
1389
1390 kthread_stop(worker->task);
1391 kfree(worker);
1392
1393 spin_lock_irq(&gcwq->lock);
1394 ida_remove(&gcwq->worker_ida, id);
1395}
1396
1397static void idle_worker_timeout(unsigned long __gcwq)
1398{
1399 struct global_cwq *gcwq = (void *)__gcwq;
1400
1401 spin_lock_irq(&gcwq->lock);
1402
1403 if (too_many_workers(gcwq)) {
1404 struct worker *worker;
1405 unsigned long expires;
1406
1407 /* idle_list is kept in LIFO order, check the last one */
1408 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1409 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1410
1411 if (time_before(jiffies, expires))
1412 mod_timer(&gcwq->idle_timer, expires);
1413 else {
1414 /* it's been idle for too long, wake up manager */
1415 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1416 wake_up_worker(gcwq);
1417 }
1418 }
1419
1420 spin_unlock_irq(&gcwq->lock);
1421}
1422
1423static bool send_mayday(struct work_struct *work)
1424{
1425 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1426 struct workqueue_struct *wq = cwq->wq;
1427 unsigned int cpu;
1428
1429 if (!(wq->flags & WQ_RESCUER))
1430 return false;
1431
1432 /* mayday mayday mayday */
1433 cpu = cwq->gcwq->cpu;
1434 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1435 if (cpu == WORK_CPU_UNBOUND)
1436 cpu = 0;
1437 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1438 wake_up_process(wq->rescuer->task);
1439 return true;
1440}
1441
1442static void gcwq_mayday_timeout(unsigned long __gcwq)
1443{
1444 struct global_cwq *gcwq = (void *)__gcwq;
1445 struct work_struct *work;
1446
1447 spin_lock_irq(&gcwq->lock);
1448
1449 if (need_to_create_worker(gcwq)) {
1450 /*
1451 * We've been trying to create a new worker but
1452 * haven't been successful. We might be hitting an
1453 * allocation deadlock. Send distress signals to
1454 * rescuers.
1455 */
1456 list_for_each_entry(work, &gcwq->worklist, entry)
1457 send_mayday(work);
1458 }
1459
1460 spin_unlock_irq(&gcwq->lock);
1461
1462 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1463}
1464
1465/**
1466 * maybe_create_worker - create a new worker if necessary
1467 * @gcwq: gcwq to create a new worker for
1468 *
1469 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1470 * have at least one idle worker on return from this function. If
1471 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1472 * sent to all rescuers with works scheduled on @gcwq to resolve
1473 * possible allocation deadlock.
1474 *
1475 * On return, need_to_create_worker() is guaranteed to be false and
1476 * may_start_working() true.
1477 *
1478 * LOCKING:
1479 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1480 * multiple times. Does GFP_KERNEL allocations. Called only from
1481 * manager.
1482 *
1483 * RETURNS:
1484 * false if no action was taken and gcwq->lock stayed locked, true
1485 * otherwise.
1486 */
1487static bool maybe_create_worker(struct global_cwq *gcwq)
1488{
1489 if (!need_to_create_worker(gcwq))
1490 return false;
1491restart:
1492 spin_unlock_irq(&gcwq->lock);
1493
1494 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1495 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1496
1497 while (true) {
1498 struct worker *worker;
1499
1500 worker = create_worker(gcwq, true);
1501 if (worker) {
1502 del_timer_sync(&gcwq->mayday_timer);
1503 spin_lock_irq(&gcwq->lock);
1504 start_worker(worker);
1505 BUG_ON(need_to_create_worker(gcwq));
1506 return true;
426 } 1507 }
427 1508
428 spin_lock_irq(&cwq->lock); 1509 if (!need_to_create_worker(gcwq))
429 cwq->current_work = NULL; 1510 break;
1511
1512 __set_current_state(TASK_INTERRUPTIBLE);
1513 schedule_timeout(CREATE_COOLDOWN);
1514
1515 if (!need_to_create_worker(gcwq))
1516 break;
430 } 1517 }
431 spin_unlock_irq(&cwq->lock); 1518
1519 del_timer_sync(&gcwq->mayday_timer);
1520 spin_lock_irq(&gcwq->lock);
1521 if (need_to_create_worker(gcwq))
1522 goto restart;
1523 return true;
432} 1524}
433 1525
434static int worker_thread(void *__cwq) 1526/**
1527 * maybe_destroy_worker - destroy workers which have been idle for a while
1528 * @gcwq: gcwq to destroy workers for
1529 *
1530 * Destroy @gcwq workers which have been idle for longer than
1531 * IDLE_WORKER_TIMEOUT.
1532 *
1533 * LOCKING:
1534 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1535 * multiple times. Called only from manager.
1536 *
1537 * RETURNS:
1538 * false if no action was taken and gcwq->lock stayed locked, true
1539 * otherwise.
1540 */
1541static bool maybe_destroy_workers(struct global_cwq *gcwq)
435{ 1542{
436 struct cpu_workqueue_struct *cwq = __cwq; 1543 bool ret = false;
437 DEFINE_WAIT(wait);
438 1544
439 if (cwq->wq->freezeable) 1545 while (too_many_workers(gcwq)) {
440 set_freezable(); 1546 struct worker *worker;
1547 unsigned long expires;
441 1548
442 for (;;) { 1549 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
443 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 1550 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
444 if (!freezing(current) &&
445 !kthread_should_stop() &&
446 list_empty(&cwq->worklist))
447 schedule();
448 finish_wait(&cwq->more_work, &wait);
449 1551
450 try_to_freeze(); 1552 if (time_before(jiffies, expires)) {
1553 mod_timer(&gcwq->idle_timer, expires);
1554 break;
1555 }
451 1556
452 if (kthread_should_stop()) 1557 destroy_worker(worker);
1558 ret = true;
1559 }
1560
1561 return ret;
1562}
1563
1564/**
1565 * manage_workers - manage worker pool
1566 * @worker: self
1567 *
1568 * Assume the manager role and manage gcwq worker pool @worker belongs
1569 * to. At any given time, there can be only zero or one manager per
1570 * gcwq. The exclusion is handled automatically by this function.
1571 *
1572 * The caller can safely start processing works on false return. On
1573 * true return, it's guaranteed that need_to_create_worker() is false
1574 * and may_start_working() is true.
1575 *
1576 * CONTEXT:
1577 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1578 * multiple times. Does GFP_KERNEL allocations.
1579 *
1580 * RETURNS:
1581 * false if no action was taken and gcwq->lock stayed locked, true if
1582 * some action was taken.
1583 */
1584static bool manage_workers(struct worker *worker)
1585{
1586 struct global_cwq *gcwq = worker->gcwq;
1587 bool ret = false;
1588
1589 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1590 return ret;
1591
1592 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1593 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1594
1595 /*
1596 * Destroy and then create so that may_start_working() is true
1597 * on return.
1598 */
1599 ret |= maybe_destroy_workers(gcwq);
1600 ret |= maybe_create_worker(gcwq);
1601
1602 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1603
1604 /*
1605 * The trustee might be waiting to take over the manager
1606 * position, tell it we're done.
1607 */
1608 if (unlikely(gcwq->trustee))
1609 wake_up_all(&gcwq->trustee_wait);
1610
1611 return ret;
1612}
1613
1614/**
1615 * move_linked_works - move linked works to a list
1616 * @work: start of series of works to be scheduled
1617 * @head: target list to append @work to
1618 * @nextp: out paramter for nested worklist walking
1619 *
1620 * Schedule linked works starting from @work to @head. Work series to
1621 * be scheduled starts at @work and includes any consecutive work with
1622 * WORK_STRUCT_LINKED set in its predecessor.
1623 *
1624 * If @nextp is not NULL, it's updated to point to the next work of
1625 * the last scheduled work. This allows move_linked_works() to be
1626 * nested inside outer list_for_each_entry_safe().
1627 *
1628 * CONTEXT:
1629 * spin_lock_irq(gcwq->lock).
1630 */
1631static void move_linked_works(struct work_struct *work, struct list_head *head,
1632 struct work_struct **nextp)
1633{
1634 struct work_struct *n;
1635
1636 /*
1637 * Linked worklist will always end before the end of the list,
1638 * use NULL for list head.
1639 */
1640 list_for_each_entry_safe_from(work, n, NULL, entry) {
1641 list_move_tail(&work->entry, head);
1642 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
453 break; 1643 break;
1644 }
1645
1646 /*
1647 * If we're already inside safe list traversal and have moved
1648 * multiple works to the scheduled queue, the next position
1649 * needs to be updated.
1650 */
1651 if (nextp)
1652 *nextp = n;
1653}
1654
1655static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1656{
1657 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1658 struct work_struct, entry);
1659 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1660
1661 move_linked_works(work, pos, NULL);
1662 cwq->nr_active++;
1663}
454 1664
455 run_workqueue(cwq); 1665/**
1666 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1667 * @cwq: cwq of interest
1668 * @color: color of work which left the queue
1669 *
1670 * A work either has completed or is removed from pending queue,
1671 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1672 *
1673 * CONTEXT:
1674 * spin_lock_irq(gcwq->lock).
1675 */
1676static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1677{
1678 /* ignore uncolored works */
1679 if (color == WORK_NO_COLOR)
1680 return;
1681
1682 cwq->nr_in_flight[color]--;
1683 cwq->nr_active--;
1684
1685 if (!list_empty(&cwq->delayed_works)) {
1686 /* one down, submit a delayed one */
1687 if (cwq->nr_active < cwq->max_active)
1688 cwq_activate_first_delayed(cwq);
456 } 1689 }
457 1690
458 return 0; 1691 /* is flush in progress and are we at the flushing tip? */
1692 if (likely(cwq->flush_color != color))
1693 return;
1694
1695 /* are there still in-flight works? */
1696 if (cwq->nr_in_flight[color])
1697 return;
1698
1699 /* this cwq is done, clear flush_color */
1700 cwq->flush_color = -1;
1701
1702 /*
1703 * If this was the last cwq, wake up the first flusher. It
1704 * will handle the rest.
1705 */
1706 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1707 complete(&cwq->wq->first_flusher->done);
1708}
1709
1710/**
1711 * process_one_work - process single work
1712 * @worker: self
1713 * @work: work to process
1714 *
1715 * Process @work. This function contains all the logics necessary to
1716 * process a single work including synchronization against and
1717 * interaction with other workers on the same cpu, queueing and
1718 * flushing. As long as context requirement is met, any worker can
1719 * call this function to process a work.
1720 *
1721 * CONTEXT:
1722 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1723 */
1724static void process_one_work(struct worker *worker, struct work_struct *work)
1725{
1726 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1727 struct global_cwq *gcwq = cwq->gcwq;
1728 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1729 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1730 work_func_t f = work->func;
1731 int work_color;
1732 struct worker *collision;
1733#ifdef CONFIG_LOCKDEP
1734 /*
1735 * It is permissible to free the struct work_struct from
1736 * inside the function that is called from it, this we need to
1737 * take into account for lockdep too. To avoid bogus "held
1738 * lock freed" warnings as well as problems when looking into
1739 * work->lockdep_map, make a copy and use that here.
1740 */
1741 struct lockdep_map lockdep_map = work->lockdep_map;
1742#endif
1743 /*
1744 * A single work shouldn't be executed concurrently by
1745 * multiple workers on a single cpu. Check whether anyone is
1746 * already processing the work. If so, defer the work to the
1747 * currently executing one.
1748 */
1749 collision = __find_worker_executing_work(gcwq, bwh, work);
1750 if (unlikely(collision)) {
1751 move_linked_works(work, &collision->scheduled, NULL);
1752 return;
1753 }
1754
1755 /* claim and process */
1756 debug_work_deactivate(work);
1757 hlist_add_head(&worker->hentry, bwh);
1758 worker->current_work = work;
1759 worker->current_cwq = cwq;
1760 work_color = get_work_color(work);
1761
1762 /* record the current cpu number in the work data and dequeue */
1763 set_work_cpu(work, gcwq->cpu);
1764 list_del_init(&work->entry);
1765
1766 /*
1767 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1768 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1769 */
1770 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1771 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1772 struct work_struct, entry);
1773
1774 if (!list_empty(&gcwq->worklist) &&
1775 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1776 wake_up_worker(gcwq);
1777 else
1778 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1779 }
1780
1781 /*
1782 * CPU intensive works don't participate in concurrency
1783 * management. They're the scheduler's responsibility.
1784 */
1785 if (unlikely(cpu_intensive))
1786 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1787
1788 spin_unlock_irq(&gcwq->lock);
1789
1790 work_clear_pending(work);
1791 lock_map_acquire(&cwq->wq->lockdep_map);
1792 lock_map_acquire(&lockdep_map);
1793 f(work);
1794 lock_map_release(&lockdep_map);
1795 lock_map_release(&cwq->wq->lockdep_map);
1796
1797 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1798 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1799 "%s/0x%08x/%d\n",
1800 current->comm, preempt_count(), task_pid_nr(current));
1801 printk(KERN_ERR " last function: ");
1802 print_symbol("%s\n", (unsigned long)f);
1803 debug_show_held_locks(current);
1804 dump_stack();
1805 }
1806
1807 spin_lock_irq(&gcwq->lock);
1808
1809 /* clear cpu intensive status */
1810 if (unlikely(cpu_intensive))
1811 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1812
1813 /* we're done with it, release */
1814 hlist_del_init(&worker->hentry);
1815 worker->current_work = NULL;
1816 worker->current_cwq = NULL;
1817 cwq_dec_nr_in_flight(cwq, work_color);
1818}
1819
1820/**
1821 * process_scheduled_works - process scheduled works
1822 * @worker: self
1823 *
1824 * Process all scheduled works. Please note that the scheduled list
1825 * may change while processing a work, so this function repeatedly
1826 * fetches a work from the top and executes it.
1827 *
1828 * CONTEXT:
1829 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1830 * multiple times.
1831 */
1832static void process_scheduled_works(struct worker *worker)
1833{
1834 while (!list_empty(&worker->scheduled)) {
1835 struct work_struct *work = list_first_entry(&worker->scheduled,
1836 struct work_struct, entry);
1837 process_one_work(worker, work);
1838 }
1839}
1840
1841/**
1842 * worker_thread - the worker thread function
1843 * @__worker: self
1844 *
1845 * The gcwq worker thread function. There's a single dynamic pool of
1846 * these per each cpu. These workers process all works regardless of
1847 * their specific target workqueue. The only exception is works which
1848 * belong to workqueues with a rescuer which will be explained in
1849 * rescuer_thread().
1850 */
1851static int worker_thread(void *__worker)
1852{
1853 struct worker *worker = __worker;
1854 struct global_cwq *gcwq = worker->gcwq;
1855
1856 /* tell the scheduler that this is a workqueue worker */
1857 worker->task->flags |= PF_WQ_WORKER;
1858woke_up:
1859 spin_lock_irq(&gcwq->lock);
1860
1861 /* DIE can be set only while we're idle, checking here is enough */
1862 if (worker->flags & WORKER_DIE) {
1863 spin_unlock_irq(&gcwq->lock);
1864 worker->task->flags &= ~PF_WQ_WORKER;
1865 return 0;
1866 }
1867
1868 worker_leave_idle(worker);
1869recheck:
1870 /* no more worker necessary? */
1871 if (!need_more_worker(gcwq))
1872 goto sleep;
1873
1874 /* do we need to manage? */
1875 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1876 goto recheck;
1877
1878 /*
1879 * ->scheduled list can only be filled while a worker is
1880 * preparing to process a work or actually processing it.
1881 * Make sure nobody diddled with it while I was sleeping.
1882 */
1883 BUG_ON(!list_empty(&worker->scheduled));
1884
1885 /*
1886 * When control reaches this point, we're guaranteed to have
1887 * at least one idle worker or that someone else has already
1888 * assumed the manager role.
1889 */
1890 worker_clr_flags(worker, WORKER_PREP);
1891
1892 do {
1893 struct work_struct *work =
1894 list_first_entry(&gcwq->worklist,
1895 struct work_struct, entry);
1896
1897 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1898 /* optimization path, not strictly necessary */
1899 process_one_work(worker, work);
1900 if (unlikely(!list_empty(&worker->scheduled)))
1901 process_scheduled_works(worker);
1902 } else {
1903 move_linked_works(work, &worker->scheduled, NULL);
1904 process_scheduled_works(worker);
1905 }
1906 } while (keep_working(gcwq));
1907
1908 worker_set_flags(worker, WORKER_PREP, false);
1909sleep:
1910 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1911 goto recheck;
1912
1913 /*
1914 * gcwq->lock is held and there's no work to process and no
1915 * need to manage, sleep. Workers are woken up only while
1916 * holding gcwq->lock or from local cpu, so setting the
1917 * current state before releasing gcwq->lock is enough to
1918 * prevent losing any event.
1919 */
1920 worker_enter_idle(worker);
1921 __set_current_state(TASK_INTERRUPTIBLE);
1922 spin_unlock_irq(&gcwq->lock);
1923 schedule();
1924 goto woke_up;
1925}
1926
1927/**
1928 * rescuer_thread - the rescuer thread function
1929 * @__wq: the associated workqueue
1930 *
1931 * Workqueue rescuer thread function. There's one rescuer for each
1932 * workqueue which has WQ_RESCUER set.
1933 *
1934 * Regular work processing on a gcwq may block trying to create a new
1935 * worker which uses GFP_KERNEL allocation which has slight chance of
1936 * developing into deadlock if some works currently on the same queue
1937 * need to be processed to satisfy the GFP_KERNEL allocation. This is
1938 * the problem rescuer solves.
1939 *
1940 * When such condition is possible, the gcwq summons rescuers of all
1941 * workqueues which have works queued on the gcwq and let them process
1942 * those works so that forward progress can be guaranteed.
1943 *
1944 * This should happen rarely.
1945 */
1946static int rescuer_thread(void *__wq)
1947{
1948 struct workqueue_struct *wq = __wq;
1949 struct worker *rescuer = wq->rescuer;
1950 struct list_head *scheduled = &rescuer->scheduled;
1951 bool is_unbound = wq->flags & WQ_UNBOUND;
1952 unsigned int cpu;
1953
1954 set_user_nice(current, RESCUER_NICE_LEVEL);
1955repeat:
1956 set_current_state(TASK_INTERRUPTIBLE);
1957
1958 if (kthread_should_stop())
1959 return 0;
1960
1961 /*
1962 * See whether any cpu is asking for help. Unbounded
1963 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
1964 */
1965 for_each_mayday_cpu(cpu, wq->mayday_mask) {
1966 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
1967 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
1968 struct global_cwq *gcwq = cwq->gcwq;
1969 struct work_struct *work, *n;
1970
1971 __set_current_state(TASK_RUNNING);
1972 mayday_clear_cpu(cpu, wq->mayday_mask);
1973
1974 /* migrate to the target cpu if possible */
1975 rescuer->gcwq = gcwq;
1976 worker_maybe_bind_and_lock(rescuer);
1977
1978 /*
1979 * Slurp in all works issued via this workqueue and
1980 * process'em.
1981 */
1982 BUG_ON(!list_empty(&rescuer->scheduled));
1983 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
1984 if (get_work_cwq(work) == cwq)
1985 move_linked_works(work, scheduled, &n);
1986
1987 process_scheduled_works(rescuer);
1988 spin_unlock_irq(&gcwq->lock);
1989 }
1990
1991 schedule();
1992 goto repeat;
459} 1993}
460 1994
461struct wq_barrier { 1995struct wq_barrier {
@@ -469,44 +2003,137 @@ static void wq_barrier_func(struct work_struct *work)
469 complete(&barr->done); 2003 complete(&barr->done);
470} 2004}
471 2005
2006/**
2007 * insert_wq_barrier - insert a barrier work
2008 * @cwq: cwq to insert barrier into
2009 * @barr: wq_barrier to insert
2010 * @target: target work to attach @barr to
2011 * @worker: worker currently executing @target, NULL if @target is not executing
2012 *
2013 * @barr is linked to @target such that @barr is completed only after
2014 * @target finishes execution. Please note that the ordering
2015 * guarantee is observed only with respect to @target and on the local
2016 * cpu.
2017 *
2018 * Currently, a queued barrier can't be canceled. This is because
2019 * try_to_grab_pending() can't determine whether the work to be
2020 * grabbed is at the head of the queue and thus can't clear LINKED
2021 * flag of the previous work while there must be a valid next work
2022 * after a work with LINKED flag set.
2023 *
2024 * Note that when @worker is non-NULL, @target may be modified
2025 * underneath us, so we can't reliably determine cwq from @target.
2026 *
2027 * CONTEXT:
2028 * spin_lock_irq(gcwq->lock).
2029 */
472static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2030static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
473 struct wq_barrier *barr, struct list_head *head) 2031 struct wq_barrier *barr,
2032 struct work_struct *target, struct worker *worker)
474{ 2033{
2034 struct list_head *head;
2035 unsigned int linked = 0;
2036
475 /* 2037 /*
476 * debugobject calls are safe here even with cwq->lock locked 2038 * debugobject calls are safe here even with gcwq->lock locked
477 * as we know for sure that this will not trigger any of the 2039 * as we know for sure that this will not trigger any of the
478 * checks and call back into the fixup functions where we 2040 * checks and call back into the fixup functions where we
479 * might deadlock. 2041 * might deadlock.
480 */ 2042 */
481 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2043 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
482 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 2044 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
483
484 init_completion(&barr->done); 2045 init_completion(&barr->done);
485 2046
2047 /*
2048 * If @target is currently being executed, schedule the
2049 * barrier to the worker; otherwise, put it after @target.
2050 */
2051 if (worker)
2052 head = worker->scheduled.next;
2053 else {
2054 unsigned long *bits = work_data_bits(target);
2055
2056 head = target->entry.next;
2057 /* there can already be other linked works, inherit and set */
2058 linked = *bits & WORK_STRUCT_LINKED;
2059 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2060 }
2061
486 debug_work_activate(&barr->work); 2062 debug_work_activate(&barr->work);
487 insert_work(cwq, &barr->work, head); 2063 insert_work(cwq, &barr->work, head,
2064 work_color_to_flags(WORK_NO_COLOR) | linked);
488} 2065}
489 2066
490static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 2067/**
2068 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2069 * @wq: workqueue being flushed
2070 * @flush_color: new flush color, < 0 for no-op
2071 * @work_color: new work color, < 0 for no-op
2072 *
2073 * Prepare cwqs for workqueue flushing.
2074 *
2075 * If @flush_color is non-negative, flush_color on all cwqs should be
2076 * -1. If no cwq has in-flight commands at the specified color, all
2077 * cwq->flush_color's stay at -1 and %false is returned. If any cwq
2078 * has in flight commands, its cwq->flush_color is set to
2079 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2080 * wakeup logic is armed and %true is returned.
2081 *
2082 * The caller should have initialized @wq->first_flusher prior to
2083 * calling this function with non-negative @flush_color. If
2084 * @flush_color is negative, no flush color update is done and %false
2085 * is returned.
2086 *
2087 * If @work_color is non-negative, all cwqs should have the same
2088 * work_color which is previous to @work_color and all will be
2089 * advanced to @work_color.
2090 *
2091 * CONTEXT:
2092 * mutex_lock(wq->flush_mutex).
2093 *
2094 * RETURNS:
2095 * %true if @flush_color >= 0 and there's something to flush. %false
2096 * otherwise.
2097 */
2098static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2099 int flush_color, int work_color)
491{ 2100{
492 int active = 0; 2101 bool wait = false;
493 struct wq_barrier barr; 2102 unsigned int cpu;
494 2103
495 WARN_ON(cwq->thread == current); 2104 if (flush_color >= 0) {
496 2105 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
497 spin_lock_irq(&cwq->lock); 2106 atomic_set(&wq->nr_cwqs_to_flush, 1);
498 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
499 insert_wq_barrier(cwq, &barr, &cwq->worklist);
500 active = 1;
501 } 2107 }
502 spin_unlock_irq(&cwq->lock);
503 2108
504 if (active) { 2109 for_each_cwq_cpu(cpu, wq) {
505 wait_for_completion(&barr.done); 2110 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
506 destroy_work_on_stack(&barr.work); 2111 struct global_cwq *gcwq = cwq->gcwq;
2112
2113 spin_lock_irq(&gcwq->lock);
2114
2115 if (flush_color >= 0) {
2116 BUG_ON(cwq->flush_color != -1);
2117
2118 if (cwq->nr_in_flight[flush_color]) {
2119 cwq->flush_color = flush_color;
2120 atomic_inc(&wq->nr_cwqs_to_flush);
2121 wait = true;
2122 }
2123 }
2124
2125 if (work_color >= 0) {
2126 BUG_ON(work_color != work_next_color(cwq->work_color));
2127 cwq->work_color = work_color;
2128 }
2129
2130 spin_unlock_irq(&gcwq->lock);
507 } 2131 }
508 2132
509 return active; 2133 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2134 complete(&wq->first_flusher->done);
2135
2136 return wait;
510} 2137}
511 2138
512/** 2139/**
@@ -518,20 +2145,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
518 * 2145 *
519 * We sleep until all works which were queued on entry have been handled, 2146 * We sleep until all works which were queued on entry have been handled,
520 * but we are not livelocked by new incoming ones. 2147 * but we are not livelocked by new incoming ones.
521 *
522 * This function used to run the workqueues itself. Now we just wait for the
523 * helper threads to do it.
524 */ 2148 */
525void flush_workqueue(struct workqueue_struct *wq) 2149void flush_workqueue(struct workqueue_struct *wq)
526{ 2150{
527 const struct cpumask *cpu_map = wq_cpu_map(wq); 2151 struct wq_flusher this_flusher = {
528 int cpu; 2152 .list = LIST_HEAD_INIT(this_flusher.list),
2153 .flush_color = -1,
2154 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2155 };
2156 int next_color;
529 2157
530 might_sleep();
531 lock_map_acquire(&wq->lockdep_map); 2158 lock_map_acquire(&wq->lockdep_map);
532 lock_map_release(&wq->lockdep_map); 2159 lock_map_release(&wq->lockdep_map);
533 for_each_cpu(cpu, cpu_map) 2160
534 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 2161 mutex_lock(&wq->flush_mutex);
2162
2163 /*
2164 * Start-to-wait phase
2165 */
2166 next_color = work_next_color(wq->work_color);
2167
2168 if (next_color != wq->flush_color) {
2169 /*
2170 * Color space is not full. The current work_color
2171 * becomes our flush_color and work_color is advanced
2172 * by one.
2173 */
2174 BUG_ON(!list_empty(&wq->flusher_overflow));
2175 this_flusher.flush_color = wq->work_color;
2176 wq->work_color = next_color;
2177
2178 if (!wq->first_flusher) {
2179 /* no flush in progress, become the first flusher */
2180 BUG_ON(wq->flush_color != this_flusher.flush_color);
2181
2182 wq->first_flusher = &this_flusher;
2183
2184 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2185 wq->work_color)) {
2186 /* nothing to flush, done */
2187 wq->flush_color = next_color;
2188 wq->first_flusher = NULL;
2189 goto out_unlock;
2190 }
2191 } else {
2192 /* wait in queue */
2193 BUG_ON(wq->flush_color == this_flusher.flush_color);
2194 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2195 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2196 }
2197 } else {
2198 /*
2199 * Oops, color space is full, wait on overflow queue.
2200 * The next flush completion will assign us
2201 * flush_color and transfer to flusher_queue.
2202 */
2203 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2204 }
2205
2206 mutex_unlock(&wq->flush_mutex);
2207
2208 wait_for_completion(&this_flusher.done);
2209
2210 /*
2211 * Wake-up-and-cascade phase
2212 *
2213 * First flushers are responsible for cascading flushes and
2214 * handling overflow. Non-first flushers can simply return.
2215 */
2216 if (wq->first_flusher != &this_flusher)
2217 return;
2218
2219 mutex_lock(&wq->flush_mutex);
2220
2221 /* we might have raced, check again with mutex held */
2222 if (wq->first_flusher != &this_flusher)
2223 goto out_unlock;
2224
2225 wq->first_flusher = NULL;
2226
2227 BUG_ON(!list_empty(&this_flusher.list));
2228 BUG_ON(wq->flush_color != this_flusher.flush_color);
2229
2230 while (true) {
2231 struct wq_flusher *next, *tmp;
2232
2233 /* complete all the flushers sharing the current flush color */
2234 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2235 if (next->flush_color != wq->flush_color)
2236 break;
2237 list_del_init(&next->list);
2238 complete(&next->done);
2239 }
2240
2241 BUG_ON(!list_empty(&wq->flusher_overflow) &&
2242 wq->flush_color != work_next_color(wq->work_color));
2243
2244 /* this flush_color is finished, advance by one */
2245 wq->flush_color = work_next_color(wq->flush_color);
2246
2247 /* one color has been freed, handle overflow queue */
2248 if (!list_empty(&wq->flusher_overflow)) {
2249 /*
2250 * Assign the same color to all overflowed
2251 * flushers, advance work_color and append to
2252 * flusher_queue. This is the start-to-wait
2253 * phase for these overflowed flushers.
2254 */
2255 list_for_each_entry(tmp, &wq->flusher_overflow, list)
2256 tmp->flush_color = wq->work_color;
2257
2258 wq->work_color = work_next_color(wq->work_color);
2259
2260 list_splice_tail_init(&wq->flusher_overflow,
2261 &wq->flusher_queue);
2262 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2263 }
2264
2265 if (list_empty(&wq->flusher_queue)) {
2266 BUG_ON(wq->flush_color != wq->work_color);
2267 break;
2268 }
2269
2270 /*
2271 * Need to flush more colors. Make the next flusher
2272 * the new first flusher and arm cwqs.
2273 */
2274 BUG_ON(wq->flush_color == wq->work_color);
2275 BUG_ON(wq->flush_color != next->flush_color);
2276
2277 list_del_init(&next->list);
2278 wq->first_flusher = next;
2279
2280 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2281 break;
2282
2283 /*
2284 * Meh... this color is already done, clear first
2285 * flusher and repeat cascading.
2286 */
2287 wq->first_flusher = NULL;
2288 }
2289
2290out_unlock:
2291 mutex_unlock(&wq->flush_mutex);
535} 2292}
536EXPORT_SYMBOL_GPL(flush_workqueue); 2293EXPORT_SYMBOL_GPL(flush_workqueue);
537 2294
@@ -547,43 +2304,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
547 */ 2304 */
548int flush_work(struct work_struct *work) 2305int flush_work(struct work_struct *work)
549{ 2306{
2307 struct worker *worker = NULL;
2308 struct global_cwq *gcwq;
550 struct cpu_workqueue_struct *cwq; 2309 struct cpu_workqueue_struct *cwq;
551 struct list_head *prev;
552 struct wq_barrier barr; 2310 struct wq_barrier barr;
553 2311
554 might_sleep(); 2312 might_sleep();
555 cwq = get_wq_data(work); 2313 gcwq = get_work_gcwq(work);
556 if (!cwq) 2314 if (!gcwq)
557 return 0; 2315 return 0;
558 2316
559 lock_map_acquire(&cwq->wq->lockdep_map); 2317 spin_lock_irq(&gcwq->lock);
560 lock_map_release(&cwq->wq->lockdep_map);
561
562 prev = NULL;
563 spin_lock_irq(&cwq->lock);
564 if (!list_empty(&work->entry)) { 2318 if (!list_empty(&work->entry)) {
565 /* 2319 /*
566 * See the comment near try_to_grab_pending()->smp_rmb(). 2320 * See the comment near try_to_grab_pending()->smp_rmb().
567 * If it was re-queued under us we are not going to wait. 2321 * If it was re-queued to a different gcwq under us, we
2322 * are not going to wait.
568 */ 2323 */
569 smp_rmb(); 2324 smp_rmb();
570 if (unlikely(cwq != get_wq_data(work))) 2325 cwq = get_work_cwq(work);
571 goto out; 2326 if (unlikely(!cwq || gcwq != cwq->gcwq))
572 prev = &work->entry; 2327 goto already_gone;
573 } else { 2328 } else {
574 if (cwq->current_work != work) 2329 worker = find_worker_executing_work(gcwq, work);
575 goto out; 2330 if (!worker)
576 prev = &cwq->worklist; 2331 goto already_gone;
2332 cwq = worker->current_cwq;
577 } 2333 }
578 insert_wq_barrier(cwq, &barr, prev->next); 2334
579out: 2335 insert_wq_barrier(cwq, &barr, work, worker);
580 spin_unlock_irq(&cwq->lock); 2336 spin_unlock_irq(&gcwq->lock);
581 if (!prev) 2337
582 return 0; 2338 lock_map_acquire(&cwq->wq->lockdep_map);
2339 lock_map_release(&cwq->wq->lockdep_map);
583 2340
584 wait_for_completion(&barr.done); 2341 wait_for_completion(&barr.done);
585 destroy_work_on_stack(&barr.work); 2342 destroy_work_on_stack(&barr.work);
586 return 1; 2343 return 1;
2344already_gone:
2345 spin_unlock_irq(&gcwq->lock);
2346 return 0;
587} 2347}
588EXPORT_SYMBOL_GPL(flush_work); 2348EXPORT_SYMBOL_GPL(flush_work);
589 2349
@@ -593,54 +2353,55 @@ EXPORT_SYMBOL_GPL(flush_work);
593 */ 2353 */
594static int try_to_grab_pending(struct work_struct *work) 2354static int try_to_grab_pending(struct work_struct *work)
595{ 2355{
596 struct cpu_workqueue_struct *cwq; 2356 struct global_cwq *gcwq;
597 int ret = -1; 2357 int ret = -1;
598 2358
599 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 2359 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
600 return 0; 2360 return 0;
601 2361
602 /* 2362 /*
603 * The queueing is in progress, or it is already queued. Try to 2363 * The queueing is in progress, or it is already queued. Try to
604 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 2364 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
605 */ 2365 */
606 2366 gcwq = get_work_gcwq(work);
607 cwq = get_wq_data(work); 2367 if (!gcwq)
608 if (!cwq)
609 return ret; 2368 return ret;
610 2369
611 spin_lock_irq(&cwq->lock); 2370 spin_lock_irq(&gcwq->lock);
612 if (!list_empty(&work->entry)) { 2371 if (!list_empty(&work->entry)) {
613 /* 2372 /*
614 * This work is queued, but perhaps we locked the wrong cwq. 2373 * This work is queued, but perhaps we locked the wrong gcwq.
615 * In that case we must see the new value after rmb(), see 2374 * In that case we must see the new value after rmb(), see
616 * insert_work()->wmb(). 2375 * insert_work()->wmb().
617 */ 2376 */
618 smp_rmb(); 2377 smp_rmb();
619 if (cwq == get_wq_data(work)) { 2378 if (gcwq == get_work_gcwq(work)) {
620 debug_work_deactivate(work); 2379 debug_work_deactivate(work);
621 list_del_init(&work->entry); 2380 list_del_init(&work->entry);
2381 cwq_dec_nr_in_flight(get_work_cwq(work),
2382 get_work_color(work));
622 ret = 1; 2383 ret = 1;
623 } 2384 }
624 } 2385 }
625 spin_unlock_irq(&cwq->lock); 2386 spin_unlock_irq(&gcwq->lock);
626 2387
627 return ret; 2388 return ret;
628} 2389}
629 2390
630static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, 2391static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
631 struct work_struct *work)
632{ 2392{
633 struct wq_barrier barr; 2393 struct wq_barrier barr;
634 int running = 0; 2394 struct worker *worker;
635 2395
636 spin_lock_irq(&cwq->lock); 2396 spin_lock_irq(&gcwq->lock);
637 if (unlikely(cwq->current_work == work)) { 2397
638 insert_wq_barrier(cwq, &barr, cwq->worklist.next); 2398 worker = find_worker_executing_work(gcwq, work);
639 running = 1; 2399 if (unlikely(worker))
640 } 2400 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
641 spin_unlock_irq(&cwq->lock);
642 2401
643 if (unlikely(running)) { 2402 spin_unlock_irq(&gcwq->lock);
2403
2404 if (unlikely(worker)) {
644 wait_for_completion(&barr.done); 2405 wait_for_completion(&barr.done);
645 destroy_work_on_stack(&barr.work); 2406 destroy_work_on_stack(&barr.work);
646 } 2407 }
@@ -648,9 +2409,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
648 2409
649static void wait_on_work(struct work_struct *work) 2410static void wait_on_work(struct work_struct *work)
650{ 2411{
651 struct cpu_workqueue_struct *cwq;
652 struct workqueue_struct *wq;
653 const struct cpumask *cpu_map;
654 int cpu; 2412 int cpu;
655 2413
656 might_sleep(); 2414 might_sleep();
@@ -658,15 +2416,8 @@ static void wait_on_work(struct work_struct *work)
658 lock_map_acquire(&work->lockdep_map); 2416 lock_map_acquire(&work->lockdep_map);
659 lock_map_release(&work->lockdep_map); 2417 lock_map_release(&work->lockdep_map);
660 2418
661 cwq = get_wq_data(work); 2419 for_each_gcwq_cpu(cpu)
662 if (!cwq) 2420 wait_on_cpu_work(get_gcwq(cpu), work);
663 return;
664
665 wq = cwq->wq;
666 cpu_map = wq_cpu_map(wq);
667
668 for_each_cpu(cpu, cpu_map)
669 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
670} 2421}
671 2422
672static int __cancel_work_timer(struct work_struct *work, 2423static int __cancel_work_timer(struct work_struct *work,
@@ -681,7 +2432,7 @@ static int __cancel_work_timer(struct work_struct *work,
681 wait_on_work(work); 2432 wait_on_work(work);
682 } while (unlikely(ret < 0)); 2433 } while (unlikely(ret < 0));
683 2434
684 clear_wq_data(work); 2435 clear_work_data(work);
685 return ret; 2436 return ret;
686} 2437}
687 2438
@@ -727,8 +2478,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork)
727} 2478}
728EXPORT_SYMBOL(cancel_delayed_work_sync); 2479EXPORT_SYMBOL(cancel_delayed_work_sync);
729 2480
730static struct workqueue_struct *keventd_wq __read_mostly;
731
732/** 2481/**
733 * schedule_work - put work task in global workqueue 2482 * schedule_work - put work task in global workqueue
734 * @work: job to be done 2483 * @work: job to be done
@@ -742,7 +2491,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
742 */ 2491 */
743int schedule_work(struct work_struct *work) 2492int schedule_work(struct work_struct *work)
744{ 2493{
745 return queue_work(keventd_wq, work); 2494 return queue_work(system_wq, work);
746} 2495}
747EXPORT_SYMBOL(schedule_work); 2496EXPORT_SYMBOL(schedule_work);
748 2497
@@ -755,7 +2504,7 @@ EXPORT_SYMBOL(schedule_work);
755 */ 2504 */
756int schedule_work_on(int cpu, struct work_struct *work) 2505int schedule_work_on(int cpu, struct work_struct *work)
757{ 2506{
758 return queue_work_on(cpu, keventd_wq, work); 2507 return queue_work_on(cpu, system_wq, work);
759} 2508}
760EXPORT_SYMBOL(schedule_work_on); 2509EXPORT_SYMBOL(schedule_work_on);
761 2510
@@ -770,7 +2519,7 @@ EXPORT_SYMBOL(schedule_work_on);
770int schedule_delayed_work(struct delayed_work *dwork, 2519int schedule_delayed_work(struct delayed_work *dwork,
771 unsigned long delay) 2520 unsigned long delay)
772{ 2521{
773 return queue_delayed_work(keventd_wq, dwork, delay); 2522 return queue_delayed_work(system_wq, dwork, delay);
774} 2523}
775EXPORT_SYMBOL(schedule_delayed_work); 2524EXPORT_SYMBOL(schedule_delayed_work);
776 2525
@@ -783,9 +2532,8 @@ EXPORT_SYMBOL(schedule_delayed_work);
783void flush_delayed_work(struct delayed_work *dwork) 2532void flush_delayed_work(struct delayed_work *dwork)
784{ 2533{
785 if (del_timer_sync(&dwork->timer)) { 2534 if (del_timer_sync(&dwork->timer)) {
786 struct cpu_workqueue_struct *cwq; 2535 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); 2536 &dwork->work);
788 __queue_work(cwq, &dwork->work);
789 put_cpu(); 2537 put_cpu();
790 } 2538 }
791 flush_work(&dwork->work); 2539 flush_work(&dwork->work);
@@ -804,7 +2552,7 @@ EXPORT_SYMBOL(flush_delayed_work);
804int schedule_delayed_work_on(int cpu, 2552int schedule_delayed_work_on(int cpu,
805 struct delayed_work *dwork, unsigned long delay) 2553 struct delayed_work *dwork, unsigned long delay)
806{ 2554{
807 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 2555 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
808} 2556}
809EXPORT_SYMBOL(schedule_delayed_work_on); 2557EXPORT_SYMBOL(schedule_delayed_work_on);
810 2558
@@ -820,8 +2568,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
820int schedule_on_each_cpu(work_func_t func) 2568int schedule_on_each_cpu(work_func_t func)
821{ 2569{
822 int cpu; 2570 int cpu;
823 int orig = -1; 2571 struct work_struct __percpu *works;
824 struct work_struct *works;
825 2572
826 works = alloc_percpu(struct work_struct); 2573 works = alloc_percpu(struct work_struct);
827 if (!works) 2574 if (!works)
@@ -829,23 +2576,12 @@ int schedule_on_each_cpu(work_func_t func)
829 2576
830 get_online_cpus(); 2577 get_online_cpus();
831 2578
832 /*
833 * When running in keventd don't schedule a work item on
834 * itself. Can just call directly because the work queue is
835 * already bound. This also is faster.
836 */
837 if (current_is_keventd())
838 orig = raw_smp_processor_id();
839
840 for_each_online_cpu(cpu) { 2579 for_each_online_cpu(cpu) {
841 struct work_struct *work = per_cpu_ptr(works, cpu); 2580 struct work_struct *work = per_cpu_ptr(works, cpu);
842 2581
843 INIT_WORK(work, func); 2582 INIT_WORK(work, func);
844 if (cpu != orig) 2583 schedule_work_on(cpu, work);
845 schedule_work_on(cpu, work);
846 } 2584 }
847 if (orig >= 0)
848 func(per_cpu_ptr(works, orig));
849 2585
850 for_each_online_cpu(cpu) 2586 for_each_online_cpu(cpu)
851 flush_work(per_cpu_ptr(works, cpu)); 2587 flush_work(per_cpu_ptr(works, cpu));
@@ -881,7 +2617,7 @@ int schedule_on_each_cpu(work_func_t func)
881 */ 2617 */
882void flush_scheduled_work(void) 2618void flush_scheduled_work(void)
883{ 2619{
884 flush_workqueue(keventd_wq); 2620 flush_workqueue(system_wq);
885} 2621}
886EXPORT_SYMBOL(flush_scheduled_work); 2622EXPORT_SYMBOL(flush_scheduled_work);
887 2623
@@ -913,170 +2649,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
913 2649
914int keventd_up(void) 2650int keventd_up(void)
915{ 2651{
916 return keventd_wq != NULL; 2652 return system_wq != NULL;
917} 2653}
918 2654
919int current_is_keventd(void) 2655static int alloc_cwqs(struct workqueue_struct *wq)
920{ 2656{
921 struct cpu_workqueue_struct *cwq; 2657 /*
922 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 2658 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
923 int ret = 0; 2659 * Make sure that the alignment isn't lower than that of
924 2660 * unsigned long long.
925 BUG_ON(!keventd_wq); 2661 */
2662 const size_t size = sizeof(struct cpu_workqueue_struct);
2663 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2664 __alignof__(unsigned long long));
2665#ifdef CONFIG_SMP
2666 bool percpu = !(wq->flags & WQ_UNBOUND);
2667#else
2668 bool percpu = false;
2669#endif
926 2670
927 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); 2671 if (percpu)
928 if (current == cwq->thread) 2672 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
929 ret = 1; 2673 else {
2674 void *ptr;
930 2675
931 return ret; 2676 /*
2677 * Allocate enough room to align cwq and put an extra
2678 * pointer at the end pointing back to the originally
2679 * allocated pointer which will be used for free.
2680 */
2681 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2682 if (ptr) {
2683 wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2684 *(void **)(wq->cpu_wq.single + 1) = ptr;
2685 }
2686 }
932 2687
2688 /* just in case, make sure it's actually aligned */
2689 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2690 return wq->cpu_wq.v ? 0 : -ENOMEM;
933} 2691}
934 2692
935static struct cpu_workqueue_struct * 2693static void free_cwqs(struct workqueue_struct *wq)
936init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
937{ 2694{
938 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 2695#ifdef CONFIG_SMP
939 2696 bool percpu = !(wq->flags & WQ_UNBOUND);
940 cwq->wq = wq; 2697#else
941 spin_lock_init(&cwq->lock); 2698 bool percpu = false;
942 INIT_LIST_HEAD(&cwq->worklist); 2699#endif
943 init_waitqueue_head(&cwq->more_work);
944 2700
945 return cwq; 2701 if (percpu)
2702 free_percpu(wq->cpu_wq.pcpu);
2703 else if (wq->cpu_wq.single) {
2704 /* the pointer to free is stored right after the cwq */
2705 kfree(*(void **)(wq->cpu_wq.single + 1));
2706 }
946} 2707}
947 2708
948static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2709static int wq_clamp_max_active(int max_active, unsigned int flags,
2710 const char *name)
949{ 2711{
950 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 2712 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
951 struct workqueue_struct *wq = cwq->wq;
952 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
953 struct task_struct *p;
954 2713
955 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 2714 if (max_active < 1 || max_active > lim)
956 /* 2715 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
957 * Nobody can add the work_struct to this cwq, 2716 "is out of range, clamping between %d and %d\n",
958 * if (caller is __create_workqueue) 2717 max_active, name, 1, lim);
959 * nobody should see this wq
960 * else // caller is CPU_UP_PREPARE
961 * cpu is not on cpu_online_map
962 * so we can abort safely.
963 */
964 if (IS_ERR(p))
965 return PTR_ERR(p);
966 if (cwq->wq->rt)
967 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
968 cwq->thread = p;
969 2718
970 trace_workqueue_creation(cwq->thread, cpu); 2719 return clamp_val(max_active, 1, lim);
971
972 return 0;
973} 2720}
974 2721
975static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2722struct workqueue_struct *__alloc_workqueue_key(const char *name,
2723 unsigned int flags,
2724 int max_active,
2725 struct lock_class_key *key,
2726 const char *lock_name)
976{ 2727{
977 struct task_struct *p = cwq->thread; 2728 struct workqueue_struct *wq;
2729 unsigned int cpu;
978 2730
979 if (p != NULL) { 2731 /*
980 if (cpu >= 0) 2732 * Unbound workqueues aren't concurrency managed and should be
981 kthread_bind(p, cpu); 2733 * dispatched to workers immediately.
982 wake_up_process(p); 2734 */
983 } 2735 if (flags & WQ_UNBOUND)
984} 2736 flags |= WQ_HIGHPRI;
985 2737
986struct workqueue_struct *__create_workqueue_key(const char *name, 2738 max_active = max_active ?: WQ_DFL_ACTIVE;
987 int singlethread, 2739 max_active = wq_clamp_max_active(max_active, flags, name);
988 int freezeable,
989 int rt,
990 struct lock_class_key *key,
991 const char *lock_name)
992{
993 struct workqueue_struct *wq;
994 struct cpu_workqueue_struct *cwq;
995 int err = 0, cpu;
996 2740
997 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 2741 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
998 if (!wq) 2742 if (!wq)
999 return NULL; 2743 goto err;
1000 2744
1001 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 2745 wq->flags = flags;
1002 if (!wq->cpu_wq) { 2746 wq->saved_max_active = max_active;
1003 kfree(wq); 2747 mutex_init(&wq->flush_mutex);
1004 return NULL; 2748 atomic_set(&wq->nr_cwqs_to_flush, 0);
1005 } 2749 INIT_LIST_HEAD(&wq->flusher_queue);
2750 INIT_LIST_HEAD(&wq->flusher_overflow);
1006 2751
1007 wq->name = name; 2752 wq->name = name;
1008 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2753 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
1009 wq->singlethread = singlethread;
1010 wq->freezeable = freezeable;
1011 wq->rt = rt;
1012 INIT_LIST_HEAD(&wq->list); 2754 INIT_LIST_HEAD(&wq->list);
1013 2755
1014 if (singlethread) { 2756 if (alloc_cwqs(wq) < 0)
1015 cwq = init_cpu_workqueue(wq, singlethread_cpu); 2757 goto err;
1016 err = create_workqueue_thread(cwq, singlethread_cpu); 2758
1017 start_workqueue_thread(cwq, -1); 2759 for_each_cwq_cpu(cpu, wq) {
1018 } else { 2760 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1019 cpu_maps_update_begin(); 2761 struct global_cwq *gcwq = get_gcwq(cpu);
1020 /* 2762
1021 * We must place this wq on list even if the code below fails. 2763 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
1022 * cpu_down(cpu) can remove cpu from cpu_populated_map before 2764 cwq->gcwq = gcwq;
1023 * destroy_workqueue() takes the lock, in that case we leak 2765 cwq->wq = wq;
1024 * cwq[cpu]->thread. 2766 cwq->flush_color = -1;
1025 */ 2767 cwq->max_active = max_active;
1026 spin_lock(&workqueue_lock); 2768 INIT_LIST_HEAD(&cwq->delayed_works);
1027 list_add(&wq->list, &workqueues);
1028 spin_unlock(&workqueue_lock);
1029 /*
1030 * We must initialize cwqs for each possible cpu even if we
1031 * are going to call destroy_workqueue() finally. Otherwise
1032 * cpu_up() can hit the uninitialized cwq once we drop the
1033 * lock.
1034 */
1035 for_each_possible_cpu(cpu) {
1036 cwq = init_cpu_workqueue(wq, cpu);
1037 if (err || !cpu_online(cpu))
1038 continue;
1039 err = create_workqueue_thread(cwq, cpu);
1040 start_workqueue_thread(cwq, cpu);
1041 }
1042 cpu_maps_update_done();
1043 } 2769 }
1044 2770
1045 if (err) { 2771 if (flags & WQ_RESCUER) {
1046 destroy_workqueue(wq); 2772 struct worker *rescuer;
1047 wq = NULL; 2773
2774 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2775 goto err;
2776
2777 wq->rescuer = rescuer = alloc_worker();
2778 if (!rescuer)
2779 goto err;
2780
2781 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2782 if (IS_ERR(rescuer->task))
2783 goto err;
2784
2785 wq->rescuer = rescuer;
2786 rescuer->task->flags |= PF_THREAD_BOUND;
2787 wake_up_process(rescuer->task);
1048 } 2788 }
1049 return wq;
1050}
1051EXPORT_SYMBOL_GPL(__create_workqueue_key);
1052 2789
1053static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1054{
1055 /* 2790 /*
1056 * Our caller is either destroy_workqueue() or CPU_POST_DEAD, 2791 * workqueue_lock protects global freeze state and workqueues
1057 * cpu_add_remove_lock protects cwq->thread. 2792 * list. Grab it, set max_active accordingly and add the new
2793 * workqueue to workqueues list.
1058 */ 2794 */
1059 if (cwq->thread == NULL) 2795 spin_lock(&workqueue_lock);
1060 return;
1061 2796
1062 lock_map_acquire(&cwq->wq->lockdep_map); 2797 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
1063 lock_map_release(&cwq->wq->lockdep_map); 2798 for_each_cwq_cpu(cpu, wq)
2799 get_cwq(cpu, wq)->max_active = 0;
1064 2800
1065 flush_cpu_workqueue(cwq); 2801 list_add(&wq->list, &workqueues);
1066 /* 2802
1067 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, 2803 spin_unlock(&workqueue_lock);
1068 * a concurrent flush_workqueue() can insert a barrier after us. 2804
1069 * However, in that case run_workqueue() won't return and check 2805 return wq;
1070 * kthread_should_stop() until it flushes all work_struct's. 2806err:
1071 * When ->worklist becomes empty it is safe to exit because no 2807 if (wq) {
1072 * more work_structs can be queued on this cwq: flush_workqueue 2808 free_cwqs(wq);
1073 * checks list_empty(), and a "normal" queue_work() can't use 2809 free_mayday_mask(wq->mayday_mask);
1074 * a dead CPU. 2810 kfree(wq->rescuer);
1075 */ 2811 kfree(wq);
1076 trace_workqueue_destruction(cwq->thread); 2812 }
1077 kthread_stop(cwq->thread); 2813 return NULL;
1078 cwq->thread = NULL;
1079} 2814}
2815EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
1080 2816
1081/** 2817/**
1082 * destroy_workqueue - safely terminate a workqueue 2818 * destroy_workqueue - safely terminate a workqueue
@@ -1086,72 +2822,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1086 */ 2822 */
1087void destroy_workqueue(struct workqueue_struct *wq) 2823void destroy_workqueue(struct workqueue_struct *wq)
1088{ 2824{
1089 const struct cpumask *cpu_map = wq_cpu_map(wq); 2825 unsigned int cpu;
1090 int cpu;
1091 2826
1092 cpu_maps_update_begin(); 2827 flush_workqueue(wq);
2828
2829 /*
2830 * wq list is used to freeze wq, remove from list after
2831 * flushing is complete in case freeze races us.
2832 */
1093 spin_lock(&workqueue_lock); 2833 spin_lock(&workqueue_lock);
1094 list_del(&wq->list); 2834 list_del(&wq->list);
1095 spin_unlock(&workqueue_lock); 2835 spin_unlock(&workqueue_lock);
1096 2836
1097 for_each_cpu(cpu, cpu_map) 2837 /* sanity check */
1098 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 2838 for_each_cwq_cpu(cpu, wq) {
1099 cpu_maps_update_done(); 2839 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2840 int i;
2841
2842 for (i = 0; i < WORK_NR_COLORS; i++)
2843 BUG_ON(cwq->nr_in_flight[i]);
2844 BUG_ON(cwq->nr_active);
2845 BUG_ON(!list_empty(&cwq->delayed_works));
2846 }
2847
2848 if (wq->flags & WQ_RESCUER) {
2849 kthread_stop(wq->rescuer->task);
2850 free_mayday_mask(wq->mayday_mask);
2851 }
1100 2852
1101 free_percpu(wq->cpu_wq); 2853 free_cwqs(wq);
1102 kfree(wq); 2854 kfree(wq);
1103} 2855}
1104EXPORT_SYMBOL_GPL(destroy_workqueue); 2856EXPORT_SYMBOL_GPL(destroy_workqueue);
1105 2857
2858/**
2859 * workqueue_set_max_active - adjust max_active of a workqueue
2860 * @wq: target workqueue
2861 * @max_active: new max_active value.
2862 *
2863 * Set max_active of @wq to @max_active.
2864 *
2865 * CONTEXT:
2866 * Don't call from IRQ context.
2867 */
2868void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2869{
2870 unsigned int cpu;
2871
2872 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
2873
2874 spin_lock(&workqueue_lock);
2875
2876 wq->saved_max_active = max_active;
2877
2878 for_each_cwq_cpu(cpu, wq) {
2879 struct global_cwq *gcwq = get_gcwq(cpu);
2880
2881 spin_lock_irq(&gcwq->lock);
2882
2883 if (!(wq->flags & WQ_FREEZEABLE) ||
2884 !(gcwq->flags & GCWQ_FREEZING))
2885 get_cwq(gcwq->cpu, wq)->max_active = max_active;
2886
2887 spin_unlock_irq(&gcwq->lock);
2888 }
2889
2890 spin_unlock(&workqueue_lock);
2891}
2892EXPORT_SYMBOL_GPL(workqueue_set_max_active);
2893
2894/**
2895 * workqueue_congested - test whether a workqueue is congested
2896 * @cpu: CPU in question
2897 * @wq: target workqueue
2898 *
2899 * Test whether @wq's cpu workqueue for @cpu is congested. There is
2900 * no synchronization around this function and the test result is
2901 * unreliable and only useful as advisory hints or for debugging.
2902 *
2903 * RETURNS:
2904 * %true if congested, %false otherwise.
2905 */
2906bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
2907{
2908 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2909
2910 return !list_empty(&cwq->delayed_works);
2911}
2912EXPORT_SYMBOL_GPL(workqueue_congested);
2913
2914/**
2915 * work_cpu - return the last known associated cpu for @work
2916 * @work: the work of interest
2917 *
2918 * RETURNS:
2919 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
2920 */
2921unsigned int work_cpu(struct work_struct *work)
2922{
2923 struct global_cwq *gcwq = get_work_gcwq(work);
2924
2925 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
2926}
2927EXPORT_SYMBOL_GPL(work_cpu);
2928
2929/**
2930 * work_busy - test whether a work is currently pending or running
2931 * @work: the work to be tested
2932 *
2933 * Test whether @work is currently pending or running. There is no
2934 * synchronization around this function and the test result is
2935 * unreliable and only useful as advisory hints or for debugging.
2936 * Especially for reentrant wqs, the pending state might hide the
2937 * running state.
2938 *
2939 * RETURNS:
2940 * OR'd bitmask of WORK_BUSY_* bits.
2941 */
2942unsigned int work_busy(struct work_struct *work)
2943{
2944 struct global_cwq *gcwq = get_work_gcwq(work);
2945 unsigned long flags;
2946 unsigned int ret = 0;
2947
2948 if (!gcwq)
2949 return false;
2950
2951 spin_lock_irqsave(&gcwq->lock, flags);
2952
2953 if (work_pending(work))
2954 ret |= WORK_BUSY_PENDING;
2955 if (find_worker_executing_work(gcwq, work))
2956 ret |= WORK_BUSY_RUNNING;
2957
2958 spin_unlock_irqrestore(&gcwq->lock, flags);
2959
2960 return ret;
2961}
2962EXPORT_SYMBOL_GPL(work_busy);
2963
2964/*
2965 * CPU hotplug.
2966 *
2967 * There are two challenges in supporting CPU hotplug. Firstly, there
2968 * are a lot of assumptions on strong associations among work, cwq and
2969 * gcwq which make migrating pending and scheduled works very
2970 * difficult to implement without impacting hot paths. Secondly,
2971 * gcwqs serve mix of short, long and very long running works making
2972 * blocked draining impractical.
2973 *
2974 * This is solved by allowing a gcwq to be detached from CPU, running
2975 * it with unbound (rogue) workers and allowing it to be reattached
2976 * later if the cpu comes back online. A separate thread is created
2977 * to govern a gcwq in such state and is called the trustee of the
2978 * gcwq.
2979 *
2980 * Trustee states and their descriptions.
2981 *
2982 * START Command state used on startup. On CPU_DOWN_PREPARE, a
2983 * new trustee is started with this state.
2984 *
2985 * IN_CHARGE Once started, trustee will enter this state after
2986 * assuming the manager role and making all existing
2987 * workers rogue. DOWN_PREPARE waits for trustee to
2988 * enter this state. After reaching IN_CHARGE, trustee
2989 * tries to execute the pending worklist until it's empty
2990 * and the state is set to BUTCHER, or the state is set
2991 * to RELEASE.
2992 *
2993 * BUTCHER Command state which is set by the cpu callback after
2994 * the cpu has went down. Once this state is set trustee
2995 * knows that there will be no new works on the worklist
2996 * and once the worklist is empty it can proceed to
2997 * killing idle workers.
2998 *
2999 * RELEASE Command state which is set by the cpu callback if the
3000 * cpu down has been canceled or it has come online
3001 * again. After recognizing this state, trustee stops
3002 * trying to drain or butcher and clears ROGUE, rebinds
3003 * all remaining workers back to the cpu and releases
3004 * manager role.
3005 *
3006 * DONE Trustee will enter this state after BUTCHER or RELEASE
3007 * is complete.
3008 *
3009 * trustee CPU draining
3010 * took over down complete
3011 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3012 * | | ^
3013 * | CPU is back online v return workers |
3014 * ----------------> RELEASE --------------
3015 */
3016
3017/**
3018 * trustee_wait_event_timeout - timed event wait for trustee
3019 * @cond: condition to wait for
3020 * @timeout: timeout in jiffies
3021 *
3022 * wait_event_timeout() for trustee to use. Handles locking and
3023 * checks for RELEASE request.
3024 *
3025 * CONTEXT:
3026 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3027 * multiple times. To be used by trustee.
3028 *
3029 * RETURNS:
3030 * Positive indicating left time if @cond is satisfied, 0 if timed
3031 * out, -1 if canceled.
3032 */
3033#define trustee_wait_event_timeout(cond, timeout) ({ \
3034 long __ret = (timeout); \
3035 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3036 __ret) { \
3037 spin_unlock_irq(&gcwq->lock); \
3038 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3039 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3040 __ret); \
3041 spin_lock_irq(&gcwq->lock); \
3042 } \
3043 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3044})
3045
3046/**
3047 * trustee_wait_event - event wait for trustee
3048 * @cond: condition to wait for
3049 *
3050 * wait_event() for trustee to use. Automatically handles locking and
3051 * checks for CANCEL request.
3052 *
3053 * CONTEXT:
3054 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3055 * multiple times. To be used by trustee.
3056 *
3057 * RETURNS:
3058 * 0 if @cond is satisfied, -1 if canceled.
3059 */
3060#define trustee_wait_event(cond) ({ \
3061 long __ret1; \
3062 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3063 __ret1 < 0 ? -1 : 0; \
3064})
3065
3066static int __cpuinit trustee_thread(void *__gcwq)
3067{
3068 struct global_cwq *gcwq = __gcwq;
3069 struct worker *worker;
3070 struct work_struct *work;
3071 struct hlist_node *pos;
3072 long rc;
3073 int i;
3074
3075 BUG_ON(gcwq->cpu != smp_processor_id());
3076
3077 spin_lock_irq(&gcwq->lock);
3078 /*
3079 * Claim the manager position and make all workers rogue.
3080 * Trustee must be bound to the target cpu and can't be
3081 * cancelled.
3082 */
3083 BUG_ON(gcwq->cpu != smp_processor_id());
3084 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3085 BUG_ON(rc < 0);
3086
3087 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3088
3089 list_for_each_entry(worker, &gcwq->idle_list, entry)
3090 worker->flags |= WORKER_ROGUE;
3091
3092 for_each_busy_worker(worker, i, pos, gcwq)
3093 worker->flags |= WORKER_ROGUE;
3094
3095 /*
3096 * Call schedule() so that we cross rq->lock and thus can
3097 * guarantee sched callbacks see the rogue flag. This is
3098 * necessary as scheduler callbacks may be invoked from other
3099 * cpus.
3100 */
3101 spin_unlock_irq(&gcwq->lock);
3102 schedule();
3103 spin_lock_irq(&gcwq->lock);
3104
3105 /*
3106 * Sched callbacks are disabled now. Zap nr_running. After
3107 * this, nr_running stays zero and need_more_worker() and
3108 * keep_working() are always true as long as the worklist is
3109 * not empty.
3110 */
3111 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3112
3113 spin_unlock_irq(&gcwq->lock);
3114 del_timer_sync(&gcwq->idle_timer);
3115 spin_lock_irq(&gcwq->lock);
3116
3117 /*
3118 * We're now in charge. Notify and proceed to drain. We need
3119 * to keep the gcwq running during the whole CPU down
3120 * procedure as other cpu hotunplug callbacks may need to
3121 * flush currently running tasks.
3122 */
3123 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3124 wake_up_all(&gcwq->trustee_wait);
3125
3126 /*
3127 * The original cpu is in the process of dying and may go away
3128 * anytime now. When that happens, we and all workers would
3129 * be migrated to other cpus. Try draining any left work. We
3130 * want to get it over with ASAP - spam rescuers, wake up as
3131 * many idlers as necessary and create new ones till the
3132 * worklist is empty. Note that if the gcwq is frozen, there
3133 * may be frozen works in freezeable cwqs. Don't declare
3134 * completion while frozen.
3135 */
3136 while (gcwq->nr_workers != gcwq->nr_idle ||
3137 gcwq->flags & GCWQ_FREEZING ||
3138 gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3139 int nr_works = 0;
3140
3141 list_for_each_entry(work, &gcwq->worklist, entry) {
3142 send_mayday(work);
3143 nr_works++;
3144 }
3145
3146 list_for_each_entry(worker, &gcwq->idle_list, entry) {
3147 if (!nr_works--)
3148 break;
3149 wake_up_process(worker->task);
3150 }
3151
3152 if (need_to_create_worker(gcwq)) {
3153 spin_unlock_irq(&gcwq->lock);
3154 worker = create_worker(gcwq, false);
3155 spin_lock_irq(&gcwq->lock);
3156 if (worker) {
3157 worker->flags |= WORKER_ROGUE;
3158 start_worker(worker);
3159 }
3160 }
3161
3162 /* give a breather */
3163 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3164 break;
3165 }
3166
3167 /*
3168 * Either all works have been scheduled and cpu is down, or
3169 * cpu down has already been canceled. Wait for and butcher
3170 * all workers till we're canceled.
3171 */
3172 do {
3173 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3174 while (!list_empty(&gcwq->idle_list))
3175 destroy_worker(list_first_entry(&gcwq->idle_list,
3176 struct worker, entry));
3177 } while (gcwq->nr_workers && rc >= 0);
3178
3179 /*
3180 * At this point, either draining has completed and no worker
3181 * is left, or cpu down has been canceled or the cpu is being
3182 * brought back up. There shouldn't be any idle one left.
3183 * Tell the remaining busy ones to rebind once it finishes the
3184 * currently scheduled works by scheduling the rebind_work.
3185 */
3186 WARN_ON(!list_empty(&gcwq->idle_list));
3187
3188 for_each_busy_worker(worker, i, pos, gcwq) {
3189 struct work_struct *rebind_work = &worker->rebind_work;
3190
3191 /*
3192 * Rebind_work may race with future cpu hotplug
3193 * operations. Use a separate flag to mark that
3194 * rebinding is scheduled.
3195 */
3196 worker->flags |= WORKER_REBIND;
3197 worker->flags &= ~WORKER_ROGUE;
3198
3199 /* queue rebind_work, wq doesn't matter, use the default one */
3200 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3201 work_data_bits(rebind_work)))
3202 continue;
3203
3204 debug_work_activate(rebind_work);
3205 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3206 worker->scheduled.next,
3207 work_color_to_flags(WORK_NO_COLOR));
3208 }
3209
3210 /* relinquish manager role */
3211 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3212
3213 /* notify completion */
3214 gcwq->trustee = NULL;
3215 gcwq->trustee_state = TRUSTEE_DONE;
3216 wake_up_all(&gcwq->trustee_wait);
3217 spin_unlock_irq(&gcwq->lock);
3218 return 0;
3219}
3220
3221/**
3222 * wait_trustee_state - wait for trustee to enter the specified state
3223 * @gcwq: gcwq the trustee of interest belongs to
3224 * @state: target state to wait for
3225 *
3226 * Wait for the trustee to reach @state. DONE is already matched.
3227 *
3228 * CONTEXT:
3229 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3230 * multiple times. To be used by cpu_callback.
3231 */
3232static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3233{
3234 if (!(gcwq->trustee_state == state ||
3235 gcwq->trustee_state == TRUSTEE_DONE)) {
3236 spin_unlock_irq(&gcwq->lock);
3237 __wait_event(gcwq->trustee_wait,
3238 gcwq->trustee_state == state ||
3239 gcwq->trustee_state == TRUSTEE_DONE);
3240 spin_lock_irq(&gcwq->lock);
3241 }
3242}
3243
1106static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 3244static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1107 unsigned long action, 3245 unsigned long action,
1108 void *hcpu) 3246 void *hcpu)
1109{ 3247{
1110 unsigned int cpu = (unsigned long)hcpu; 3248 unsigned int cpu = (unsigned long)hcpu;
1111 struct cpu_workqueue_struct *cwq; 3249 struct global_cwq *gcwq = get_gcwq(cpu);
1112 struct workqueue_struct *wq; 3250 struct task_struct *new_trustee = NULL;
1113 int err = 0; 3251 struct worker *uninitialized_var(new_worker);
3252 unsigned long flags;
1114 3253
1115 action &= ~CPU_TASKS_FROZEN; 3254 action &= ~CPU_TASKS_FROZEN;
1116 3255
1117 switch (action) { 3256 switch (action) {
3257 case CPU_DOWN_PREPARE:
3258 new_trustee = kthread_create(trustee_thread, gcwq,
3259 "workqueue_trustee/%d\n", cpu);
3260 if (IS_ERR(new_trustee))
3261 return notifier_from_errno(PTR_ERR(new_trustee));
3262 kthread_bind(new_trustee, cpu);
3263 /* fall through */
1118 case CPU_UP_PREPARE: 3264 case CPU_UP_PREPARE:
1119 cpumask_set_cpu(cpu, cpu_populated_map); 3265 BUG_ON(gcwq->first_idle);
1120 } 3266 new_worker = create_worker(gcwq, false);
1121undo: 3267 if (!new_worker) {
1122 list_for_each_entry(wq, &workqueues, list) { 3268 if (new_trustee)
1123 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 3269 kthread_stop(new_trustee);
1124 3270 return NOTIFY_BAD;
1125 switch (action) {
1126 case CPU_UP_PREPARE:
1127 err = create_workqueue_thread(cwq, cpu);
1128 if (!err)
1129 break;
1130 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1131 wq->name, cpu);
1132 action = CPU_UP_CANCELED;
1133 err = -ENOMEM;
1134 goto undo;
1135
1136 case CPU_ONLINE:
1137 start_workqueue_thread(cwq, cpu);
1138 break;
1139
1140 case CPU_UP_CANCELED:
1141 start_workqueue_thread(cwq, -1);
1142 case CPU_POST_DEAD:
1143 cleanup_workqueue_thread(cwq);
1144 break;
1145 } 3271 }
1146 } 3272 }
1147 3273
3274 /* some are called w/ irq disabled, don't disturb irq status */
3275 spin_lock_irqsave(&gcwq->lock, flags);
3276
1148 switch (action) { 3277 switch (action) {
1149 case CPU_UP_CANCELED: 3278 case CPU_DOWN_PREPARE:
3279 /* initialize trustee and tell it to acquire the gcwq */
3280 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3281 gcwq->trustee = new_trustee;
3282 gcwq->trustee_state = TRUSTEE_START;
3283 wake_up_process(gcwq->trustee);
3284 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3285 /* fall through */
3286 case CPU_UP_PREPARE:
3287 BUG_ON(gcwq->first_idle);
3288 gcwq->first_idle = new_worker;
3289 break;
3290
3291 case CPU_DYING:
3292 /*
3293 * Before this, the trustee and all workers except for
3294 * the ones which are still executing works from
3295 * before the last CPU down must be on the cpu. After
3296 * this, they'll all be diasporas.
3297 */
3298 gcwq->flags |= GCWQ_DISASSOCIATED;
3299 break;
3300
1150 case CPU_POST_DEAD: 3301 case CPU_POST_DEAD:
1151 cpumask_clear_cpu(cpu, cpu_populated_map); 3302 gcwq->trustee_state = TRUSTEE_BUTCHER;
3303 /* fall through */
3304 case CPU_UP_CANCELED:
3305 destroy_worker(gcwq->first_idle);
3306 gcwq->first_idle = NULL;
3307 break;
3308
3309 case CPU_DOWN_FAILED:
3310 case CPU_ONLINE:
3311 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3312 if (gcwq->trustee_state != TRUSTEE_DONE) {
3313 gcwq->trustee_state = TRUSTEE_RELEASE;
3314 wake_up_process(gcwq->trustee);
3315 wait_trustee_state(gcwq, TRUSTEE_DONE);
3316 }
3317
3318 /*
3319 * Trustee is done and there might be no worker left.
3320 * Put the first_idle in and request a real manager to
3321 * take a look.
3322 */
3323 spin_unlock_irq(&gcwq->lock);
3324 kthread_bind(gcwq->first_idle->task, cpu);
3325 spin_lock_irq(&gcwq->lock);
3326 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3327 start_worker(gcwq->first_idle);
3328 gcwq->first_idle = NULL;
3329 break;
1152 } 3330 }
1153 3331
1154 return notifier_from_errno(err); 3332 spin_unlock_irqrestore(&gcwq->lock, flags);
3333
3334 return notifier_from_errno(0);
1155} 3335}
1156 3336
1157#ifdef CONFIG_SMP 3337#ifdef CONFIG_SMP
@@ -1201,14 +3381,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1201EXPORT_SYMBOL_GPL(work_on_cpu); 3381EXPORT_SYMBOL_GPL(work_on_cpu);
1202#endif /* CONFIG_SMP */ 3382#endif /* CONFIG_SMP */
1203 3383
1204void __init init_workqueues(void) 3384#ifdef CONFIG_FREEZER
3385
3386/**
3387 * freeze_workqueues_begin - begin freezing workqueues
3388 *
3389 * Start freezing workqueues. After this function returns, all
3390 * freezeable workqueues will queue new works to their frozen_works
3391 * list instead of gcwq->worklist.
3392 *
3393 * CONTEXT:
3394 * Grabs and releases workqueue_lock and gcwq->lock's.
3395 */
3396void freeze_workqueues_begin(void)
1205{ 3397{
1206 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); 3398 unsigned int cpu;
1207 3399
1208 cpumask_copy(cpu_populated_map, cpu_online_mask); 3400 spin_lock(&workqueue_lock);
1209 singlethread_cpu = cpumask_first(cpu_possible_mask); 3401
1210 cpu_singlethread_map = cpumask_of(singlethread_cpu); 3402 BUG_ON(workqueue_freezing);
1211 hotcpu_notifier(workqueue_cpu_callback, 0); 3403 workqueue_freezing = true;
1212 keventd_wq = create_workqueue("events"); 3404
1213 BUG_ON(!keventd_wq); 3405 for_each_gcwq_cpu(cpu) {
3406 struct global_cwq *gcwq = get_gcwq(cpu);
3407 struct workqueue_struct *wq;
3408
3409 spin_lock_irq(&gcwq->lock);
3410
3411 BUG_ON(gcwq->flags & GCWQ_FREEZING);
3412 gcwq->flags |= GCWQ_FREEZING;
3413
3414 list_for_each_entry(wq, &workqueues, list) {
3415 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3416
3417 if (cwq && wq->flags & WQ_FREEZEABLE)
3418 cwq->max_active = 0;
3419 }
3420
3421 spin_unlock_irq(&gcwq->lock);
3422 }
3423
3424 spin_unlock(&workqueue_lock);
3425}
3426
3427/**
3428 * freeze_workqueues_busy - are freezeable workqueues still busy?
3429 *
3430 * Check whether freezing is complete. This function must be called
3431 * between freeze_workqueues_begin() and thaw_workqueues().
3432 *
3433 * CONTEXT:
3434 * Grabs and releases workqueue_lock.
3435 *
3436 * RETURNS:
3437 * %true if some freezeable workqueues are still busy. %false if
3438 * freezing is complete.
3439 */
3440bool freeze_workqueues_busy(void)
3441{
3442 unsigned int cpu;
3443 bool busy = false;
3444
3445 spin_lock(&workqueue_lock);
3446
3447 BUG_ON(!workqueue_freezing);
3448
3449 for_each_gcwq_cpu(cpu) {
3450 struct workqueue_struct *wq;
3451 /*
3452 * nr_active is monotonically decreasing. It's safe
3453 * to peek without lock.
3454 */
3455 list_for_each_entry(wq, &workqueues, list) {
3456 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3457
3458 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3459 continue;
3460
3461 BUG_ON(cwq->nr_active < 0);
3462 if (cwq->nr_active) {
3463 busy = true;
3464 goto out_unlock;
3465 }
3466 }
3467 }
3468out_unlock:
3469 spin_unlock(&workqueue_lock);
3470 return busy;
3471}
3472
3473/**
3474 * thaw_workqueues - thaw workqueues
3475 *
3476 * Thaw workqueues. Normal queueing is restored and all collected
3477 * frozen works are transferred to their respective gcwq worklists.
3478 *
3479 * CONTEXT:
3480 * Grabs and releases workqueue_lock and gcwq->lock's.
3481 */
3482void thaw_workqueues(void)
3483{
3484 unsigned int cpu;
3485
3486 spin_lock(&workqueue_lock);
3487
3488 if (!workqueue_freezing)
3489 goto out_unlock;
3490
3491 for_each_gcwq_cpu(cpu) {
3492 struct global_cwq *gcwq = get_gcwq(cpu);
3493 struct workqueue_struct *wq;
3494
3495 spin_lock_irq(&gcwq->lock);
3496
3497 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3498 gcwq->flags &= ~GCWQ_FREEZING;
3499
3500 list_for_each_entry(wq, &workqueues, list) {
3501 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3502
3503 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3504 continue;
3505
3506 /* restore max_active and repopulate worklist */
3507 cwq->max_active = wq->saved_max_active;
3508
3509 while (!list_empty(&cwq->delayed_works) &&
3510 cwq->nr_active < cwq->max_active)
3511 cwq_activate_first_delayed(cwq);
3512 }
3513
3514 wake_up_worker(gcwq);
3515
3516 spin_unlock_irq(&gcwq->lock);
3517 }
3518
3519 workqueue_freezing = false;
3520out_unlock:
3521 spin_unlock(&workqueue_lock);
3522}
3523#endif /* CONFIG_FREEZER */
3524
3525static int __init init_workqueues(void)
3526{
3527 unsigned int cpu;
3528 int i;
3529
3530 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3531
3532 /* initialize gcwqs */
3533 for_each_gcwq_cpu(cpu) {
3534 struct global_cwq *gcwq = get_gcwq(cpu);
3535
3536 spin_lock_init(&gcwq->lock);
3537 INIT_LIST_HEAD(&gcwq->worklist);
3538 gcwq->cpu = cpu;
3539 if (cpu == WORK_CPU_UNBOUND)
3540 gcwq->flags |= GCWQ_DISASSOCIATED;
3541
3542 INIT_LIST_HEAD(&gcwq->idle_list);
3543 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3544 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3545
3546 init_timer_deferrable(&gcwq->idle_timer);
3547 gcwq->idle_timer.function = idle_worker_timeout;
3548 gcwq->idle_timer.data = (unsigned long)gcwq;
3549
3550 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3551 (unsigned long)gcwq);
3552
3553 ida_init(&gcwq->worker_ida);
3554
3555 gcwq->trustee_state = TRUSTEE_DONE;
3556 init_waitqueue_head(&gcwq->trustee_wait);
3557 }
3558
3559 /* create the initial worker */
3560 for_each_online_gcwq_cpu(cpu) {
3561 struct global_cwq *gcwq = get_gcwq(cpu);
3562 struct worker *worker;
3563
3564 worker = create_worker(gcwq, true);
3565 BUG_ON(!worker);
3566 spin_lock_irq(&gcwq->lock);
3567 start_worker(worker);
3568 spin_unlock_irq(&gcwq->lock);
3569 }
3570
3571 system_wq = alloc_workqueue("events", 0, 0);
3572 system_long_wq = alloc_workqueue("events_long", 0, 0);
3573 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3574 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3575 WQ_UNBOUND_MAX_ACTIVE);
3576 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
3577 return 0;
1214} 3578}
3579early_initcall(init_workqueues);
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);