Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 9
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/async.c | 141
-rw-r--r--  kernel/audit.c | 1
-rw-r--r--  kernel/audit.h | 26
-rw-r--r--  kernel/audit_tree.c | 237
-rw-r--r--  kernel/audit_watch.c | 274
-rw-r--r--  kernel/auditfilter.c | 39
-rw-r--r--  kernel/auditsc.c | 19
-rw-r--r--  kernel/cgroup.c | 30
-rw-r--r--  kernel/compat.c | 38
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 21
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/debug/debug_core.c | 4
-rw-r--r--  kernel/debug/gdbstub.c | 189
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 138
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 9
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/exec_domain.c | 22
-rw-r--r--  kernel/exit.c | 12
-rw-r--r--  kernel/fork.c | 22
-rw-r--r--  kernel/gcov/fs.c | 244
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/hrtimer.c | 20
-rw-r--r--  kernel/hw_breakpoint.c | 93
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/kexec.c | 8
-rw-r--r--  kernel/kfifo.c | 751
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kthread.c | 164
-rw-r--r--  kernel/lockdep.c | 2
-rw-r--r--  kernel/module.c | 1092
-rw-r--r--  kernel/mutex.c | 23
-rw-r--r--  kernel/panic.c | 60
-rw-r--r--  kernel/params.c | 233
-rw-r--r--  kernel/perf_event.c | 492
-rw-r--r--  kernel/pid.c | 56
-rw-r--r--  kernel/pm_qos_params.c | 16
-rw-r--r--  kernel/posix-cpu-timers.c | 44
-rw-r--r--  kernel/posix-timers.c | 11
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 21
-rw-r--r--  kernel/power/snapshot.c | 85
-rw-r--r--  kernel/printk.c | 43
-rw-r--r--  kernel/ptrace.c | 12
-rw-r--r--  kernel/range.c | 4
-rw-r--r--  kernel/rcupdate.c | 160
-rw-r--r--  kernel/rcutiny.c | 2
-rw-r--r--  kernel/rcutorture.c | 3
-rw-r--r--  kernel/rcutree.c | 2
-rw-r--r--  kernel/sched.c | 421
-rw-r--r--  kernel/sched_clock.c | 95
-rw-r--r--  kernel/sched_cpupri.c | 8
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 545
-rw-r--r--  kernel/sched_rt.c | 3
-rw-r--r--  kernel/sched_stats.h | 27
-rw-r--r--  kernel/slow-work-debugfs.c | 227
-rw-r--r--  kernel/slow-work.c | 1068
-rw-r--r--  kernel/slow-work.h | 72
-rw-r--r--  kernel/smp.c | 17
-rw-r--r--  kernel/softlockup.c | 293
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 204
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 81
-rw-r--r--  kernel/time.c | 16
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/clocksource.c | 33
-rw-r--r--  kernel/time/tick-sched.c | 15
-rw-r--r--  kernel/time/timekeeping.c | 93
-rw-r--r--  kernel/timer.c | 41
-rw-r--r--  kernel/trace/Kconfig | 83
-rw-r--r--  kernel/trace/Makefile | 7
-rw-r--r--  kernel/trace/blktrace.c | 88
-rw-r--r--  kernel/trace/ftrace.c | 24
-rw-r--r--  kernel/trace/kmemtrace.c | 529
-rw-r--r--  kernel/trace/ring_buffer.c | 45
-rw-r--r--  kernel/trace/trace.c | 189
-rw-r--r--  kernel/trace/trace.h | 107
-rw-r--r--  kernel/trace/trace_boot.c | 185
-rw-r--r--  kernel/trace/trace_clock.c | 7
-rw-r--r--  kernel/trace/trace_entries.h | 94
-rw-r--r--  kernel/trace/trace_event_perf.c | 30
-rw-r--r--  kernel/trace/trace_events.c | 440
-rw-r--r--  kernel/trace/trace_events_filter.c | 27
-rw-r--r--  kernel/trace/trace_export.c | 8
-rw-r--r--  kernel/trace/trace_functions.c | 6
-rw-r--r--  kernel/trace/trace_functions_graph.c | 13
-rw-r--r--  kernel/trace/trace_irqsoff.c | 3
-rw-r--r--  kernel/trace/trace_kdb.c | 136
-rw-r--r--  kernel/trace/trace_kprobe.c | 426
-rw-r--r--  kernel/trace/trace_ksym.c | 508
-rw-r--r--  kernel/trace/trace_output.c | 69
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 7
-rw-r--r--  kernel/trace/trace_selftest.c | 87
-rw-r--r--  kernel/trace/trace_stack.c | 8
-rw-r--r--  kernel/trace/trace_syscalls.c | 7
-rw-r--r--  kernel/trace/trace_sysprof.c | 329
-rw-r--r--  kernel/watchdog.c | 577
-rw-r--r--  kernel/workqueue.c | 3232
-rw-r--r--  kernel/workqueue_sched.h | 9
106 files changed, 8473 insertions, 6985 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 057472fbc272..0b72d1a74be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_SMP) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
75obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
77obj-$(CONFIG_GCOV_KERNEL) += gcov/
77obj-$(CONFIG_KPROBES) += kprobes.o 78obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_KGDB) += debug/ 79obj-$(CONFIG_KGDB) += debug/
79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
82obj-$(CONFIG_SECCOMP) += seccomp.o 83obj-$(CONFIG_SECCOMP) += seccomp.o
83obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 100obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 101obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
102obj-$(CONFIG_SLOW_WORK) += slow-work.o
103obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
104obj-$(CONFIG_PERF_EVENTS) += perf_event.o 103obj-$(CONFIG_PERF_EVENTS) += perf_event.o
105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
122 spin_unlock(&acct_lock); 122 spin_unlock(&acct_lock);
123 123
124 /* May block */ 124 /* May block */
125 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(&file->f_path, &sbuf))
126 return res; 126 return res;
127 suspend = sbuf.f_blocks * SUSPEND; 127 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 128 resume = sbuf.f_blocks * RESUME;
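The acct.c hunk only tracks a VFS API change: vfs_statfs() now takes a struct path instead of a dentry. A minimal sketch of the new calling convention, not part of the patch; the helper name and use of f_bavail are illustrative only:

```c
#include <linux/fs.h>
#include <linux/statfs.h>

/* Query available blocks on the filesystem backing an open file.
 * Returns 0 on success or the negative errno from vfs_statfs(). */
static int example_free_blocks(struct file *file, u64 *avail)
{
	struct kstatfs sbuf;
	int err;

	/* New API: pass the whole struct path, not just the dentry. */
	err = vfs_statfs(&file->f_path, &sbuf);
	if (err)
		return err;

	*avail = sbuf.f_bavail;
	return 0;
}
```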
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/bug.h>
53#include <linux/module.h> 52#include <linux/module.h>
54#include <linux/wait.h> 53#include <linux/wait.h>
55#include <linux/sched.h> 54#include <linux/sched.h>
56#include <linux/init.h>
57#include <linux/kthread.h>
58#include <linux/delay.h>
59#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/workqueue.h>
60#include <asm/atomic.h> 57#include <asm/atomic.h>
61 58
62static async_cookie_t next_cookie = 1; 59static async_cookie_t next_cookie = 1;
63 60
64#define MAX_THREADS 256
65#define MAX_WORK 32768 61#define MAX_WORK 32768
66 62
67static LIST_HEAD(async_pending); 63static LIST_HEAD(async_pending);
68static LIST_HEAD(async_running); 64static LIST_HEAD(async_running);
69static DEFINE_SPINLOCK(async_lock); 65static DEFINE_SPINLOCK(async_lock);
70 66
71static int async_enabled = 0;
72
73struct async_entry { 67struct async_entry {
74 struct list_head list; 68 struct list_head list;
75 async_cookie_t cookie; 69 struct work_struct work;
76 async_func_ptr *func; 70 async_cookie_t cookie;
77 void *data; 71 async_func_ptr *func;
78 struct list_head *running; 72 void *data;
73 struct list_head *running;
79}; 74};
80 75
81static DECLARE_WAIT_QUEUE_HEAD(async_done); 76static DECLARE_WAIT_QUEUE_HEAD(async_done);
82static DECLARE_WAIT_QUEUE_HEAD(async_new);
83 77
84static atomic_t entry_count; 78static atomic_t entry_count;
85static atomic_t thread_count;
86 79
87extern int initcall_debug; 80extern int initcall_debug;
88 81
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
117 spin_unlock_irqrestore(&async_lock, flags); 110 spin_unlock_irqrestore(&async_lock, flags);
118 return ret; 111 return ret;
119} 112}
113
120/* 114/*
121 * pick the first pending entry and run it 115 * pick the first pending entry and run it
122 */ 116 */
123static void run_one_entry(void) 117static void async_run_entry_fn(struct work_struct *work)
124{ 118{
119 struct async_entry *entry =
120 container_of(work, struct async_entry, work);
125 unsigned long flags; 121 unsigned long flags;
126 struct async_entry *entry;
127 ktime_t calltime, delta, rettime; 122 ktime_t calltime, delta, rettime;
128 123
129 /* 1) pick one task from the pending queue */ 124 /* 1) move self to the running queue */
130
131 spin_lock_irqsave(&async_lock, flags); 125 spin_lock_irqsave(&async_lock, flags);
132 if (list_empty(&async_pending))
133 goto out;
134 entry = list_first_entry(&async_pending, struct async_entry, list);
135
136 /* 2) move it to the running queue */
137 list_move_tail(&entry->list, entry->running); 126 list_move_tail(&entry->list, entry->running);
138 spin_unlock_irqrestore(&async_lock, flags); 127 spin_unlock_irqrestore(&async_lock, flags);
139 128
140 /* 3) run it (and print duration)*/ 129 /* 2) run (and print duration) */
141 if (initcall_debug && system_state == SYSTEM_BOOTING) { 130 if (initcall_debug && system_state == SYSTEM_BOOTING) {
142 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
143 entry->func, task_pid_nr(current)); 132 entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
153 (long long)ktime_to_ns(delta) >> 10); 142 (long long)ktime_to_ns(delta) >> 10);
154 } 143 }
155 144
156 /* 4) remove it from the running queue */ 145 /* 3) remove self from the running queue */
157 spin_lock_irqsave(&async_lock, flags); 146 spin_lock_irqsave(&async_lock, flags);
158 list_del(&entry->list); 147 list_del(&entry->list);
159 148
160 /* 5) free the entry */ 149 /* 4) free the entry */
161 kfree(entry); 150 kfree(entry);
162 atomic_dec(&entry_count); 151 atomic_dec(&entry_count);
163 152
164 spin_unlock_irqrestore(&async_lock, flags); 153 spin_unlock_irqrestore(&async_lock, flags);
165 154
166 /* 6) wake up any waiters. */ 155 /* 5) wake up any waiters */
167 wake_up(&async_done); 156 wake_up(&async_done);
168 return;
169
170out:
171 spin_unlock_irqrestore(&async_lock, flags);
172} 157}
173 158
174
175static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) 159static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
176{ 160{
177 struct async_entry *entry; 161 struct async_entry *entry;
178 unsigned long flags; 162 unsigned long flags;
179 async_cookie_t newcookie; 163 async_cookie_t newcookie;
180
181 164
182 /* allow irq-off callers */ 165 /* allow irq-off callers */
183 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); 166 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
186 * If we're out of memory or if there's too much work 169 * If we're out of memory or if there's too much work
187 * pending already, we execute synchronously. 170 * pending already, we execute synchronously.
188 */ 171 */
189 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { 172 if (!entry || atomic_read(&entry_count) > MAX_WORK) {
190 kfree(entry); 173 kfree(entry);
191 spin_lock_irqsave(&async_lock, flags); 174 spin_lock_irqsave(&async_lock, flags);
192 newcookie = next_cookie++; 175 newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
196 ptr(data, newcookie); 179 ptr(data, newcookie);
197 return newcookie; 180 return newcookie;
198 } 181 }
182 INIT_WORK(&entry->work, async_run_entry_fn);
199 entry->func = ptr; 183 entry->func = ptr;
200 entry->data = data; 184 entry->data = data;
201 entry->running = running; 185 entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
205 list_add_tail(&entry->list, &async_pending); 189 list_add_tail(&entry->list, &async_pending);
206 atomic_inc(&entry_count); 190 atomic_inc(&entry_count);
207 spin_unlock_irqrestore(&async_lock, flags); 191 spin_unlock_irqrestore(&async_lock, flags);
208 wake_up(&async_new); 192
193 /* schedule for execution */
194 queue_work(system_unbound_wq, &entry->work);
195
209 return newcookie; 196 return newcookie;
210} 197}
211 198
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
312 async_synchronize_cookie_domain(cookie, &async_running); 299 async_synchronize_cookie_domain(cookie, &async_running);
313} 300}
314EXPORT_SYMBOL_GPL(async_synchronize_cookie); 301EXPORT_SYMBOL_GPL(async_synchronize_cookie);
315
316
317static int async_thread(void *unused)
318{
319 DECLARE_WAITQUEUE(wq, current);
320 add_wait_queue(&async_new, &wq);
321
322 while (!kthread_should_stop()) {
323 int ret = HZ;
324 set_current_state(TASK_INTERRUPTIBLE);
325 /*
326 * check the list head without lock.. false positives
327 * are dealt with inside run_one_entry() while holding
328 * the lock.
329 */
330 rmb();
331 if (!list_empty(&async_pending))
332 run_one_entry();
333 else
334 ret = schedule_timeout(HZ);
335
336 if (ret == 0) {
337 /*
338 * we timed out, this means we as thread are redundant.
339 * we sign off and die, but we to avoid any races there
340 * is a last-straw check to see if work snuck in.
341 */
342 atomic_dec(&thread_count);
343 wmb(); /* manager must see our departure first */
344 if (list_empty(&async_pending))
345 break;
346 /*
347 * woops work came in between us timing out and us
348 * signing off; we need to stay alive and keep working.
349 */
350 atomic_inc(&thread_count);
351 }
352 }
353 remove_wait_queue(&async_new, &wq);
354
355 return 0;
356}
357
358static int async_manager_thread(void *unused)
359{
360 DECLARE_WAITQUEUE(wq, current);
361 add_wait_queue(&async_new, &wq);
362
363 while (!kthread_should_stop()) {
364 int tc, ec;
365
366 set_current_state(TASK_INTERRUPTIBLE);
367
368 tc = atomic_read(&thread_count);
369 rmb();
370 ec = atomic_read(&entry_count);
371
372 while (tc < ec && tc < MAX_THREADS) {
373 if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
374 tc))) {
375 msleep(100);
376 continue;
377 }
378 atomic_inc(&thread_count);
379 tc++;
380 }
381
382 schedule();
383 }
384 remove_wait_queue(&async_new, &wq);
385
386 return 0;
387}
388
389static int __init async_init(void)
390{
391 async_enabled =
392 !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
393
394 WARN_ON(!async_enabled);
395 return 0;
396}
397
398core_initcall(async_init);
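The async.c changes above delete the home-grown pool (async_thread, async_manager_thread, the MAX_THREADS cap and the async_new waitqueue) and instead embed a work_struct in each async_entry, queuing it on the shared system_unbound_wq. A stripped-down sketch of that pattern, using the same calls the patch uses; the demo_* names are hypothetical, not from the patch:

```c
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_entry {
	struct work_struct work;	/* embedded so the callback can find us */
	int payload;
};

/* Runs in workqueue context; recover the containing entry via container_of. */
static void demo_work_fn(struct work_struct *work)
{
	struct demo_entry *entry = container_of(work, struct demo_entry, work);

	pr_info("processing payload %d\n", entry->payload);
	kfree(entry);
}

/* Allocate and queue one unit of work. */
static int demo_schedule(int payload)
{
	struct demo_entry *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);

	if (!entry)
		return -ENOMEM;

	entry->payload = payload;
	INIT_WORK(&entry->work, demo_work_fn);

	/* system_unbound_wq is not bound to a CPU and may run many items
	 * concurrently, which suits long-running async callbacks. */
	queue_work(system_unbound_wq, &entry->work);
	return 0;
}
```

With the workqueue handling concurrency management, the patch no longer needs the async_enabled flag, the manager thread, or the core_initcall that started it.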
diff --git a/kernel/audit.c b/kernel/audit.c
index 8296aa516c5a..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#include <linux/netlink.h> 58#include <linux/netlink.h>
59#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62 61
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
103extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
104extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
105 105
106extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
107
106/* audit watch functions */ 108/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch); 109#ifdef CONFIG_AUDIT_WATCH
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch); 110extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch); 111extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); 112extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule); 113extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
113extern void audit_remove_watch(struct audit_watch *watch); 114extern void audit_remove_watch_rule(struct audit_krule *krule);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch); 115extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch); 116extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
118 117#else
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, 118#define audit_put_watch(w) {}
120 struct audit_watch *watch); 119#define audit_get_watch(w) {}
120#define audit_to_watch(k, p, l, o) (-EINVAL)
121#define audit_add_watch(k, l) (-EINVAL)
122#define audit_remove_watch_rule(k) BUG()
123#define audit_watch_path(w) ""
124#define audit_watch_compare(w, i, d) 0
125
126#endif /* CONFIG_AUDIT_WATCH */
121 127
122#ifdef CONFIG_AUDIT_TREE 128#ifdef CONFIG_AUDIT_TREE
123extern struct audit_chunk *audit_tree_lookup(const struct inode *); 129extern struct audit_chunk *audit_tree_lookup(const struct inode *);
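The audit.h hunk introduces a CONFIG_AUDIT_WATCH switch: when the option is off, the watch helpers are compiled out and replaced by macro stubs that fail with -EINVAL, expand to nothing, or BUG() on paths that should be unreachable. A generic sketch of that idiom; CONFIG_EXAMPLE_FEATURE, struct example_obj and the helper names are hypothetical, chosen only to illustrate the pattern:

```c
/* example_feature.h -- hypothetical header showing the stub idiom */
#include <linux/bug.h>
#include <linux/errno.h>

struct example_obj;

#ifdef CONFIG_EXAMPLE_FEATURE
/* Real implementations live in example_feature.c */
extern int example_feature_add(struct example_obj *obj);
extern void example_feature_remove(struct example_obj *obj);
#else
/* Callers still compile: adding the feature reports -EINVAL, and a
 * removal path that should never run with the feature disabled BUGs. */
#define example_feature_add(obj)	(-EINVAL)
#define example_feature_remove(obj)	BUG()
#endif /* CONFIG_EXAMPLE_FEATURE */
```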
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
1#include "audit.h" 1#include "audit.h"
2#include <linux/inotify.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
22 22
23struct audit_chunk { 23struct audit_chunk {
24 struct list_head hash; 24 struct list_head hash;
25 struct inotify_watch watch; 25 struct fsnotify_mark mark;
26 struct list_head trees; /* with root here */ 26 struct list_head trees; /* with root here */
27 int dead; 27 int dead;
28 int count; 28 int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
59 * tree is refcounted; one reference for "some rules on rules_list refer to 59 * tree is refcounted; one reference for "some rules on rules_list refer to
60 * it", one for each chunk with pointer to it. 60 * it", one for each chunk with pointer to it.
61 * 61 *
62 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount 62 * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
63 * of watch contributes 1 to .refs). 63 * of watch contributes 1 to .refs).
64 * 64 *
65 * node.index allows to get from node.list to containing chunk. 65 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
68 * that makes a difference. Some. 68 * that makes a difference. Some.
69 */ 69 */
70 70
71static struct inotify_handle *rtree_ih; 71static struct fsnotify_group *audit_tree_group;
72 72
73static struct audit_tree *alloc_tree(const char *s) 73static struct audit_tree *alloc_tree(const char *s)
74{ 74{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
111 return tree->pathname; 111 return tree->pathname;
112} 112}
113 113
114static struct audit_chunk *alloc_chunk(int count)
115{
116 struct audit_chunk *chunk;
117 size_t size;
118 int i;
119
120 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
121 chunk = kzalloc(size, GFP_KERNEL);
122 if (!chunk)
123 return NULL;
124
125 INIT_LIST_HEAD(&chunk->hash);
126 INIT_LIST_HEAD(&chunk->trees);
127 chunk->count = count;
128 atomic_long_set(&chunk->refs, 1);
129 for (i = 0; i < count; i++) {
130 INIT_LIST_HEAD(&chunk->owners[i].list);
131 chunk->owners[i].index = i;
132 }
133 inotify_init_watch(&chunk->watch);
134 return chunk;
135}
136
137static void free_chunk(struct audit_chunk *chunk) 114static void free_chunk(struct audit_chunk *chunk)
138{ 115{
139 int i; 116 int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
157 audit_put_chunk(chunk); 134 audit_put_chunk(chunk);
158} 135}
159 136
137static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
138{
139 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
140 call_rcu(&chunk->head, __put_chunk);
141}
142
143static struct audit_chunk *alloc_chunk(int count)
144{
145 struct audit_chunk *chunk;
146 size_t size;
147 int i;
148
149 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
150 chunk = kzalloc(size, GFP_KERNEL);
151 if (!chunk)
152 return NULL;
153
154 INIT_LIST_HEAD(&chunk->hash);
155 INIT_LIST_HEAD(&chunk->trees);
156 chunk->count = count;
157 atomic_long_set(&chunk->refs, 1);
158 for (i = 0; i < count; i++) {
159 INIT_LIST_HEAD(&chunk->owners[i].list);
160 chunk->owners[i].index = i;
161 }
162 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
163 return chunk;
164}
165
160enum {HASH_SIZE = 128}; 166enum {HASH_SIZE = 128};
161static struct list_head chunk_hash_heads[HASH_SIZE]; 167static struct list_head chunk_hash_heads[HASH_SIZE];
162static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); 168static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
167 return chunk_hash_heads + n % HASH_SIZE; 173 return chunk_hash_heads + n % HASH_SIZE;
168} 174}
169 175
170/* hash_lock is held by caller */ 176/* hash_lock & entry->lock is held by caller */
171static void insert_hash(struct audit_chunk *chunk) 177static void insert_hash(struct audit_chunk *chunk)
172{ 178{
173 struct list_head *list = chunk_hash(chunk->watch.inode); 179 struct fsnotify_mark *entry = &chunk->mark;
180 struct list_head *list;
181
182 if (!entry->i.inode)
183 return;
184 list = chunk_hash(entry->i.inode);
174 list_add_rcu(&chunk->hash, list); 185 list_add_rcu(&chunk->hash, list);
175} 186}
176 187
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
181 struct audit_chunk *p; 192 struct audit_chunk *p;
182 193
183 list_for_each_entry_rcu(p, list, hash) { 194 list_for_each_entry_rcu(p, list, hash) {
184 if (p->watch.inode == inode) { 195 /* mark.inode may have gone NULL, but who cares? */
196 if (p->mark.i.inode == inode) {
185 atomic_long_inc(&p->refs); 197 atomic_long_inc(&p->refs);
186 return p; 198 return p;
187 } 199 }
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
210static void untag_chunk(struct node *p) 222static void untag_chunk(struct node *p)
211{ 223{
212 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark;
213 struct audit_chunk *new; 226 struct audit_chunk *new;
214 struct audit_tree *owner; 227 struct audit_tree *owner;
215 int size = chunk->count - 1; 228 int size = chunk->count - 1;
216 int i, j; 229 int i, j;
217 230
218 if (!pin_inotify_watch(&chunk->watch)) { 231 fsnotify_get_mark(entry);
219 /*
220 * Filesystem is shutting down; all watches are getting
221 * evicted, just take it off the node list for this
222 * tree and let the eviction logics take care of the
223 * rest.
224 */
225 owner = p->owner;
226 if (owner->root == chunk) {
227 list_del_init(&owner->same_root);
228 owner->root = NULL;
229 }
230 list_del_init(&p->list);
231 p->owner = NULL;
232 put_tree(owner);
233 return;
234 }
235 232
236 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
237 234
238 /* 235 spin_lock(&entry->lock);
239 * pin_inotify_watch() succeeded, so the watch won't go away 236 if (chunk->dead || !entry->i.inode) {
240 * from under us. 237 spin_unlock(&entry->lock);
241 */
242 mutex_lock(&chunk->watch.inode->inotify_mutex);
243 if (chunk->dead) {
244 mutex_unlock(&chunk->watch.inode->inotify_mutex);
245 goto out; 238 goto out;
246 } 239 }
247 240
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
256 list_del_init(&p->list); 249 list_del_init(&p->list);
257 list_del_rcu(&chunk->hash); 250 list_del_rcu(&chunk->hash);
258 spin_unlock(&hash_lock); 251 spin_unlock(&hash_lock);
259 inotify_evict_watch(&chunk->watch); 252 spin_unlock(&entry->lock);
260 mutex_unlock(&chunk->watch.inode->inotify_mutex); 253 fsnotify_destroy_mark(entry);
261 put_inotify_watch(&chunk->watch); 254 fsnotify_put_mark(entry);
262 goto out; 255 goto out;
263 } 256 }
264 257
265 new = alloc_chunk(size); 258 new = alloc_chunk(size);
266 if (!new) 259 if (!new)
267 goto Fallback; 260 goto Fallback;
268 if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { 261 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
269 free_chunk(new); 263 free_chunk(new);
270 goto Fallback; 264 goto Fallback;
271 } 265 }
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
298 list_for_each_entry(owner, &new->trees, same_root) 292 list_for_each_entry(owner, &new->trees, same_root)
299 owner->root = new; 293 owner->root = new;
300 spin_unlock(&hash_lock); 294 spin_unlock(&hash_lock);
301 inotify_evict_watch(&chunk->watch); 295 spin_unlock(&entry->lock);
302 mutex_unlock(&chunk->watch.inode->inotify_mutex); 296 fsnotify_destroy_mark(entry);
303 put_inotify_watch(&chunk->watch); 297 fsnotify_put_mark(entry);
304 goto out; 298 goto out;
305 299
306Fallback: 300Fallback:
@@ -314,31 +308,33 @@ Fallback:
314 p->owner = NULL; 308 p->owner = NULL;
315 put_tree(owner); 309 put_tree(owner);
316 spin_unlock(&hash_lock); 310 spin_unlock(&hash_lock);
317 mutex_unlock(&chunk->watch.inode->inotify_mutex); 311 spin_unlock(&entry->lock);
318out: 312out:
319 unpin_inotify_watch(&chunk->watch); 313 fsnotify_put_mark(entry);
320 spin_lock(&hash_lock); 314 spin_lock(&hash_lock);
321} 315}
322 316
323static int create_chunk(struct inode *inode, struct audit_tree *tree) 317static int create_chunk(struct inode *inode, struct audit_tree *tree)
324{ 318{
319 struct fsnotify_mark *entry;
325 struct audit_chunk *chunk = alloc_chunk(1); 320 struct audit_chunk *chunk = alloc_chunk(1);
326 if (!chunk) 321 if (!chunk)
327 return -ENOMEM; 322 return -ENOMEM;
328 323
329 if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { 324 entry = &chunk->mark;
325 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
330 free_chunk(chunk); 326 free_chunk(chunk);
331 return -ENOSPC; 327 return -ENOSPC;
332 } 328 }
333 329
334 mutex_lock(&inode->inotify_mutex); 330 spin_lock(&entry->lock);
335 spin_lock(&hash_lock); 331 spin_lock(&hash_lock);
336 if (tree->goner) { 332 if (tree->goner) {
337 spin_unlock(&hash_lock); 333 spin_unlock(&hash_lock);
338 chunk->dead = 1; 334 chunk->dead = 1;
339 inotify_evict_watch(&chunk->watch); 335 spin_unlock(&entry->lock);
340 mutex_unlock(&inode->inotify_mutex); 336 fsnotify_destroy_mark(entry);
341 put_inotify_watch(&chunk->watch); 337 fsnotify_put_mark(entry);
342 return 0; 338 return 0;
343 } 339 }
344 chunk->owners[0].index = (1U << 31); 340 chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
351 } 347 }
352 insert_hash(chunk); 348 insert_hash(chunk);
353 spin_unlock(&hash_lock); 349 spin_unlock(&hash_lock);
354 mutex_unlock(&inode->inotify_mutex); 350 spin_unlock(&entry->lock);
355 return 0; 351 return 0;
356} 352}
357 353
358/* the first tagged inode becomes root of tree */ 354/* the first tagged inode becomes root of tree */
359static int tag_chunk(struct inode *inode, struct audit_tree *tree) 355static int tag_chunk(struct inode *inode, struct audit_tree *tree)
360{ 356{
361 struct inotify_watch *watch; 357 struct fsnotify_mark *old_entry, *chunk_entry;
362 struct audit_tree *owner; 358 struct audit_tree *owner;
363 struct audit_chunk *chunk, *old; 359 struct audit_chunk *chunk, *old;
364 struct node *p; 360 struct node *p;
365 int n; 361 int n;
366 362
367 if (inotify_find_watch(rtree_ih, inode, &watch) < 0) 363 old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
364 if (!old_entry)
368 return create_chunk(inode, tree); 365 return create_chunk(inode, tree);
369 366
370 old = container_of(watch, struct audit_chunk, watch); 367 old = container_of(old_entry, struct audit_chunk, mark);
371 368
372 /* are we already there? */ 369 /* are we already there? */
373 spin_lock(&hash_lock); 370 spin_lock(&hash_lock);
374 for (n = 0; n < old->count; n++) { 371 for (n = 0; n < old->count; n++) {
375 if (old->owners[n].owner == tree) { 372 if (old->owners[n].owner == tree) {
376 spin_unlock(&hash_lock); 373 spin_unlock(&hash_lock);
377 put_inotify_watch(&old->watch); 374 fsnotify_put_mark(old_entry);
378 return 0; 375 return 0;
379 } 376 }
380 } 377 }
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
382 379
383 chunk = alloc_chunk(old->count + 1); 380 chunk = alloc_chunk(old->count + 1);
384 if (!chunk) { 381 if (!chunk) {
385 put_inotify_watch(&old->watch); 382 fsnotify_put_mark(old_entry);
386 return -ENOMEM; 383 return -ENOMEM;
387 } 384 }
388 385
389 mutex_lock(&inode->inotify_mutex); 386 chunk_entry = &chunk->mark;
390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 387
391 mutex_unlock(&inode->inotify_mutex); 388 spin_lock(&old_entry->lock);
392 put_inotify_watch(&old->watch); 389 if (!old_entry->i.inode) {
390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry);
393 free_chunk(chunk); 393 free_chunk(chunk);
394 return -ENOENT;
395 }
396
397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock);
400 free_chunk(chunk);
401 fsnotify_put_mark(old_entry);
394 return -ENOSPC; 402 return -ENOSPC;
395 } 403 }
404
405 /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
406 spin_lock(&chunk_entry->lock);
396 spin_lock(&hash_lock); 407 spin_lock(&hash_lock);
408
409 /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
397 if (tree->goner) { 410 if (tree->goner) {
398 spin_unlock(&hash_lock); 411 spin_unlock(&hash_lock);
399 chunk->dead = 1; 412 chunk->dead = 1;
400 inotify_evict_watch(&chunk->watch); 413 spin_unlock(&chunk_entry->lock);
401 mutex_unlock(&inode->inotify_mutex); 414 spin_unlock(&old_entry->lock);
402 put_inotify_watch(&old->watch); 415
403 put_inotify_watch(&chunk->watch); 416 fsnotify_destroy_mark(chunk_entry);
417
418 fsnotify_put_mark(chunk_entry);
419 fsnotify_put_mark(old_entry);
404 return 0; 420 return 0;
405 } 421 }
406 list_replace_init(&old->trees, &chunk->trees); 422 list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
426 list_add(&tree->same_root, &chunk->trees); 442 list_add(&tree->same_root, &chunk->trees);
427 } 443 }
428 spin_unlock(&hash_lock); 444 spin_unlock(&hash_lock);
429 inotify_evict_watch(&old->watch); 445 spin_unlock(&chunk_entry->lock);
430 mutex_unlock(&inode->inotify_mutex); 446 spin_unlock(&old_entry->lock);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 447 fsnotify_destroy_mark(old_entry);
432 put_inotify_watch(&old->watch); /* and kill it */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 fsnotify_put_mark(old_entry); /* and kill it */
433 return 0; 450 return 0;
434} 451}
435 452
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
584 601
585 spin_lock(&hash_lock); 602 spin_lock(&hash_lock);
586 list_for_each_entry(node, &tree->chunks, list) { 603 list_for_each_entry(node, &tree->chunks, list) {
587 struct inode *inode = find_chunk(node)->watch.inode; 604 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dieing else where... */
606 struct inode *inode = chunk->mark.i.inode;
588 node->index |= 1U<<31; 607 node->index |= 1U<<31;
589 if (iterate_mounts(compare_root, inode, root_mnt)) 608 if (iterate_mounts(compare_root, inode, root_mnt))
590 node->index &= ~(1U<<31); 609 node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
846 * Here comes the stuff asynchronous to auditctl operations 865 * Here comes the stuff asynchronous to auditctl operations
847 */ 866 */
848 867
849/* inode->inotify_mutex is locked */
850static void evict_chunk(struct audit_chunk *chunk) 868static void evict_chunk(struct audit_chunk *chunk)
851{ 869{
852 struct audit_tree *owner; 870 struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
885 mutex_unlock(&audit_filter_mutex); 903 mutex_unlock(&audit_filter_mutex);
886} 904}
887 905
888static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, 906static int audit_tree_handle_event(struct fsnotify_group *group,
889 u32 cookie, const char *dname, struct inode *inode) 907 struct fsnotify_mark *inode_mark,
908 struct fsnotify_mark *vfsmonut_mark,
909 struct fsnotify_event *event)
910{
911 BUG();
912 return -EOPNOTSUPP;
913}
914
915static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
890{ 916{
891 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 917 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
892 918
893 if (mask & IN_IGNORED) { 919 evict_chunk(chunk);
894 evict_chunk(chunk); 920 fsnotify_put_mark(entry);
895 put_inotify_watch(watch);
896 }
897} 921}
898 922
899static void destroy_watch(struct inotify_watch *watch) 923static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
924 struct fsnotify_mark *inode_mark,
925 struct fsnotify_mark *vfsmount_mark,
926 __u32 mask, void *data, int data_type)
900{ 927{
901 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 928 return false;
902 call_rcu(&chunk->head, __put_chunk);
903} 929}
904 930
905static const struct inotify_operations rtree_inotify_ops = { 931static const struct fsnotify_ops audit_tree_ops = {
906 .handle_event = handle_event, 932 .handle_event = audit_tree_handle_event,
907 .destroy_watch = destroy_watch, 933 .should_send_event = audit_tree_send_event,
934 .free_group_priv = NULL,
935 .free_event_priv = NULL,
936 .freeing_mark = audit_tree_freeing_mark,
908}; 937};
909 938
910static int __init audit_tree_init(void) 939static int __init audit_tree_init(void)
911{ 940{
912 int i; 941 int i;
913 942
914 rtree_ih = inotify_init(&rtree_inotify_ops); 943 audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
915 if (IS_ERR(rtree_ih)) 944 if (IS_ERR(audit_tree_group))
916 audit_panic("cannot initialize inotify handle for rectree watches"); 945 audit_panic("cannot initialize fsnotify group for rectree watches");
917 946
918 for (i = 0; i < HASH_SIZE; i++) 947 for (i = 0; i < HASH_SIZE; i++)
919 INIT_LIST_HEAD(&chunk_hash_heads[i]); 948 INIT_LIST_HEAD(&chunk_hash_heads[i]);
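The audit_tree.c conversion swaps the inotify_watch embedded in each chunk for an fsnotify_mark, with a free callback passed to fsnotify_init_mark() instead of inotify's destroy_watch op. A condensed sketch of that mark lifecycle, restricted to the fsnotify calls this patch itself uses (the 2.6.36-era API; later kernels changed some signatures). The demo_* names are illustrative stand-ins, and demo_group is assumed to come from fsnotify_alloc_group():

```c
#include <linux/fs.h>
#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

struct demo_chunk {
	struct fsnotify_mark mark;	/* embedded mark, keyed on an inode */
	int data;
};

static struct fsnotify_group *demo_group;	/* from fsnotify_alloc_group() */

/* Called once the mark's last reference is dropped. */
static void demo_free_mark(struct fsnotify_mark *mark)
{
	kfree(container_of(mark, struct demo_chunk, mark));
}

/* Attach a new chunk to @inode, or return the existing one. */
static struct demo_chunk *demo_tag_inode(struct inode *inode)
{
	struct fsnotify_mark *old;
	struct demo_chunk *chunk;

	old = fsnotify_find_inode_mark(demo_group, inode);
	if (old)	/* find takes a reference for the caller */
		return container_of(old, struct demo_chunk, mark);

	chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
	if (!chunk)
		return NULL;

	fsnotify_init_mark(&chunk->mark, demo_free_mark);
	if (fsnotify_add_mark(&chunk->mark, demo_group, inode, NULL, 0)) {
		kfree(chunk);	/* mark never became visible */
		return NULL;
	}
	return chunk;
}

/* Detach from the inode, then drop our own reference. */
static void demo_untag(struct demo_chunk *chunk)
{
	fsnotify_destroy_mark(&chunk->mark);
	fsnotify_put_mark(&chunk->mark);
}
```

This mirrors the destroy-then-put sequence the patch uses in untag_chunk() and the create_chunk() goner path.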
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fsnotify_backend.h>
27#include <linux/namei.h> 28#include <linux/namei.h>
28#include <linux/netlink.h> 29#include <linux/netlink.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31#include <linux/inotify.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include "audit.h" 33#include "audit.h"
34 34
35/* 35/*
36 * Reference counting: 36 * Reference counting:
37 * 37 *
38 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED 38 * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
39 * event. Each audit_watch holds a reference to its associated parent. 39 * event. Each audit_watch holds a reference to its associated parent.
40 * 40 *
41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to 41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
51 unsigned long ino; /* associated inode number */ 51 unsigned long ino; /* associated inode number */
52 struct audit_parent *parent; /* associated parent */ 52 struct audit_parent *parent; /* associated parent */
53 struct list_head wlist; /* entry in parent->watches list */ 53 struct list_head wlist; /* entry in parent->watches list */
54 struct list_head rules; /* associated rules */ 54 struct list_head rules; /* anchor for krule->rlist */
55}; 55};
56 56
57struct audit_parent { 57struct audit_parent {
58 struct list_head ilist; /* entry in inotify registration list */ 58 struct list_head watches; /* anchor for audit_watch->wlist */
59 struct list_head watches; /* associated watches */ 59 struct fsnotify_mark mark; /* fsnotify mark on the inode */
60 struct inotify_watch wdata; /* inotify watch data */
61 unsigned flags; /* status flags */
62}; 60};
63 61
64/* Inotify handle. */ 62/* fsnotify handle. */
65struct inotify_handle *audit_ih; 63struct fsnotify_group *audit_watch_group;
66 64
67/* 65/* fsnotify events we care about. */
68 * audit_parent status flags: 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
69 * 67 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
70 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
71 * a filesystem event to ensure we're adding audit watches to a valid parent.
72 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
73 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
74 * we can receive while holding nameidata.
75 */
76#define AUDIT_PARENT_INVALID 0x001
77 68
78/* Inotify events we care about. */ 69static void audit_free_parent(struct audit_parent *parent)
79#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 70{
71 WARN_ON(!list_empty(&parent->watches));
72 kfree(parent);
73}
80 74
81static void audit_free_parent(struct inotify_watch *i_watch) 75static void audit_watch_free_mark(struct fsnotify_mark *entry)
82{ 76{
83 struct audit_parent *parent; 77 struct audit_parent *parent;
84 78
85 parent = container_of(i_watch, struct audit_parent, wdata); 79 parent = container_of(entry, struct audit_parent, mark);
86 WARN_ON(!list_empty(&parent->watches)); 80 audit_free_parent(parent);
87 kfree(parent); 81}
82
83static void audit_get_parent(struct audit_parent *parent)
84{
85 if (likely(parent))
86 fsnotify_get_mark(&parent->mark);
87}
88
89static void audit_put_parent(struct audit_parent *parent)
90{
91 if (likely(parent))
92 fsnotify_put_mark(&parent->mark);
93}
94
95/*
96 * Find and return the audit_parent on the given inode. If found a reference
97 * is taken on this parent.
98 */
99static inline struct audit_parent *audit_find_parent(struct inode *inode)
100{
101 struct audit_parent *parent = NULL;
102 struct fsnotify_mark *entry;
103
104 entry = fsnotify_find_inode_mark(audit_watch_group, inode);
105 if (entry)
106 parent = container_of(entry, struct audit_parent, mark);
107
108 return parent;
88} 109}
89 110
90void audit_get_watch(struct audit_watch *watch) 111void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
105void audit_remove_watch(struct audit_watch *watch) 126void audit_remove_watch(struct audit_watch *watch)
106{ 127{
107 list_del(&watch->wlist); 128 list_del(&watch->wlist);
108 put_inotify_watch(&watch->parent->wdata); 129 audit_put_parent(watch->parent);
109 watch->parent = NULL; 130 watch->parent = NULL;
110 audit_put_watch(watch); /* match initial get */ 131 audit_put_watch(watch); /* match initial get */
111} 132}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
115 return watch->path; 136 return watch->path;
116} 137}
117 138
118struct list_head *audit_watch_rules(struct audit_watch *watch) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
119{
120 return &watch->rules;
121}
122
123unsigned long audit_watch_inode(struct audit_watch *watch)
124{ 140{
125 return watch->ino; 141 return (watch->ino != (unsigned long)-1) &&
126} 142 (watch->ino == ino) &&
127 143 (watch->dev == dev);
128dev_t audit_watch_dev(struct audit_watch *watch)
129{
130 return watch->dev;
131} 144}
132 145
133/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
134static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct nameidata *ndp)
135{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode;
136 struct audit_parent *parent; 150 struct audit_parent *parent;
137 s32 wd; 151 int ret;
138 152
139 parent = kzalloc(sizeof(*parent), GFP_KERNEL); 153 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
140 if (unlikely(!parent)) 154 if (unlikely(!parent))
141 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
142 156
143 INIT_LIST_HEAD(&parent->watches); 157 INIT_LIST_HEAD(&parent->watches);
144 parent->flags = 0; 158
145 159 fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
146 inotify_init_watch(&parent->wdata); 160 parent->mark.mask = AUDIT_FS_WATCH;
147 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ 161 ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
148 get_inotify_watch(&parent->wdata); 162 if (ret < 0) {
149 wd = inotify_add_watch(audit_ih, &parent->wdata, 163 audit_free_parent(parent);
150 ndp->path.dentry->d_inode, AUDIT_IN_WATCH); 164 return ERR_PTR(ret);
151 if (wd < 0) {
152 audit_free_parent(&parent->wdata);
153 return ERR_PTR(wd);
154 } 165 }
155 166
156 return parent; 167 return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
179{ 190{
180 struct audit_watch *watch; 191 struct audit_watch *watch;
181 192
182 if (!audit_ih) 193 if (!audit_watch_group)
183 return -EOPNOTSUPP; 194 return -EOPNOTSUPP;
184 195
185 if (path[0] != '/' || path[len-1] == '/' || 196 if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
217 228
218 new->dev = old->dev; 229 new->dev = old->dev;
219 new->ino = old->ino; 230 new->ino = old->ino;
220 get_inotify_watch(&old->parent->wdata); 231 audit_get_parent(old->parent);
221 new->parent = old->parent; 232 new->parent = old->parent;
222 233
223out: 234out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
251 struct audit_entry *oentry, *nentry; 262 struct audit_entry *oentry, *nentry;
252 263
253 mutex_lock(&audit_filter_mutex); 264 mutex_lock(&audit_filter_mutex);
265 /* Run all of the watches on this parent looking for the one that
266 * matches the given dname */
254 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
255 if (audit_compare_dname_path(dname, owatch->path, NULL)) 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
256 continue; 269 continue;
257 270
258 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
259 * filtering now, so we don't omit records. */ 272 * filtering now, so we don't omit records. */
260 if (invalidating && current->audit_context) 273 if (invalidating && !audit_dummy_context())
261 audit_filter_inodes(current, current->audit_context); 274 audit_filter_inodes(current, current->audit_context);
262 275
276 /* updating ino will likely change which audit_hash_list we
277 * are on so we need a new watch for the new list */
263 nwatch = audit_dupe_watch(owatch); 278 nwatch = audit_dupe_watch(owatch);
264 if (IS_ERR(nwatch)) { 279 if (IS_ERR(nwatch)) {
265 mutex_unlock(&audit_filter_mutex); 280 mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
275 list_del(&oentry->rule.rlist); 290 list_del(&oentry->rule.rlist);
276 list_del_rcu(&oentry->list); 291 list_del_rcu(&oentry->list);
277 292
278 nentry = audit_dupe_rule(&oentry->rule, nwatch); 293 nentry = audit_dupe_rule(&oentry->rule);
279 if (IS_ERR(nentry)) { 294 if (IS_ERR(nentry)) {
280 list_del(&oentry->rule.list); 295 list_del(&oentry->rule.list);
281 audit_panic("error updating watch, removing"); 296 audit_panic("error updating watch, removing");
282 } else { 297 } else {
283 int h = audit_hash_ino((u32)ino); 298 int h = audit_hash_ino((u32)ino);
299
300 /*
301 * nentry->rule.watch == oentry->rule.watch so
302 * we must drop that reference and set it to our
303 * new watch.
304 */
305 audit_put_watch(nentry->rule.watch);
306 audit_get_watch(nwatch);
307 nentry->rule.watch = nwatch;
284 list_add(&nentry->rule.rlist, &nwatch->rules); 308 list_add(&nentry->rule.rlist, &nwatch->rules);
285 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 309 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
286 list_replace(&oentry->rule.list, 310 list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
312 struct audit_entry *e; 336 struct audit_entry *e;
313 337
314 mutex_lock(&audit_filter_mutex); 338 mutex_lock(&audit_filter_mutex);
315 parent->flags |= AUDIT_PARENT_INVALID;
316 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 339 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
317 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 340 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
318 e = container_of(r, struct audit_entry, rule); 341 e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
325 audit_remove_watch(w); 348 audit_remove_watch(w);
326 } 349 }
327 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
328}
329
330/* Unregister inotify watches for parents on in_list.
331 * Generates an IN_IGNORED event. */
332void audit_inotify_unregister(struct list_head *in_list)
333{
334 struct audit_parent *p, *n;
335 351
336 list_for_each_entry_safe(p, n, in_list, ilist) { 352 fsnotify_destroy_mark(&parent->mark);
337 list_del(&p->ilist);
338 inotify_rm_watch(audit_ih, &p->wdata);
339 /* the unpin matching the pin in audit_do_del_rule() */
340 unpin_inotify_watch(&p->wdata);
341 }
342} 353}
343 354
344/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
389 } 400 }
390} 401}
391 402
392/* Associate the given rule with an existing parent inotify_watch. 403/* Associate the given rule with an existing parent.
393 * Caller must hold audit_filter_mutex. */ 404 * Caller must hold audit_filter_mutex. */
394static void audit_add_to_parent(struct audit_krule *krule, 405static void audit_add_to_parent(struct audit_krule *krule,
395 struct audit_parent *parent) 406 struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
397 struct audit_watch *w, *watch = krule->watch; 408 struct audit_watch *w, *watch = krule->watch;
398 int watch_found = 0; 409 int watch_found = 0;
399 410
411 BUG_ON(!mutex_is_locked(&audit_filter_mutex));
412
400 list_for_each_entry(w, &parent->watches, wlist) { 413 list_for_each_entry(w, &parent->watches, wlist) {
401 if (strcmp(watch->path, w->path)) 414 if (strcmp(watch->path, w->path))
402 continue; 415 continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
413 } 426 }
414 427
415 if (!watch_found) { 428 if (!watch_found) {
416 get_inotify_watch(&parent->wdata); 429 audit_get_parent(parent);
417 watch->parent = parent; 430 watch->parent = parent;
418 431
419 list_add(&watch->wlist, &parent->watches); 432 list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
423 436
424/* Find a matching watch entry, or add this one. 437/* Find a matching watch entry, or add this one.
425 * Caller must hold audit_filter_mutex. */ 438 * Caller must hold audit_filter_mutex. */
426int audit_add_watch(struct audit_krule *krule) 439int audit_add_watch(struct audit_krule *krule, struct list_head **list)
427{ 440{
428 struct audit_watch *watch = krule->watch; 441 struct audit_watch *watch = krule->watch;
429 struct inotify_watch *i_watch;
430 struct audit_parent *parent; 442 struct audit_parent *parent;
431 struct nameidata *ndp = NULL, *ndw = NULL; 443 struct nameidata *ndp = NULL, *ndw = NULL;
432 int ret = 0; 444 int h, ret = 0;
433 445
434 mutex_unlock(&audit_filter_mutex); 446 mutex_unlock(&audit_filter_mutex);
435 447
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
441 goto error; 453 goto error;
442 } 454 }
443 455
456 mutex_lock(&audit_filter_mutex);
457
444 /* update watch filter fields */ 458 /* update watch filter fields */
445 if (ndw) { 459 if (ndw) {
446 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; 460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
447 watch->ino = ndw->path.dentry->d_inode->i_ino; 461 watch->ino = ndw->path.dentry->d_inode->i_ino;
448 } 462 }
449 463
450 /* The audit_filter_mutex must not be held during inotify calls because 464 /* either find an old parent or attach a new one */
451 * we hold it during inotify event callback processing. If an existing 465 parent = audit_find_parent(ndp->path.dentry->d_inode);
452 * inotify watch is found, inotify_find_watch() grabs a reference before 466 if (!parent) {
453 * returning.
454 */
455 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
456 &i_watch) < 0) {
457 parent = audit_init_parent(ndp); 467 parent = audit_init_parent(ndp);
458 if (IS_ERR(parent)) { 468 if (IS_ERR(parent)) {
459 /* caller expects mutex locked */
460 mutex_lock(&audit_filter_mutex);
461 ret = PTR_ERR(parent); 469 ret = PTR_ERR(parent);
462 goto error; 470 goto error;
463 } 471 }
464 } else 472 }
465 parent = container_of(i_watch, struct audit_parent, wdata);
466
467 mutex_lock(&audit_filter_mutex);
468 473
469 /* parent was moved before we took audit_filter_mutex */ 474 audit_add_to_parent(krule, parent);
470 if (parent->flags & AUDIT_PARENT_INVALID)
471 ret = -ENOENT;
472 else
473 audit_add_to_parent(krule, parent);
474 475
475 /* match get in audit_init_parent or inotify_find_watch */ 476 /* match get in audit_find_parent or audit_init_parent */
476 put_inotify_watch(&parent->wdata); 477 audit_put_parent(parent);
477 478
479 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h];
478error: 481error:
479 audit_put_nd(ndp, ndw); /* NULL args OK */ 482 audit_put_nd(ndp, ndw); /* NULL args OK */
480 return ret; 483 return ret;
481 484
482} 485}
483 486
484void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) 487void audit_remove_watch_rule(struct audit_krule *krule)
485{ 488{
486 struct audit_watch *watch = krule->watch; 489 struct audit_watch *watch = krule->watch;
487 struct audit_parent *parent = watch->parent; 490 struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
492 audit_remove_watch(watch); 495 audit_remove_watch(watch);
493 496
494 if (list_empty(&parent->watches)) { 497 if (list_empty(&parent->watches)) {
495 /* Put parent on the inotify un-registration 498 audit_get_parent(parent);
496 * list. Grab a reference before releasing 499 fsnotify_destroy_mark(&parent->mark);
497 * audit_filter_mutex, to be released in 500 audit_put_parent(parent);
498 * audit_inotify_unregister().
499 * If filesystem is going away, just leave
500 * the sucker alone, eviction will take
501 * care of it. */
502 if (pin_inotify_watch(&parent->wdata))
503 list_add(&parent->ilist, list);
504 } 501 }
505 } 502 }
506} 503}
507 504
508/* Update watch data in audit rules based on inotify events. */ 505static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
509static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, 506 struct fsnotify_mark *inode_mark,
510 u32 cookie, const char *dname, struct inode *inode) 507 struct fsnotify_mark *vfsmount_mark,
508 __u32 mask, void *data, int data_type)
509{
510 return true;
511}
512
513/* Update watch data in audit rules based on fsnotify events. */
514static int audit_watch_handle_event(struct fsnotify_group *group,
515 struct fsnotify_mark *inode_mark,
516 struct fsnotify_mark *vfsmount_mark,
517 struct fsnotify_event *event)
511{ 518{
519 struct inode *inode;
520 __u32 mask = event->mask;
521 const char *dname = event->file_name;
512 struct audit_parent *parent; 522 struct audit_parent *parent;
513 523
514 parent = container_of(i_watch, struct audit_parent, wdata); 524 parent = container_of(inode_mark, struct audit_parent, mark);
515 525
516 if (mask & (IN_CREATE|IN_MOVED_TO) && inode) 526 BUG_ON(group != audit_watch_group);
517 audit_update_watch(parent, dname, inode->i_sb->s_dev, 527
518 inode->i_ino, 0); 528 switch (event->data_type) {
519 else if (mask & (IN_DELETE|IN_MOVED_FROM)) 529 case (FSNOTIFY_EVENT_PATH):
530 inode = event->path.dentry->d_inode;
531 break;
532 case (FSNOTIFY_EVENT_INODE):
533 inode = event->inode;
534 break;
535 default:
536 BUG();
537 inode = NULL;
538 break;
539 };
540
541 if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
542 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
543 else if (mask & (FS_DELETE|FS_MOVED_FROM))
520 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); 544 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
521 /* inotify automatically removes the watch and sends IN_IGNORED */ 545 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
522 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
523 audit_remove_parent_watches(parent);
524 /* inotify does not remove the watch, so remove it manually */
525 else if(mask & IN_MOVE_SELF) {
526 audit_remove_parent_watches(parent); 546 audit_remove_parent_watches(parent);
527 inotify_remove_watch_locked(audit_ih, i_watch); 547
528 } else if (mask & IN_IGNORED) 548 return 0;
529 put_inotify_watch(i_watch);
530} 549}
531 550
532static const struct inotify_operations audit_inotify_ops = { 551static const struct fsnotify_ops audit_watch_fsnotify_ops = {
533 .handle_event = audit_handle_ievent, 552 .should_send_event = audit_watch_should_send_event,
534 .destroy_watch = audit_free_parent, 553 .handle_event = audit_watch_handle_event,
554 .free_group_priv = NULL,
555 .freeing_mark = NULL,
556 .free_event_priv = NULL,
535}; 557};
536 558
537static int __init audit_watch_init(void) 559static int __init audit_watch_init(void)
538{ 560{
539 audit_ih = inotify_init(&audit_inotify_ops); 561 audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
540 if (IS_ERR(audit_ih)) 562 if (IS_ERR(audit_watch_group)) {
541 audit_panic("cannot initialize inotify handle"); 563 audit_watch_group = NULL;
564 audit_panic("cannot create audit fsnotify group");
565 }
542 return 0; 566 return 0;
543} 567}
544subsys_initcall(audit_watch_init); 568device_initcall(audit_watch_init);
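In the new audit_watch_handle_event() above, the handler must recover the inode from the generic fsnotify_event, which carries either a struct path or a bare inode depending on data_type, and then dispatch on the FS_* mask bits. A trimmed sketch of that dispatch, mirroring the handler in the patch; demo_handle_event and the pr_info calls are illustrative, not the patch's code:

```c
#include <linux/fs.h>
#include <linux/fsnotify_backend.h>
#include <linux/kernel.h>

/* Minimal fsnotify handler: pick the inode out of the event, then act
 * on the mask bits we care about. */
static int demo_handle_event(struct fsnotify_group *group,
			     struct fsnotify_mark *inode_mark,
			     struct fsnotify_mark *vfsmount_mark,
			     struct fsnotify_event *event)
{
	struct inode *inode;

	switch (event->data_type) {
	case FSNOTIFY_EVENT_PATH:
		inode = event->path.dentry->d_inode;
		break;
	case FSNOTIFY_EVENT_INODE:
		inode = event->inode;
		break;
	default:
		WARN_ON(1);
		return 0;
	}

	if (event->mask & (FS_CREATE | FS_MOVED_TO))
		pr_info("child created or moved in, ino %lu\n",
			inode ? inode->i_ino : 0UL);
	else if (event->mask & (FS_DELETE | FS_MOVED_FROM))
		pr_info("child deleted or moved away\n");

	return 0;
}
```

The handler is wired up through a struct fsnotify_ops passed to fsnotify_alloc_group(), exactly as the patch does with audit_watch_fsnotify_ops.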
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
71{ 71{
72 int i; 72 int i;
73 struct audit_krule *erule = &e->rule; 73 struct audit_krule *erule = &e->rule;
74
74 /* some rules don't have associated watches */ 75 /* some rules don't have associated watches */
75 if (erule->watch) 76 if (erule->watch)
76 audit_put_watch(erule->watch); 77 audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
746 * rule with the new rule in the filterlist, then free the old rule. 747 * rule with the new rule in the filterlist, then free the old rule.
747 * The rlist element is undefined; list manipulations are handled apart from 748 * The rlist element is undefined; list manipulations are handled apart from
748 * the initial copy. */ 749 * the initial copy. */
749struct audit_entry *audit_dupe_rule(struct audit_krule *old, 750struct audit_entry *audit_dupe_rule(struct audit_krule *old)
750 struct audit_watch *watch)
751{ 751{
752 u32 fcount = old->field_count; 752 u32 fcount = old->field_count;
753 struct audit_entry *entry; 753 struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
769 new->prio = old->prio; 769 new->prio = old->prio;
770 new->buflen = old->buflen; 770 new->buflen = old->buflen;
771 new->inode_f = old->inode_f; 771 new->inode_f = old->inode_f;
772 new->watch = NULL;
773 new->field_count = old->field_count; 772 new->field_count = old->field_count;
773
774 /* 774 /*
775 * note that we are OK with not refcounting here; audit_match_tree() 775 * note that we are OK with not refcounting here; audit_match_tree()
776 * never dereferences tree and we can't get false positives there 776 * never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
811 } 811 }
812 } 812 }
813 813
814 if (watch) { 814 if (old->watch) {
815 audit_get_watch(watch); 815 audit_get_watch(old->watch);
816 new->watch = watch; 816 new->watch = old->watch;
817 } 817 }
818 818
819 return entry; 819 return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
866 struct audit_watch *watch = entry->rule.watch; 866 struct audit_watch *watch = entry->rule.watch;
867 struct audit_tree *tree = entry->rule.tree; 867 struct audit_tree *tree = entry->rule.tree;
868 struct list_head *list; 868 struct list_head *list;
869 int h, err; 869 int err;
870#ifdef CONFIG_AUDITSYSCALL 870#ifdef CONFIG_AUDITSYSCALL
871 int dont_count = 0; 871 int dont_count = 0;
872 872
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
889 889
890 if (watch) { 890 if (watch) {
891 /* audit_filter_mutex is dropped and re-taken during this call */ 891 /* audit_filter_mutex is dropped and re-taken during this call */
892 err = audit_add_watch(&entry->rule); 892 err = audit_add_watch(&entry->rule, &list);
893 if (err) { 893 if (err) {
894 mutex_unlock(&audit_filter_mutex); 894 mutex_unlock(&audit_filter_mutex);
895 goto error; 895 goto error;
896 } 896 }
897 /* entry->rule.watch may have changed during audit_add_watch() */
898 watch = entry->rule.watch;
899 h = audit_hash_ino((u32)audit_watch_inode(watch));
900 list = &audit_inode_hash[h];
901 } 897 }
902 if (tree) { 898 if (tree) {
903 err = audit_add_tree_rule(&entry->rule); 899 err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
949 struct audit_watch *watch = entry->rule.watch; 945 struct audit_watch *watch = entry->rule.watch;
950 struct audit_tree *tree = entry->rule.tree; 946 struct audit_tree *tree = entry->rule.tree;
951 struct list_head *list; 947 struct list_head *list;
952 LIST_HEAD(inotify_list);
953 int ret = 0; 948 int ret = 0;
954#ifdef CONFIG_AUDITSYSCALL 949#ifdef CONFIG_AUDITSYSCALL
955 int dont_count = 0; 950 int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
969 } 964 }
970 965
971 if (e->rule.watch) 966 if (e->rule.watch)
972 audit_remove_watch_rule(&e->rule, &inotify_list); 967 audit_remove_watch_rule(&e->rule);
973 968
974 if (e->rule.tree) 969 if (e->rule.tree)
975 audit_remove_tree_rule(&e->rule); 970 audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
987#endif 982#endif
988 mutex_unlock(&audit_filter_mutex); 983 mutex_unlock(&audit_filter_mutex);
989 984
990 if (!list_empty(&inotify_list))
991 audit_inotify_unregister(&inotify_list);
992
993out: 985out:
994 if (watch) 986 if (watch)
995 audit_put_watch(watch); /* match initial get */ 987 audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
1323{ 1315{
1324 struct audit_entry *entry = container_of(r, struct audit_entry, rule); 1316 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1325 struct audit_entry *nentry; 1317 struct audit_entry *nentry;
1326 struct audit_watch *watch;
1327 struct audit_tree *tree;
1328 int err = 0; 1318 int err = 0;
1329 1319
1330 if (!security_audit_rule_known(r)) 1320 if (!security_audit_rule_known(r))
1331 return 0; 1321 return 0;
1332 1322
1333 watch = r->watch; 1323 nentry = audit_dupe_rule(r);
1334 tree = r->tree;
1335 nentry = audit_dupe_rule(r, watch);
1336 if (IS_ERR(nentry)) { 1324 if (IS_ERR(nentry)) {
1337 /* save the first error encountered for the 1325 /* save the first error encountered for the
1338 * return value */ 1326 * return value */
1339 err = PTR_ERR(nentry); 1327 err = PTR_ERR(nentry);
1340 audit_panic("error updating LSM filters"); 1328 audit_panic("error updating LSM filters");
1341 if (watch) 1329 if (r->watch)
1342 list_del(&r->rlist); 1330 list_del(&r->rlist);
1343 list_del_rcu(&entry->list); 1331 list_del_rcu(&entry->list);
1344 list_del(&r->list); 1332 list_del(&r->list);
1345 } else { 1333 } else {
1346 if (watch) { 1334 if (r->watch || r->tree)
1347 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1348 list_del(&r->rlist);
1349 } else if (tree)
1350 list_replace_init(&r->rlist, &nentry->rule.rlist); 1335 list_replace_init(&r->rlist, &nentry->rule.rlist);
1351 list_replace_rcu(&entry->list, &nentry->list); 1336 list_replace_rcu(&entry->list, &nentry->list);
1352 list_replace(&r->list, &nentry->rule.list); 1337 list_replace(&r->list, &nentry->rule.list);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..1b31c130d034 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
65#include <linux/binfmts.h> 65#include <linux/binfmts.h>
66#include <linux/highmem.h> 66#include <linux/highmem.h>
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/inotify.h>
69#include <linux/capability.h> 68#include <linux/capability.h>
70#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
71 70
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
549 } 548 }
550 break; 549 break;
551 case AUDIT_WATCH: 550 case AUDIT_WATCH:
552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) 551 if (name)
553 result = (name->dev == audit_watch_dev(rule->watch) && 552 result = audit_watch_compare(rule->watch, name->ino, name->dev);
554 name->ino == audit_watch_inode(rule->watch));
555 break; 553 break;
556 case AUDIT_DIR: 554 case AUDIT_DIR:
557 if (ctx) 555 if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
1726 struct audit_tree_refs *p; 1724 struct audit_tree_refs *p;
1727 struct audit_chunk *chunk; 1725 struct audit_chunk *chunk;
1728 int count; 1726 int count;
1729 if (likely(list_empty(&inode->inotify_watches))) 1727 if (likely(hlist_empty(&inode->i_fsnotify_marks)))
1730 return; 1728 return;
1731 context = current->audit_context; 1729 context = current->audit_context;
1732 p = context->trees; 1730 p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
1769 seq = read_seqbegin(&rename_lock); 1767 seq = read_seqbegin(&rename_lock);
1770 for(;;) { 1768 for(;;) {
1771 struct inode *inode = d->d_inode; 1769 struct inode *inode = d->d_inode;
1772 if (inode && unlikely(!list_empty(&inode->inotify_watches))) { 1770 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
1773 struct audit_chunk *chunk; 1771 struct audit_chunk *chunk;
1774 chunk = audit_tree_lookup(inode); 1772 chunk = audit_tree_lookup(inode);
1775 if (chunk) { 1773 if (chunk) {
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name)
1837 context->names[context->name_count].ino = (unsigned long)-1; 1835 context->names[context->name_count].ino = (unsigned long)-1;
1838 context->names[context->name_count].osid = 0; 1836 context->names[context->name_count].osid = 0;
1839 ++context->name_count; 1837 ++context->name_count;
1840 if (!context->pwd.dentry) { 1838 if (!context->pwd.dentry)
1841 read_lock(&current->fs->lock); 1839 get_fs_pwd(current->fs, &context->pwd);
1842 context->pwd = current->fs->pwd;
1843 path_get(&current->fs->pwd);
1844 read_unlock(&current->fs->lock);
1845 }
1846
1847} 1840}
1848 1841
1849/* audit_putname - intercept a putname request 1842/* audit_putname - intercept a putname request
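The __audit_getname() simplification above depends on a get_fs_pwd() helper that takes a reference to the working directory under fs->lock. A plausible sketch of that helper, assuming it mirrors the fs_struct accessors of this series (fs->lock becomes a spinlock in the kernel/fork.c hunk further down); the real helper is expected to live in a shared header, not in this patch:

static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
{
	spin_lock(&fs->lock);
	*pwd = fs->pwd;
	path_get(pwd);
	spin_unlock(&fs->lock);
}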
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a8ce09954404..c9483d8f6140 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1102,7 +1102,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1102 if (opts->release_agent) 1102 if (opts->release_agent)
1103 return -EINVAL; 1103 return -EINVAL;
1104 opts->release_agent = 1104 opts->release_agent =
1105 kstrndup(token + 14, PATH_MAX, GFP_KERNEL); 1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1106 if (!opts->release_agent)
1107 return -ENOMEM; 1107 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
@@ -1123,7 +1123,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123 if (opts->name) 1123 if (opts->name)
1124 return -EINVAL; 1124 return -EINVAL;
1125 opts->name = kstrndup(name, 1125 opts->name = kstrndup(name,
1126 MAX_CGROUP_ROOT_NAMELEN, 1126 MAX_CGROUP_ROOT_NAMELEN - 1,
1127 GFP_KERNEL); 1127 GFP_KERNEL);
1128 if (!opts->name) 1128 if (!opts->name)
1129 return -ENOMEM; 1129 return -ENOMEM;
@@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = {
1623 .kill_sb = cgroup_kill_sb, 1623 .kill_sb = cgroup_kill_sb,
1624}; 1624};
1625 1625
1626static struct kobject *cgroup_kobj;
1627
1626static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1628static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1627{ 1629{
1628 return dentry->d_fsdata; 1630 return dentry->d_fsdata;
@@ -1789,19 +1791,20 @@ out:
1789} 1791}
1790 1792
1791/** 1793/**
1792 * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup 1794 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1795 * @from: attach to all cgroups of a given task
1793 * @tsk: the task to be attached 1796 * @tsk: the task to be attached
1794 */ 1797 */
1795int cgroup_attach_task_current_cg(struct task_struct *tsk) 1798int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1796{ 1799{
1797 struct cgroupfs_root *root; 1800 struct cgroupfs_root *root;
1798 struct cgroup *cur_cg;
1799 int retval = 0; 1801 int retval = 0;
1800 1802
1801 cgroup_lock(); 1803 cgroup_lock();
1802 for_each_active_root(root) { 1804 for_each_active_root(root) {
1803 cur_cg = task_cgroup_from_root(current, root); 1805 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1804 retval = cgroup_attach_task(cur_cg, tsk); 1806
1807 retval = cgroup_attach_task(from_cg, tsk);
1805 if (retval) 1808 if (retval)
1806 break; 1809 break;
1807 } 1810 }
@@ -1809,7 +1812,7 @@ int cgroup_attach_task_current_cg(struct task_struct *tsk)
1809 1812
1810 return retval; 1813 return retval;
1811} 1814}
1812EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); 1815EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1813 1816
1814/* 1817/*
1815 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1818 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
@@ -3894,9 +3897,18 @@ int __init cgroup_init(void)
3894 hhead = css_set_hash(init_css_set.subsys); 3897 hhead = css_set_hash(init_css_set.subsys);
3895 hlist_add_head(&init_css_set.hlist, hhead); 3898 hlist_add_head(&init_css_set.hlist, hhead);
3896 BUG_ON(!init_root_id(&rootnode)); 3899 BUG_ON(!init_root_id(&rootnode));
3900
3901 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
3902 if (!cgroup_kobj) {
3903 err = -ENOMEM;
3904 goto out;
3905 }
3906
3897 err = register_filesystem(&cgroup_fs_type); 3907 err = register_filesystem(&cgroup_fs_type);
3898 if (err < 0) 3908 if (err < 0) {
3909 kobject_put(cgroup_kobj);
3899 goto out; 3910 goto out;
3911 }
3900 3912
3901 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 3913 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
3902 3914
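The rename above generalizes the helper: cgroup_attach_task_all(from, tsk) copies the cgroup membership of an arbitrary task rather than of current. A hypothetical caller (names invented for the sketch) that adopts a freshly created worker thread into its owner's cgroups would do:

/* Illustrative caller; 'owner' and 'worker' are placeholder names. */
static int example_adopt_worker(struct task_struct *owner,
				struct task_struct *worker)
{
	/* Attach 'worker' to 'owner''s cgroup in every mounted hierarchy. */
	return cgroup_attach_task_all(owner, worker);
}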
diff --git a/kernel/compat.c b/kernel/compat.c
index 5adab05a3172..c9e2ec0b34a8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 279 struct compat_rlimit __user *rlim)
280{ 280{
281 struct rlimit r; 281 struct rlimit r;
282 int ret;
283 mm_segment_t old_fs = get_fs ();
284
285 if (resource >= RLIM_NLIMITS)
286 return -EINVAL;
287 282
288 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || 283 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
289 __get_user(r.rlim_cur, &rlim->rlim_cur) || 284 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
294 r.rlim_cur = RLIM_INFINITY; 289 r.rlim_cur = RLIM_INFINITY;
295 if (r.rlim_max == COMPAT_RLIM_INFINITY) 290 if (r.rlim_max == COMPAT_RLIM_INFINITY)
296 r.rlim_max = RLIM_INFINITY; 291 r.rlim_max = RLIM_INFINITY;
297 set_fs(KERNEL_DS); 292 return do_prlimit(current, resource, &r, NULL);
298 ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
299 set_fs(old_fs);
300 return ret;
301} 293}
302 294
303#ifdef COMPAT_RLIM_OLD_INFINITY 295#ifdef COMPAT_RLIM_OLD_INFINITY
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
329 321
330#endif 322#endif
331 323
332asmlinkage long compat_sys_getrlimit (unsigned int resource, 324asmlinkage long compat_sys_getrlimit(unsigned int resource,
333 struct compat_rlimit __user *rlim) 325 struct compat_rlimit __user *rlim)
334{ 326{
335 struct rlimit r; 327 struct rlimit r;
336 int ret; 328 int ret;
337 mm_segment_t old_fs = get_fs();
338 329
339 set_fs(KERNEL_DS); 330 ret = do_prlimit(current, resource, NULL, &r);
340 ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
341 set_fs(old_fs);
342 if (!ret) { 331 if (!ret) {
343 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 332 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
344 r.rlim_cur = COMPAT_RLIM_INFINITY; 333 r.rlim_cur = COMPAT_RLIM_INFINITY;
@@ -1137,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1137 1126
1138 return 0; 1127 return 0;
1139} 1128}
1129
1130/*
1131 * Allocate user-space memory for the duration of a single system call,
1132 * in order to marshall parameters inside a compat thunk.
1133 */
1134void __user *compat_alloc_user_space(unsigned long len)
1135{
1136 void __user *ptr;
1137
1138 /* If len would occupy more than half of the entire compat space... */
1139 if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
1140 return NULL;
1141
1142 ptr = arch_compat_alloc_user_space(len);
1143
1144 if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
1145 return NULL;
1146
1147 return ptr;
1148}
1149EXPORT_SYMBOL_GPL(compat_alloc_user_space);
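compat_alloc_user_space(), added above, gives compat entry points a scratch area in user space in which to rebuild native-layout arguments before handing off to the regular implementation. A hedged sketch of the usual marshalling pattern follows; struct native_args and example_compat_call() are placeholders, not functions from this patch.

struct native_args {
	unsigned long a;
	unsigned long b;
};

static long example_compat_call(compat_ulong_t a, compat_ulong_t b)
{
	struct native_args __user *args;

	args = compat_alloc_user_space(sizeof(*args));
	if (!args)
		return -EFAULT;
	/* Widen the 32-bit arguments into the native layout. */
	if (put_user((unsigned long)a, &args->a) ||
	    put_user((unsigned long)b, &args->b))
		return -EFAULT;
	/* ...then call the native handler with 'args'... */
	return 0;
}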
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
235 return -EINVAL; 235 return -EINVAL;
236 236
237 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
238 set_cpu_active(cpu, false);
239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
240 if (err) { 239 if (err) {
241 set_cpu_active(cpu, true);
242
243 nr_calls--; 240 nr_calls--;
244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 241 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
245 printk("%s: attempt to take down CPU %u failed\n", 242 printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 246
250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 247 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
251 if (err) { 248 if (err) {
252 set_cpu_active(cpu, true);
253 /* CPU didn't die: tell everyone. Can't complain. */ 249 /* CPU didn't die: tell everyone. Can't complain. */
254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 250 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
255 251
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 goto out_notify; 317 goto out_notify;
322 BUG_ON(!cpu_online(cpu)); 318 BUG_ON(!cpu_online(cpu));
323 319
324 set_cpu_active(cpu, true);
325
326 /* Now call notifier in preparation. */ 320 /* Now call notifier in preparation. */
327 cpu_notify(CPU_ONLINE | mod, hcpu); 321 cpu_notify(CPU_ONLINE | mod, hcpu);
328 322
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7cb37d86a005..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2113 * but making no active use of cpusets. 2113 * but making no active use of cpusets.
2114 * 2114 *
2115 * This routine ensures that top_cpuset.cpus_allowed tracks 2115 * This routine ensures that top_cpuset.cpus_allowed tracks
2116 * cpu_online_map on each CPU hotplug (cpuhp) event. 2116 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2117 * 2117 *
2118 * Called within get_online_cpus(). Needs to call cgroup_lock() 2118 * Called within get_online_cpus(). Needs to call cgroup_lock()
2119 * before calling generate_sched_domains(). 2119 * before calling generate_sched_domains().
2120 */ 2120 */
2121static int cpuset_track_online_cpus(struct notifier_block *unused_nb, 2121void cpuset_update_active_cpus(void)
2122 unsigned long phase, void *unused_cpu)
2123{ 2122{
2124 struct sched_domain_attr *attr; 2123 struct sched_domain_attr *attr;
2125 cpumask_var_t *doms; 2124 cpumask_var_t *doms;
2126 int ndoms; 2125 int ndoms;
2127 2126
2128 switch (phase) {
2129 case CPU_ONLINE:
2130 case CPU_ONLINE_FROZEN:
2131 case CPU_DOWN_PREPARE:
2132 case CPU_DOWN_PREPARE_FROZEN:
2133 case CPU_DOWN_FAILED:
2134 case CPU_DOWN_FAILED_FROZEN:
2135 break;
2136
2137 default:
2138 return NOTIFY_DONE;
2139 }
2140
2141 cgroup_lock(); 2127 cgroup_lock();
2142 mutex_lock(&callback_mutex); 2128 mutex_lock(&callback_mutex);
2143 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2129 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2148 2134
2149 /* Have scheduler rebuild the domains */ 2135 /* Have scheduler rebuild the domains */
2150 partition_sched_domains(ndoms, doms, attr); 2136 partition_sched_domains(ndoms, doms, attr);
2151
2152 return NOTIFY_OK;
2153} 2137}
2154 2138
2155#ifdef CONFIG_MEMORY_HOTPLUG 2139#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
2203 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2187 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2204 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2188 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2205 2189
2206 hotcpu_notifier(cpuset_track_online_cpus, 0);
2207 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2190 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2208 2191
2209 cpuset_wq = create_singlethread_workqueue("cpuset"); 2192 cpuset_wq = create_singlethread_workqueue("cpuset");
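With its private notifier removed above, cpuset relies on being called back as cpuset_update_active_cpus() from the CPU hotplug path; the scheduler side of that wiring is not part of this section. A sketch of such a caller, assuming a conventional hotcpu notifier:

/* Assumed caller; the real hook sits in the scheduler's hotplug code. */
static int example_cpuset_hotplug(struct notifier_block *nb,
				  unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_FAILED:
		cpuset_update_active_cpus();
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}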
diff --git a/kernel/cred.c b/kernel/cred.c
index 60bc8b1e32e6..9a3e22641fe7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -22,10 +22,6 @@
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else 24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \ 25#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 26 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif 27#endif
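The kdebug() fallback above now depends on a shared no_printk() stub instead of the local one being deleted. A minimal version of what such a stub is assumed to look like (normally provided by a common kernel header), preserving printf-style format checking while compiling to nothing:

static inline __attribute__ ((format (printf, 1, 2)))
int no_printk(const char *fmt, ...)
{
	return 0;
}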
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 51d14fe87648..de407c78178d 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -605,6 +605,8 @@ cpu_master_loop:
605 if (dbg_kdb_mode) { 605 if (dbg_kdb_mode) {
606 kgdb_connected = 1; 606 kgdb_connected = 1;
607 error = kdb_stub(ks); 607 error = kdb_stub(ks);
608 if (error == -1)
609 continue;
608 kgdb_connected = 0; 610 kgdb_connected = 0;
609 } else { 611 } else {
610 error = gdb_serial_stub(ks); 612 error = gdb_serial_stub(ks);
@@ -739,7 +741,7 @@ static struct console kgdbcons = {
739}; 741};
740 742
741#ifdef CONFIG_MAGIC_SYSRQ 743#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty) 744static void sysrq_handle_dbg(int key)
743{ 745{
744 if (!dbg_io_ops) { 746 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 747 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 6e81fd59566b..481a7bd2dfe7 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES +
52 * GDB remote protocol parser: 52 * GDB remote protocol parser:
53 */ 53 */
54 54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB 55#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void) 56static int gdbstub_read_wait(void)
68{ 57{
@@ -123,8 +112,8 @@ static void get_packet(char *buffer)
123 buffer[count] = 0; 112 buffer[count] = 0;
124 113
125 if (ch == '#') { 114 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4; 115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait()); 116 xmitcsum += hex_to_bin(gdbstub_read_wait());
128 117
129 if (checksum != xmitcsum) 118 if (checksum != xmitcsum)
130 /* failed checksum */ 119 /* failed checksum */
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
236 * buf. Return a pointer to the last char put in buf (null). May 225 * buf. Return a pointer to the last char put in buf (null). May
237 * return an error. 226 * return an error.
238 */ 227 */
239int kgdb_mem2hex(char *mem, char *buf, int count) 228char *kgdb_mem2hex(char *mem, char *buf, int count)
240{ 229{
241 char *tmp; 230 char *tmp;
242 int err; 231 int err;
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
248 tmp = buf + count; 237 tmp = buf + count;
249 238
250 err = probe_kernel_read(tmp, mem, count); 239 err = probe_kernel_read(tmp, mem, count);
251 if (!err) { 240 if (err)
252 while (count > 0) { 241 return NULL;
253 buf = pack_hex_byte(buf, *tmp); 242 while (count > 0) {
254 tmp++; 243 buf = pack_hex_byte(buf, *tmp);
255 count--; 244 tmp++;
256 } 245 count--;
257
258 *buf = 0;
259 } 246 }
247 *buf = 0;
260 248
261 return err; 249 return buf;
262} 250}
263 251
264/* 252/*
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
280 tmp_hex = tmp_raw - 1; 268 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) { 269 while (tmp_hex >= buf) {
282 tmp_raw--; 270 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--); 271 *tmp_raw = hex_to_bin(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4; 272 *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
285 } 273 }
286 274
287 return probe_kernel_write(mem, tmp_raw, count); 275 return probe_kernel_write(mem, tmp_raw, count);
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
304 (*ptr)++; 292 (*ptr)++;
305 } 293 }
306 while (**ptr) { 294 while (**ptr) {
307 hex_val = hex(**ptr); 295 hex_val = hex_to_bin(**ptr);
308 if (hex_val < 0) 296 if (hex_val < 0)
309 break; 297 break;
310 298
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
339 return probe_kernel_write(mem, c, size); 327 return probe_kernel_write(mem, c, size);
340} 328}
341 329
330#if DBG_MAX_REG_NUM > 0
331void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
332{
333 int i;
334 int idx = 0;
335 char *ptr = (char *)gdb_regs;
336
337 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
338 dbg_get_reg(i, ptr + idx, regs);
339 idx += dbg_reg_def[i].size;
340 }
341}
342
343void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
344{
345 int i;
346 int idx = 0;
347 char *ptr = (char *)gdb_regs;
348
349 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
350 dbg_set_reg(i, ptr + idx, regs);
351 idx += dbg_reg_def[i].size;
352 }
353}
354#endif /* DBG_MAX_REG_NUM > 0 */
355
342/* Write memory due to an 'M' or 'X' packet. */ 356/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary) 357static int write_mem_msg(int binary)
344{ 358{
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error)
378 * remapped to negative TIDs. 392 * remapped to negative TIDs.
379 */ 393 */
380 394
381#define BUF_THREAD_ID_SIZE 16 395#define BUF_THREAD_ID_SIZE 8
382 396
383static char *pack_threadid(char *pkt, unsigned char *id) 397static char *pack_threadid(char *pkt, unsigned char *id)
384{ 398{
385 char *limit; 399 unsigned char *limit;
400 int lzero = 1;
401
402 limit = id + (BUF_THREAD_ID_SIZE / 2);
403 while (id < limit) {
404 if (!lzero || *id != 0) {
405 pkt = pack_hex_byte(pkt, *id);
406 lzero = 0;
407 }
408 id++;
409 }
386 410
387 limit = pkt + BUF_THREAD_ID_SIZE; 411 if (lzero)
388 while (pkt < limit) 412 pkt = pack_hex_byte(pkt, 0);
389 pkt = pack_hex_byte(pkt, *id++);
390 413
391 return pkt; 414 return pkt;
392} 415}
393 416
394static void int_to_threadref(unsigned char *id, int value) 417static void int_to_threadref(unsigned char *id, int value)
395{ 418{
396 unsigned char *scan; 419 put_unaligned_be32(value, id);
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403} 420}
404 421
405static struct task_struct *getthread(struct pt_regs *regs, int tid) 422static struct task_struct *getthread(struct pt_regs *regs, int tid)
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 480 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464} 481}
465 482
466/* Handle the 'g' get registers request */ 483static void gdb_get_regs_helper(struct kgdb_state *ks)
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{ 484{
469 struct task_struct *thread; 485 struct task_struct *thread;
470 void *local_debuggerinfo; 486 void *local_debuggerinfo;
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
505 */ 521 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread); 522 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 } 523 }
524}
525
526/* Handle the 'g' get registers request */
527static void gdb_cmd_getregs(struct kgdb_state *ks)
528{
529 gdb_get_regs_helper(ks);
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); 530 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509} 531}
510 532
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
527 char *ptr = &remcom_in_buffer[1]; 549 char *ptr = &remcom_in_buffer[1];
528 unsigned long length; 550 unsigned long length;
529 unsigned long addr; 551 unsigned long addr;
530 int err; 552 char *err;
531 553
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && 554 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) { 555 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); 556 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err) 557 if (!err)
536 error_packet(remcom_out_buffer, err); 558 error_packet(remcom_out_buffer, -EINVAL);
537 } else { 559 } else {
538 error_packet(remcom_out_buffer, -EINVAL); 560 error_packet(remcom_out_buffer, -EINVAL);
539 } 561 }
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
550 strcpy(remcom_out_buffer, "OK"); 572 strcpy(remcom_out_buffer, "OK");
551} 573}
552 574
575#if DBG_MAX_REG_NUM > 0
576static char *gdb_hex_reg_helper(int regnum, char *out)
577{
578 int i;
579 int offset = 0;
580
581 for (i = 0; i < regnum; i++)
582 offset += dbg_reg_def[i].size;
583 return kgdb_mem2hex((char *)gdb_regs + offset, out,
584 dbg_reg_def[i].size);
585}
586
587/* Handle the 'p' individual register get */
588static void gdb_cmd_reg_get(struct kgdb_state *ks)
589{
590 unsigned long regnum;
591 char *ptr = &remcom_in_buffer[1];
592
593 kgdb_hex2long(&ptr, &regnum);
594 if (regnum >= DBG_MAX_REG_NUM) {
595 error_packet(remcom_out_buffer, -EINVAL);
596 return;
597 }
598 gdb_get_regs_helper(ks);
599 gdb_hex_reg_helper(regnum, remcom_out_buffer);
600}
601
602/* Handle the 'P' individual register set */
603static void gdb_cmd_reg_set(struct kgdb_state *ks)
604{
605 unsigned long regnum;
606 char *ptr = &remcom_in_buffer[1];
607 int i = 0;
608
609 kgdb_hex2long(&ptr, &regnum);
610 if (*ptr++ != '=' ||
611 !(!kgdb_usethread || kgdb_usethread == current) ||
612 !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
613 error_packet(remcom_out_buffer, -EINVAL);
614 return;
615 }
616 memset(gdb_regs, 0, sizeof(gdb_regs));
617 while (i < sizeof(gdb_regs) * 2)
618 if (hex_to_bin(ptr[i]) >= 0)
619 i++;
620 else
621 break;
622 i = i / 2;
623 kgdb_hex2mem(ptr, (char *)gdb_regs, i);
624 dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
625 strcpy(remcom_out_buffer, "OK");
626}
627#endif /* DBG_MAX_REG_NUM > 0 */
628
553/* Handle the 'X' memory binary write bytes */ 629/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks) 630static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{ 631{
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
612{ 688{
613 struct task_struct *g; 689 struct task_struct *g;
614 struct task_struct *p; 690 struct task_struct *p;
615 unsigned char thref[8]; 691 unsigned char thref[BUF_THREAD_ID_SIZE];
616 char *ptr; 692 char *ptr;
617 int i; 693 int i;
618 int cpu; 694 int cpu;
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
632 for_each_online_cpu(cpu) { 708 for_each_online_cpu(cpu) {
633 ks->thr_query = 0; 709 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2); 710 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref); 711 ptr = pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ','; 712 *(ptr++) = ',';
638 i++; 713 i++;
639 } 714 }
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
642 do_each_thread(g, p) { 717 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) { 718 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid); 719 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref); 720 ptr = pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ','; 721 *(ptr++) = ',';
648 ks->thr_query++; 722 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) 723 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
858 int error = 0; 932 int error = 0;
859 int tmp; 933 int tmp;
860 934
861 /* Clear the out buffer. */ 935 /* Initialize comm buffer and globals. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); 936 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
937 kgdb_usethread = kgdb_info[ks->cpu].task;
938 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
939 ks->pass_exception = 0;
863 940
864 if (kgdb_connected) { 941 if (kgdb_connected) {
865 unsigned char thref[8]; 942 unsigned char thref[BUF_THREAD_ID_SIZE];
866 char *ptr; 943 char *ptr;
867 944
868 /* Reply to host that an exception has occurred */ 945 /* Reply to host that an exception has occurred */
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
876 put_packet(remcom_out_buffer); 953 put_packet(remcom_out_buffer);
877 } 954 }
878 955
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) { 956 while (1) {
884 error = 0; 957 error = 0;
885 958
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 977 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks); 978 gdb_cmd_memwrite(ks);
906 break; 979 break;
980#if DBG_MAX_REG_NUM > 0
981 case 'p': /* pXX Return gdb register XX (in hex) */
982 gdb_cmd_reg_get(ks);
983 break;
984 case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
985 gdb_cmd_reg_set(ks);
986 break;
987#endif /* DBG_MAX_REG_NUM > 0 */
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ 988 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks); 989 gdb_cmd_binwrite(ks);
909 break; 990 break;
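The new 'p'/'P' packets and the pt_regs conversion helpers above iterate a per-architecture dbg_reg_def[] table through dbg_get_reg()/dbg_set_reg(). Below is a sketch of what an architecture is assumed to provide; the field names follow their usage in the hunk, and the register entries are purely illustrative.

struct dbg_reg_def_t {
	char *name;
	int size;		/* bytes, summed by gdb_hex_reg_helper() */
	int offset;		/* offset into struct pt_regs */
};

struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
	{ "ax", 4, offsetof(struct pt_regs, ax) },
	{ "bx", 4, offsetof(struct pt_regs, bx) },
	/* ...one entry per register exported to gdb... */
};

char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
{
	if (regno < 0 || regno >= DBG_MAX_REG_NUM)
		return NULL;
	memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
	       dbg_reg_def[regno].size);
	return dbg_reg_def[regno].name;
}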
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 75bd9b3ebbb7..20059ef4459a 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv)
274 int i, bpno; 274 int i, bpno;
275 kdb_bp_t *bp, *bp_check; 275 kdb_bp_t *bp, *bp_check;
276 int diag; 276 int diag;
277 int free;
278 char *symname = NULL; 277 char *symname = NULL;
279 long offset = 0ul; 278 long offset = 0ul;
280 int nextarg; 279 int nextarg;
@@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv)
305 /* 304 /*
306 * Find an empty bp structure to allocate 305 * Find an empty bp structure to allocate
307 */ 306 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { 307 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free) 308 if (bp->bp_free)
311 break; 309 break;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..caf057a3de0e 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
312 312
313 if (endp == arg) { 313 if (endp == arg) {
314 /* 314 /*
315 * Try base 16, for us folks too lazy to type the 315 * Also try base 16, for us folks too lazy to type the
316 * leading 0x... 316 * leading 0x...
317 */ 317 */
318 val = simple_strtoul(arg, &endp, 16); 318 val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
325 return 0; 325 return 0;
326} 326}
327 327
328int kdbgetu64arg(const char *arg, u64 *value)
329{
330 char *endp;
331 u64 val;
332
333 val = simple_strtoull(arg, &endp, 0);
334
335 if (endp == arg) {
336
337 val = simple_strtoull(arg, &endp, 16);
338 if (endp == arg)
339 return KDB_BADINT;
340 }
341
342 *value = val;
343
344 return 0;
345}
346
328/* 347/*
329 * kdb_set - This function implements the 'set' command. Alter an 348 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one. 349 * existing environment variable or create a new one.
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
1770 */ 1789 */
1771static int kdb_rd(int argc, const char **argv) 1790static int kdb_rd(int argc, const char **argv)
1772{ 1791{
1773 int diag = kdb_check_regs(); 1792 int len = kdb_check_regs();
1774 if (diag) 1793#if DBG_MAX_REG_NUM > 0
1775 return diag; 1794 int i;
1795 char *rname;
1796 int rsize;
1797 u64 reg64;
1798 u32 reg32;
1799 u16 reg16;
1800 u8 reg8;
1801
1802 if (len)
1803 return len;
1804
1805 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1806 rsize = dbg_reg_def[i].size * 2;
1807 if (rsize > 16)
1808 rsize = 2;
1809 if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
1810 len = 0;
1811 kdb_printf("\n");
1812 }
1813 if (len)
1814 len += kdb_printf(" ");
1815 switch(dbg_reg_def[i].size * 8) {
1816 case 8:
1817 rname = dbg_get_reg(i, &reg8, kdb_current_regs);
1818 if (!rname)
1819 break;
1820 len += kdb_printf("%s: %02x", rname, reg8);
1821 break;
1822 case 16:
1823 rname = dbg_get_reg(i, &reg16, kdb_current_regs);
1824 if (!rname)
1825 break;
1826 len += kdb_printf("%s: %04x", rname, reg16);
1827 break;
1828 case 32:
1829 rname = dbg_get_reg(i, &reg32, kdb_current_regs);
1830 if (!rname)
1831 break;
1832 len += kdb_printf("%s: %08x", rname, reg32);
1833 break;
1834 case 64:
1835 rname = dbg_get_reg(i, &reg64, kdb_current_regs);
1836 if (!rname)
1837 break;
1838 len += kdb_printf("%s: %016llx", rname, reg64);
1839 break;
1840 default:
1841 len += kdb_printf("%s: ??", dbg_reg_def[i].name);
1842 }
1843 }
1844 kdb_printf("\n");
1845#else
1846 if (len)
1847 return len;
1776 1848
1777 kdb_dumpregs(kdb_current_regs); 1849 kdb_dumpregs(kdb_current_regs);
1850#endif
1778 return 0; 1851 return 0;
1779} 1852}
1780 1853
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
1782 * kdb_rm - This function implements the 'rm' (register modify) command. 1855 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents 1856 * rm register-name new-contents
1784 * Remarks: 1857 * Remarks:
1785 * Currently doesn't allow modification of control or 1858 * Allows register modification with the same restrictions as gdb
1786 * debug registers.
1787 */ 1859 */
1788static int kdb_rm(int argc, const char **argv) 1860static int kdb_rm(int argc, const char **argv)
1789{ 1861{
1862#if DBG_MAX_REG_NUM > 0
1790 int diag; 1863 int diag;
1791 int ind = 0; 1864 const char *rname;
1792 unsigned long contents; 1865 int i;
1866 u64 reg64;
1867 u32 reg32;
1868 u16 reg16;
1869 u8 reg8;
1793 1870
1794 if (argc != 2) 1871 if (argc != 2)
1795 return KDB_ARGCOUNT; 1872 return KDB_ARGCOUNT;
1796 /* 1873 /*
1797 * Allow presence or absence of leading '%' symbol. 1874 * Allow presence or absence of leading '%' symbol.
1798 */ 1875 */
1799 if (argv[1][0] == '%') 1876 rname = argv[1];
1800 ind = 1; 1877 if (*rname == '%')
1878 rname++;
1801 1879
1802 diag = kdbgetularg(argv[2], &contents); 1880 diag = kdbgetu64arg(argv[2], &reg64);
1803 if (diag) 1881 if (diag)
1804 return diag; 1882 return diag;
1805 1883
1806 diag = kdb_check_regs(); 1884 diag = kdb_check_regs();
1807 if (diag) 1885 if (diag)
1808 return diag; 1886 return diag;
1887
1888 diag = KDB_BADREG;
1889 for (i = 0; i < DBG_MAX_REG_NUM; i++) {
1890 if (strcmp(rname, dbg_reg_def[i].name) == 0) {
1891 diag = 0;
1892 break;
1893 }
1894 }
1895 if (!diag) {
1896 switch(dbg_reg_def[i].size * 8) {
1897 case 8:
1898 reg8 = reg64;
1899 dbg_set_reg(i, &reg8, kdb_current_regs);
1900 break;
1901 case 16:
1902 reg16 = reg64;
1903 dbg_set_reg(i, &reg16, kdb_current_regs);
1904 break;
1905 case 32:
1906 reg32 = reg64;
1907 dbg_set_reg(i, &reg32, kdb_current_regs);
1908 break;
1909 case 64:
1910 dbg_set_reg(i, &reg64, kdb_current_regs);
1911 break;
1912 }
1913 }
1914 return diag;
1915#else
1809 kdb_printf("ERROR: Register set currently not implemented\n"); 1916 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0; 1917 return 0;
1918#endif
1811} 1919}
1812 1920
1813#if defined(CONFIG_MAGIC_SYSRQ) 1921#if defined(CONFIG_MAGIC_SYSRQ)
@@ -1821,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv)
1821 if (argc != 1) 1929 if (argc != 1)
1822 return KDB_ARGCOUNT; 1930 return KDB_ARGCOUNT;
1823 kdb_trap_printk++; 1931 kdb_trap_printk++;
1824 __handle_sysrq(*argv[1], NULL, 0); 1932 __handle_sysrq(*argv[1], false);
1825 kdb_trap_printk--; 1933 kdb_trap_printk--;
1826 1934
1827 return 0; 1935 return 0;
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val)
2440 */ 2548 */
2441static int kdb_summary(int argc, const char **argv) 2549static int kdb_summary(int argc, const char **argv)
2442{ 2550{
2551 struct timespec now;
2443 struct kdb_tm tm; 2552 struct kdb_tm tm;
2444 struct sysinfo val; 2553 struct sysinfo val;
2445 2554
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv)
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname); 2563 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION)); 2564 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456 2565
2457 kdb_gmtime(&xtime, &tm); 2566 now = __current_kernel_time();
2567 kdb_gmtime(&now, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " 2568 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n", 2569 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, 2570 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..be775f7e81e0 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 144extern int kdb_putword(unsigned long, unsigned long, size_t);
145 145
146extern int kdbgetularg(const char *, unsigned long *); 146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *); 147extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **); 149 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *); 150extern int kdbgetsymval(const char *, kdb_symtab_t *);
@@ -257,7 +255,14 @@ extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val); 255extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void); 257extern void kdb_meminfo_proc_show(void);
258#ifdef CONFIG_KALLSYMS
260extern const char *kdb_walk_kallsyms(loff_t *pos); 259extern const char *kdb_walk_kallsyms(loff_t *pos);
260#else /* ! CONFIG_KALLSYMS */
261static inline const char *kdb_walk_kallsyms(loff_t *pos)
262{
263 return NULL;
264}
265#endif /* ! CONFIG_KALLSYMS */
261extern char *kdb_getstr(char *, size_t, char *); 266extern char *kdb_getstr(char *, size_t, char *);
262 267
263/* Defines for kdb_symbol_print */ 268/* Defines for kdb_symbol_print */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 45344d5c53dd..6b2485dcb050 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) 82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{ 83{
84 int ret = 0; 84 int ret = 0;
85 unsigned long symbolsize; 85 unsigned long symbolsize = 0;
86 unsigned long offset; 86 unsigned long offset = 0;
87#define knt1_size 128 /* must be >= kallsyms table size */ 87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL; 88 char *knt1 = NULL;
89 89
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index dd62f8e714ca..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -134,23 +134,14 @@ unregister:
134 return 0; 134 return 0;
135} 135}
136 136
137int 137int __set_personality(unsigned int personality)
138__set_personality(unsigned int personality)
139{ 138{
140 struct exec_domain *ep, *oep; 139 struct exec_domain *oep = current_thread_info()->exec_domain;
141
142 ep = lookup_exec_domain(personality);
143 if (ep == current_thread_info()->exec_domain) {
144 current->personality = personality;
145 module_put(ep->module);
146 return 0;
147 }
148 140
141 current_thread_info()->exec_domain = lookup_exec_domain(personality);
149 current->personality = personality; 142 current->personality = personality;
150 oep = current_thread_info()->exec_domain;
151 current_thread_info()->exec_domain = ep;
152
153 module_put(oep->module); 143 module_put(oep->module);
144
154 return 0; 145 return 0;
155} 146}
156 147
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 183{
193 unsigned int old = current->personality; 184 unsigned int old = current->personality;
194 185
195 if (personality != 0xffffffff) { 186 if (personality != 0xffffffff)
196 set_personality(personality); 187 set_personality(personality);
197 if (current->personality != personality)
198 return -EINVAL;
199 }
200 188
201 return old; 189 return old;
202} 190}
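After the change above, sys_personality() no longer fails when the requested value cannot be applied; passing 0xffffffff remains the conventional "query only" value. From the caller's side (userspace sketch, glibc wrapper assumed):

#include <sys/personality.h>

/* Returns the current personality without modifying it. */
unsigned int query_personality(void)
{
	return personality(0xffffffff);
}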
diff --git a/kernel/exit.c b/kernel/exit.c
index ceffc67b564a..03120229db28 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father)
771 struct task_struct *p, *n, *reaper; 771 struct task_struct *p, *n, *reaper;
772 LIST_HEAD(dead_children); 772 LIST_HEAD(dead_children);
773 773
774 exit_ptrace(father);
775
776 write_lock_irq(&tasklist_lock); 774 write_lock_irq(&tasklist_lock);
775 /*
776 * Note that exit_ptrace() and find_new_reaper() might
777 * drop tasklist_lock and reacquire it.
778 */
779 exit_ptrace(father);
777 reaper = find_new_reaper(father); 780 reaper = find_new_reaper(father);
778 781
779 list_for_each_entry_safe(p, n, &father->children, sibling) { 782 list_for_each_entry_safe(p, n, &father->children, sibling) {
@@ -1383,8 +1386,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1383 if (!unlikely(wo->wo_flags & WNOWAIT)) 1386 if (!unlikely(wo->wo_flags & WNOWAIT))
1384 *p_code = 0; 1387 *p_code = 0;
1385 1388
1386 /* don't need the RCU readlock here as we're holding a spinlock */ 1389 uid = task_uid(p);
1387 uid = __task_cred(p)->uid;
1388unlock_sig: 1390unlock_sig:
1389 spin_unlock_irq(&p->sighand->siglock); 1391 spin_unlock_irq(&p->sighand->siglock);
1390 if (!exit_code) 1392 if (!exit_code)
@@ -1457,7 +1459,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1457 } 1459 }
1458 if (!unlikely(wo->wo_flags & WNOWAIT)) 1460 if (!unlikely(wo->wo_flags & WNOWAIT))
1459 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1461 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1460 uid = __task_cred(p)->uid; 1462 uid = task_uid(p);
1461 spin_unlock_irq(&p->sighand->siglock); 1463 spin_unlock_irq(&p->sighand->siglock);
1462 1464
1463 pid = task_pid_vnr(p); 1465 pid = task_pid_vnr(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..c445f8cc408d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -300,7 +300,7 @@ out:
300#ifdef CONFIG_MMU 300#ifdef CONFIG_MMU
301static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 301static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
302{ 302{
303 struct vm_area_struct *mpnt, *tmp, **pprev; 303 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
304 struct rb_node **rb_link, *rb_parent; 304 struct rb_node **rb_link, *rb_parent;
305 int retval; 305 int retval;
306 unsigned long charge; 306 unsigned long charge;
@@ -328,6 +328,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (retval) 328 if (retval)
329 goto out; 329 goto out;
330 330
331 prev = NULL;
331 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 332 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
332 struct file *file; 333 struct file *file;
333 334
@@ -355,11 +356,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
355 if (IS_ERR(pol)) 356 if (IS_ERR(pol))
356 goto fail_nomem_policy; 357 goto fail_nomem_policy;
357 vma_set_policy(tmp, pol); 358 vma_set_policy(tmp, pol);
359 tmp->vm_mm = mm;
358 if (anon_vma_fork(tmp, mpnt)) 360 if (anon_vma_fork(tmp, mpnt))
359 goto fail_nomem_anon_vma_fork; 361 goto fail_nomem_anon_vma_fork;
360 tmp->vm_flags &= ~VM_LOCKED; 362 tmp->vm_flags &= ~VM_LOCKED;
361 tmp->vm_mm = mm; 363 tmp->vm_next = tmp->vm_prev = NULL;
362 tmp->vm_next = NULL;
363 file = tmp->vm_file; 364 file = tmp->vm_file;
364 if (file) { 365 if (file) {
365 struct inode *inode = file->f_path.dentry->d_inode; 366 struct inode *inode = file->f_path.dentry->d_inode;
@@ -392,6 +393,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
392 */ 393 */
393 *pprev = tmp; 394 *pprev = tmp;
394 pprev = &tmp->vm_next; 395 pprev = &tmp->vm_next;
396 tmp->vm_prev = prev;
397 prev = tmp;
395 398
396 __vma_link_rb(mm, tmp, rb_link, rb_parent); 399 __vma_link_rb(mm, tmp, rb_link, rb_parent);
397 rb_link = &tmp->vm_rb.rb_right; 400 rb_link = &tmp->vm_rb.rb_right;
@@ -752,13 +755,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
752 struct fs_struct *fs = current->fs; 755 struct fs_struct *fs = current->fs;
753 if (clone_flags & CLONE_FS) { 756 if (clone_flags & CLONE_FS) {
754 /* tsk->fs is already what we want */ 757 /* tsk->fs is already what we want */
755 write_lock(&fs->lock); 758 spin_lock(&fs->lock);
756 if (fs->in_exec) { 759 if (fs->in_exec) {
757 write_unlock(&fs->lock); 760 spin_unlock(&fs->lock);
758 return -EAGAIN; 761 return -EAGAIN;
759 } 762 }
760 fs->users++; 763 fs->users++;
761 write_unlock(&fs->lock); 764 spin_unlock(&fs->lock);
762 return 0; 765 return 0;
763 } 766 }
764 tsk->fs = copy_fs_struct(fs); 767 tsk->fs = copy_fs_struct(fs);
@@ -899,6 +902,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
899 tty_audit_fork(sig); 902 tty_audit_fork(sig);
900 903
901 sig->oom_adj = current->signal->oom_adj; 904 sig->oom_adj = current->signal->oom_adj;
905 sig->oom_score_adj = current->signal->oom_score_adj;
902 906
903 return 0; 907 return 0;
904} 908}
@@ -907,7 +911,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
907{ 911{
908 unsigned long new_flags = p->flags; 912 unsigned long new_flags = p->flags;
909 913
910 new_flags &= ~PF_SUPERPRIV; 914 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
911 new_flags |= PF_FORKNOEXEC; 915 new_flags |= PF_FORKNOEXEC;
912 new_flags |= PF_STARTING; 916 new_flags |= PF_STARTING;
913 p->flags = new_flags; 917 p->flags = new_flags;
@@ -1675,13 +1679,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1675 1679
1676 if (new_fs) { 1680 if (new_fs) {
1677 fs = current->fs; 1681 fs = current->fs;
1678 write_lock(&fs->lock); 1682 spin_lock(&fs->lock);
1679 current->fs = new_fs; 1683 current->fs = new_fs;
1680 if (--fs->users) 1684 if (--fs->users)
1681 new_fs = NULL; 1685 new_fs = NULL;
1682 else 1686 else
1683 new_fs = fs; 1687 new_fs = fs;
1684 write_unlock(&fs->lock); 1688 spin_unlock(&fs->lock);
1685 } 1689 }
1686 1690
1687 if (new_mm) { 1691 if (new_mm) {
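The dup_mmap() hunk above threads a new vm_prev back-pointer through the copied VMA list alongside the existing vm_next/pprev bookkeeping. The same pointer dance in isolation, with placeholder types instead of vm_area_struct:

struct node {
	struct node *next;
	struct node *prev;
};

/* Mirrors the list maintenance in the dup_mmap() loop above. */
static void copy_list(struct node *src, struct node **dst_head)
{
	struct node **pprev = dst_head;	/* where the next copy is hooked in */
	struct node *prev = NULL;	/* most recently appended copy */
	struct node *s, *n;

	for (s = src; s; s = s->next) {
		n = kmalloc(sizeof(*n), GFP_KERNEL);
		if (!n)
			break;
		n->next = NULL;		/* tmp->vm_next = NULL */
		n->prev = prev;		/* tmp->vm_prev = prev */
		*pprev = n;		/* *pprev = tmp */
		pprev = &n->next;	/* pprev = &tmp->vm_next */
		prev = n;		/* prev = tmp */
	}
}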
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a3..f83972b16564 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
33 * @children: child nodes 33 * @children: child nodes
34 * @all: list head for list of all nodes 34 * @all: list head for list of all nodes
35 * @parent: parent node 35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory 36 * @loaded_info: array of pointers to profiling data sets for loaded object
37 * @ghost: when an object file containing profiling data is unloaded we keep a 37 * files.
38 * copy of the profiling data here to allow collecting coverage data 38 * @num_loaded: number of profiling data sets for loaded object files.
39 * for cleanup code. Such a node is called a "ghost". 39 * @unloaded_info: accumulated copy of profiling data sets for unloaded
40 * object files. Used only when gcov_persist=1.
40 * @dentry: main debugfs entry, either a directory or data file 41 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links 42 * @links: associated symbolic links
42 * @name: data file basename 43 * @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
51 struct list_head children; 52 struct list_head children;
52 struct list_head all; 53 struct list_head all;
53 struct gcov_node *parent; 54 struct gcov_node *parent;
54 struct gcov_info *info; 55 struct gcov_info **loaded_info;
55 struct gcov_info *ghost; 56 struct gcov_info *unloaded_info;
56 struct dentry *dentry; 57 struct dentry *dentry;
57 struct dentry **links; 58 struct dentry **links;
59 int num_loaded;
58 char name[0]; 60 char name[0];
59}; 61};
60 62
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
136}; 138};
137 139
138/* 140/*
139 * Return the profiling data set for a given node. This can either be the 141 * Return a profiling data set associated with the given node. This is
140 * original profiling data structure or a duplicate (also called "ghost") 142 * either a data set for a loaded object file or a data set copy in case
141 * in case the associated object file has been unloaded. 143 * all associated object files have been unloaded.
142 */ 144 */
143static struct gcov_info *get_node_info(struct gcov_node *node) 145static struct gcov_info *get_node_info(struct gcov_node *node)
144{ 146{
145 if (node->info) 147 if (node->num_loaded > 0)
146 return node->info; 148 return node->loaded_info[0];
147 149
148 return node->ghost; 150 return node->unloaded_info;
151}
152
153/*
154 * Return a newly allocated profiling data set which contains the sum of
155 * all profiling data associated with the given node.
156 */
157static struct gcov_info *get_accumulated_info(struct gcov_node *node)
158{
159 struct gcov_info *info;
160 int i = 0;
161
162 if (node->unloaded_info)
163 info = gcov_info_dup(node->unloaded_info);
164 else
165 info = gcov_info_dup(node->loaded_info[i++]);
166 if (!info)
167 return NULL;
168 for (; i < node->num_loaded; i++)
169 gcov_info_add(info, node->loaded_info[i]);
170
171 return info;
149} 172}
150 173
151/* 174/*
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
163 mutex_lock(&node_lock); 186 mutex_lock(&node_lock);
164 /* 187 /*
165 * Read from a profiling data copy to minimize reference tracking 188 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access. 189 * complexity and concurrent access and to keep accumulating multiple
190 * profiling data sets associated with one node simple.
167 */ 191 */
168 info = gcov_info_dup(get_node_info(node)); 192 info = get_accumulated_info(node);
169 if (!info) 193 if (!info)
170 goto out_unlock; 194 goto out_unlock;
171 iter = gcov_iter_new(info); 195 iter = gcov_iter_new(info);
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name)
225 return NULL; 249 return NULL;
226} 250}
227 251
252/*
253 * Reset all profiling data associated with the specified node.
254 */
255static void reset_node(struct gcov_node *node)
256{
257 int i;
258
259 if (node->unloaded_info)
260 gcov_info_reset(node->unloaded_info);
261 for (i = 0; i < node->num_loaded; i++)
262 gcov_info_reset(node->loaded_info[i]);
263}
264
228static void remove_node(struct gcov_node *node); 265static void remove_node(struct gcov_node *node);
229 266
230/* 267/*
231 * write() implementation for gcov data files. Reset profiling data for the 268 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is 269 * corresponding file. If all associated object files have been unloaded,
233 * a "ghost" node), remove the debug fs node as well. 270 * remove the debug fs node as well.
234 */ 271 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr, 272static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos) 273 size_t len, loff_t *pos)
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
245 node = get_node_by_name(info->filename); 282 node = get_node_by_name(info->filename);
246 if (node) { 283 if (node) {
247 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost) 285 if (node->num_loaded == 0)
249 remove_node(node); 286 remove_node(node);
250 else 287 else
251 gcov_info_reset(node->info); 288 reset_node(node);
252 } 289 }
253 /* Reset counts for open file. */ 290 /* Reset counts for open file. */
254 gcov_info_reset(info); 291 gcov_info_reset(info);
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
378 INIT_LIST_HEAD(&node->list); 415 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children); 416 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all); 417 INIT_LIST_HEAD(&node->all);
381 node->info = info; 418 if (node->loaded_info) {
419 node->loaded_info[0] = info;
420 node->num_loaded = 1;
421 }
382 node->parent = parent; 422 node->parent = parent;
383 if (name) 423 if (name)
384 strcpy(node->name, name); 424 strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
394 struct gcov_node *node; 434 struct gcov_node *node;
395 435
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); 436 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) { 437 if (!node)
398 pr_warning("out of memory\n"); 438 goto err_nomem;
399 return NULL; 439 if (info) {
440 node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
441 GFP_KERNEL);
442 if (!node->loaded_info)
443 goto err_nomem;
400 } 444 }
401 init_node(node, info, name, parent); 445 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */ 446 /* Differentiate between gcov data file nodes and directory nodes. */
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
416 list_add(&node->all, &all_head); 460 list_add(&node->all, &all_head);
417 461
418 return node; 462 return node;
463
464err_nomem:
465 kfree(node);
466 pr_warning("out of memory\n");
467 return NULL;
419} 468}
420 469
421/* Remove symbolic links associated with node. */ 470/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
441 list_del(&node->all); 490 list_del(&node->all);
442 debugfs_remove(node->dentry); 491 debugfs_remove(node->dentry);
443 remove_links(node); 492 remove_links(node);
444 if (node->ghost) 493 kfree(node->loaded_info);
445 gcov_info_free(node->ghost); 494 if (node->unloaded_info)
495 gcov_info_free(node->unloaded_info);
446 kfree(node); 496 kfree(node);
447} 497}
448 498
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
477 527
478/* 528/*
479 * write() implementation for reset file. Reset all profiling data to zero 529 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes. 530 * and remove nodes for which all associated object files are unloaded.
481 */ 531 */
482static ssize_t reset_write(struct file *file, const char __user *addr, 532static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos) 533 size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
487 mutex_lock(&node_lock); 537 mutex_lock(&node_lock);
488restart: 538restart:
489 list_for_each_entry(node, &all_head, all) { 539 list_for_each_entry(node, &all_head, all) {
490 if (node->info) 540 if (node->num_loaded > 0)
491 gcov_info_reset(node->info); 541 reset_node(node);
492 else if (list_empty(&node->children)) { 542 else if (list_empty(&node->children)) {
493 remove_node(node); 543 remove_node(node);
494 /* Several nodes may have gone - restart loop. */ 544 /* Several nodes may have gone - restart loop. */
@@ -564,37 +614,115 @@ err_remove:
564} 614}
565 615
566/* 616/*
567 * The profiling data set associated with this node is being unloaded. Store a 617 * Associate a profiling data set with an existing node. Needs to be called
568 * copy of the profiling data and turn this node into a "ghost". 618 * with node_lock held.
569 */ 619 */
570static int ghost_node(struct gcov_node *node) 620static void add_info(struct gcov_node *node, struct gcov_info *info)
571{ 621{
572 node->ghost = gcov_info_dup(node->info); 622 struct gcov_info **loaded_info;
573 if (!node->ghost) { 623 int num = node->num_loaded;
574 pr_warning("could not save data for '%s' (out of memory)\n", 624
575 node->info->filename); 625 /*
576 return -ENOMEM; 626 * Prepare new array. This is done first to simplify cleanup in
627 * case the new data set is incompatible, the node only contains
628 * unloaded data sets and there's not enough memory for the array.
629 */
630 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
631 if (!loaded_info) {
632 pr_warning("could not add '%s' (out of memory)\n",
633 info->filename);
634 return;
635 }
636 memcpy(loaded_info, node->loaded_info,
637 num * sizeof(struct gcov_info *));
638 loaded_info[num] = info;
639 /* Check if the new data set is compatible. */
640 if (num == 0) {
641 /*
642 * A module was unloaded, modified and reloaded. The new
643 * data set replaces the copy of the last one.
644 */
645 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
646 pr_warning("discarding saved data for %s "
647 "(incompatible version)\n", info->filename);
648 gcov_info_free(node->unloaded_info);
649 node->unloaded_info = NULL;
650 }
651 } else {
652 /*
653 * Two different versions of the same object file are loaded.
654 * The initial one takes precedence.
655 */
656 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
657 pr_warning("could not add '%s' (incompatible "
658 "version)\n", info->filename);
659 kfree(loaded_info);
660 return;
661 }
577 } 662 }
578 node->info = NULL; 663 /* Overwrite previous array. */
664 kfree(node->loaded_info);
665 node->loaded_info = loaded_info;
666 node->num_loaded = num + 1;
667}
579 668
580 return 0; 669/*
670 * Return the index of a profiling data set associated with a node.
671 */
672static int get_info_index(struct gcov_node *node, struct gcov_info *info)
673{
674 int i;
675
676 for (i = 0; i < node->num_loaded; i++) {
677 if (node->loaded_info[i] == info)
678 return i;
679 }
680 return -ENOENT;
581} 681}
582 682
583/* 683/*
584 * Profiling data for this node has been loaded again. Add profiling data 684 * Save the data of a profiling data set which is being unloaded.
585 * from previous instantiation and turn this node into a regular node.
586 */ 685 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info) 686static void save_info(struct gcov_node *node, struct gcov_info *info)
588{ 687{
589 if (gcov_info_is_compatible(node->ghost, info)) 688 if (node->unloaded_info)
590 gcov_info_add(info, node->ghost); 689 gcov_info_add(node->unloaded_info, info);
591 else { 690 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n", 691 node->unloaded_info = gcov_info_dup(info);
692 if (!node->unloaded_info) {
693 pr_warning("could not save data for '%s' "
694 "(out of memory)\n", info->filename);
695 }
696 }
697}
698
699/*
700 * Disassociate a profiling data set from a node. Needs to be called with
701 * node_lock held.
702 */
703static void remove_info(struct gcov_node *node, struct gcov_info *info)
704{
705 int i;
706
707 i = get_info_index(node, info);
708 if (i < 0) {
709 pr_warning("could not remove '%s' (not found)\n",
593 info->filename); 710 info->filename);
711 return;
594 } 712 }
595 gcov_info_free(node->ghost); 713 if (gcov_persist)
596 node->ghost = NULL; 714 save_info(node, info);
597 node->info = info; 715 /* Shrink array. */
716 node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
717 node->num_loaded--;
718 if (node->num_loaded > 0)
719 return;
720 /* Last loaded data set was removed. */
721 kfree(node->loaded_info);
722 node->loaded_info = NULL;
723 node->num_loaded = 0;
724 if (!node->unloaded_info)
725 remove_node(node);
598} 726}
599 727
600/* 728/*
@@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
609 node = get_node_by_name(info->filename); 737 node = get_node_by_name(info->filename);
610 switch (action) { 738 switch (action) {
611 case GCOV_ADD: 739 case GCOV_ADD:
612 /* Add new node or revive ghost. */ 740 if (node)
613 if (!node) { 741 add_info(node, info);
742 else
614 add_node(info); 743 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break; 744 break;
624 case GCOV_REMOVE: 745 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */ 746 if (node)
626 if (!node) { 747 remove_info(node, info);
748 else {
627 pr_warning("could not remove '%s' (not found)\n", 749 pr_warning("could not remove '%s' (not found)\n",
628 info->filename); 750 info->filename);
629 break;
630 } 751 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break; 752 break;
637 } 753 }
638 mutex_unlock(&node_lock); 754 mutex_unlock(&node_lock);
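The gcov_fs rework above replaces the single info/ghost pair with a loaded_info array plus an optional unloaded_info copy, so one debugfs node can represent several loaded instances of the same object file and still keep counts across unloads when gcov_persist is set. As a rough illustration of the user-visible side (not part of the patch), the userspace sketch below triggers the reset_write() path by writing to the gcov reset file; the debugfs mount point /sys/kernel/debug is an assumption.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Reset all in-kernel gcov counters by writing to the debugfs reset file.
 * The mount point is an assumption; adjust if debugfs lives elsewhere. */
int main(void)
{
	int fd = open("/sys/kernel/debug/gcov/reset", O_WRONLY);

	if (fd < 0) {
		perror("open gcov reset file");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}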
diff --git a/kernel/groups.c b/kernel/groups.c
index 53b1916c9492..253dc0f35cf4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid); 146 if (grp > GROUP_AT(group_info, mid))
147 if (cmp > 0)
148 left = mid + 1; 147 left = mid + 1;
149 else if (cmp < 0) 148 else if (grp < GROUP_AT(group_info, mid))
150 right = mid; 149 right = mid;
151 else 150 else
152 return 1; 151 return 1;
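The groups_search() change drops the "grp - GROUP_AT(...)" subtraction, whose result can wrap for large gid values, in favour of explicit comparisons. A minimal standalone sketch of the corrected pattern, with hypothetical names and plain unsigned ints rather than gid_t:

#include <stdio.h>

/* Binary search over a sorted array of unsigned IDs, using explicit
 * comparisons instead of "key - arr[mid]", which can wrap and give the
 * wrong sign for large values. */
static int id_search(const unsigned int *arr, unsigned int n, unsigned int key)
{
	unsigned int left = 0, right = n;

	while (left < right) {
		unsigned int mid = (left + right) / 2;

		if (key > arr[mid])
			left = mid + 1;
		else if (key < arr[mid])
			right = mid;
		else
			return 1;	/* found */
	}
	return 0;			/* not found */
}

int main(void)
{
	unsigned int gids[] = { 4, 24, 27, 1000, 4294967294u };

	printf("%d %d\n", id_search(gids, 5, 1000), id_search(gids, 5, 7));
	return 0;
}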
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..1decafbb6b1a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
96 xtim = timespec_to_ktime(xts); 96 xtim = timespec_to_ktime(xts);
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
144static int hrtimer_get_target(int this_cpu, int pinned) 144static int hrtimer_get_target(int this_cpu, int pinned)
145{ 145{
146#ifdef CONFIG_NO_HZ 146#ifdef CONFIG_NO_HZ
147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { 147 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
148 int preferred_cpu = get_nohz_load_balancer(); 148 return get_nohz_timer_target();
149
150 if (preferred_cpu >= 0)
151 return preferred_cpu;
152 }
153#endif 149#endif
154 return this_cpu; 150 return this_cpu;
155} 151}
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
612static void retrigger_next_event(void *arg) 608static void retrigger_next_event(void *arg)
613{ 609{
614 struct hrtimer_cpu_base *base; 610 struct hrtimer_cpu_base *base;
615 struct timespec realtime_offset; 611 struct timespec realtime_offset, wtm;
616 unsigned long seq; 612 unsigned long seq;
617 613
618 if (!hrtimer_hres_active()) 614 if (!hrtimer_hres_active())
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg)
620 616
621 do { 617 do {
622 seq = read_seqbegin(&xtime_lock); 618 seq = read_seqbegin(&xtime_lock);
623 set_normalized_timespec(&realtime_offset, 619 wtm = __get_wall_to_monotonic();
624 -wall_to_monotonic.tv_sec,
625 -wall_to_monotonic.tv_nsec);
626 } while (read_seqretry(&xtime_lock, seq)); 620 } while (read_seqretry(&xtime_lock, seq));
621 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
627 622
628 base = &__get_cpu_var(hrtimer_bases); 623 base = &__get_cpu_var(hrtimer_bases);
629 624
@@ -1096,11 +1091,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
1096 */ 1091 */
1097ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 1092ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1098{ 1093{
1099 struct hrtimer_clock_base *base;
1100 unsigned long flags; 1094 unsigned long flags;
1101 ktime_t rem; 1095 ktime_t rem;
1102 1096
1103 base = lock_hrtimer_base(timer, &flags); 1097 lock_hrtimer_base(timer, &flags);
1104 rem = hrtimer_expires_remaining(timer); 1098 rem = hrtimer_expires_remaining(timer);
1105 unlock_hrtimer_base(timer, &flags); 1099 unlock_hrtimer_base(timer, &flags);
1106 1100
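The retrigger_next_event() hunk shrinks the seqlock retry loop to a plain copy of __get_wall_to_monotonic() and moves the normalization outside it. A minimal sketch of that reader pattern, assuming a hypothetical seqlock-protected timespec; the sample_* names are illustrative only.

#include <linux/seqlock.h>
#include <linux/time.h>

/* Illustrative only: a timespec updated elsewhere under sample_lock is
 * snapshotted inside the retry loop, and the normalization runs on the
 * private copy afterwards, keeping the critical section short. */
static DEFINE_SEQLOCK(sample_lock);
static struct timespec sample_wtm;	/* protected by sample_lock */

static void sample_monotonic_offset(struct timespec *offset)
{
	struct timespec wtm;
	unsigned long seq;

	do {
		seq = read_seqbegin(&sample_lock);
		wtm = sample_wtm;
	} while (read_seqretry(&sample_lock, seq));

	set_normalized_timespec(offset, -wtm.tv_sec, -wtm.tv_nsec);
}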
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..c7c2aed9e2dc 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/list.h>
44#include <linux/cpu.h> 45#include <linux/cpu.h>
45#include <linux/smp.h> 46#include <linux/smp.h>
46 47
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62 63
63static int nr_slots[TYPE_MAX]; 64static int nr_slots[TYPE_MAX];
64 65
66/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head);
68
65static int constraints_initialized; 69static int constraints_initialized;
66 70
67/* Gather the number of total pinned and un-pinned bp in a cpuset */ 71/* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
103 return 0; 107 return 0;
104} 108}
105 109
106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) 110/*
111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list.
113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
107{ 115{
108 struct perf_event_context *ctx = tsk->perf_event_ctxp; 116 struct perf_event_context *ctx = bp->ctx;
109 struct list_head *list; 117 struct perf_event *iter;
110 struct perf_event *bp;
111 unsigned long flags;
112 int count = 0; 118 int count = 0;
113 119
114 if (WARN_ONCE(!ctx, "No perf context for this task")) 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
115 return 0; 121 if (iter->ctx == ctx && find_slot_idx(iter) == type)
116 122 count += hw_breakpoint_weight(iter);
117 list = &ctx->event_list;
118
119 raw_spin_lock_irqsave(&ctx->lock, flags);
120
121 /*
122 * The current breakpoint counter is not included in the list
123 * at the open() callback time
124 */
125 list_for_each_entry(bp, list, event_entry) {
126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
129 } 123 }
130 124
131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
132
133 return count; 125 return count;
134} 126}
135 127
@@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
149 if (!tsk) 141 if (!tsk)
150 slots->pinned += max_task_bp_pinned(cpu, type); 142 slots->pinned += max_task_bp_pinned(cpu, type);
151 else 143 else
152 slots->pinned += task_bp_pinned(tsk, type); 144 slots->pinned += task_bp_pinned(bp, type);
153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
154 146
155 return; 147 return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
162 if (!tsk) 154 if (!tsk)
163 nr += max_task_bp_pinned(cpu, type); 155 nr += max_task_bp_pinned(cpu, type);
164 else 156 else
165 nr += task_bp_pinned(tsk, type); 157 nr += task_bp_pinned(bp, type);
166 158
167 if (nr > slots->pinned) 159 if (nr > slots->pinned)
168 slots->pinned = nr; 160 slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
188/* 180/*
189 * Add a pinned breakpoint for the given task in our constraint table 181 * Add a pinned breakpoint for the given task in our constraint table
190 */ 182 */
191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, 183static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
192 enum bp_type_idx type, int weight) 184 enum bp_type_idx type, int weight)
193{ 185{
194 unsigned int *tsk_pinned; 186 unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
196 int old_idx = 0; 188 int old_idx = 0;
197 int idx = 0; 189 int idx = 0;
198 190
199 old_count = task_bp_pinned(tsk, type); 191 old_count = task_bp_pinned(bp, type);
200 old_idx = old_count - 1; 192 old_idx = old_count - 1;
201 idx = old_idx + weight; 193 idx = old_idx + weight;
202 194
195 /* tsk_pinned[n] is the number of tasks having n breakpoints */
203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); 196 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
204 if (enable) { 197 if (enable) {
205 tsk_pinned[idx]++; 198 tsk_pinned[idx]++;
@@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
222 int cpu = bp->cpu; 215 int cpu = bp->cpu;
223 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->ctx->task;
224 217
218 /* Pinned counter cpu profiling */
219 if (!tsk) {
220
221 if (enable)
222 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
223 else
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
225 return;
226 }
227
225 /* Pinned counter task profiling */ 228 /* Pinned counter task profiling */
226 if (tsk) {
227 if (cpu >= 0) {
228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
229 return;
230 }
231 229
230 if (!enable)
231 list_del(&bp->hw.bp_list);
232
233 if (cpu >= 0) {
234 toggle_bp_task_slot(bp, cpu, enable, type, weight);
235 } else {
232 for_each_online_cpu(cpu) 236 for_each_online_cpu(cpu)
233 toggle_bp_task_slot(tsk, cpu, enable, type, weight); 237 toggle_bp_task_slot(bp, cpu, enable, type, weight);
234 return;
235 } 238 }
236 239
237 /* Pinned counter cpu profiling */
238 if (enable) 240 if (enable)
239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 241 list_add_tail(&bp->hw.bp_list, &bp_task_head);
240 else 242}
241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; 243
244/*
245 * Function to perform processor-specific cleanup during unregistration
246 */
247__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
248{
249 /*
250 * A weak stub function here for those archs that don't define
251 * it inside arch/.../kernel/hw_breakpoint.c
252 */
242} 253}
243 254
244/* 255/*
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
301 weight = hw_breakpoint_weight(bp); 312 weight = hw_breakpoint_weight(bp);
302 313
303 fetch_bp_busy_slots(&slots, bp, type); 314 fetch_bp_busy_slots(&slots, bp, type);
315 /*
316 * Simulate the addition of this breakpoint to the constraints
317 * and see the result.
318 */
304 fetch_this_slot(&slots, weight); 319 fetch_this_slot(&slots, weight);
305 320
306 /* Flexible counters need to keep at least one slot */ 321 /* Flexible counters need to keep at least one slot */
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
339{ 354{
340 mutex_lock(&nr_bp_mutex); 355 mutex_lock(&nr_bp_mutex);
341 356
357 arch_unregister_hw_breakpoint(bp);
342 __release_bp_slot(bp); 358 __release_bp_slot(bp);
343 359
344 mutex_unlock(&nr_bp_mutex); 360 mutex_unlock(&nr_bp_mutex);
@@ -417,7 +433,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
417 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
418 struct task_struct *tsk) 434 struct task_struct *tsk)
419{ 435{
420 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 436 return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
437 triggered);
421} 438}
422EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 439EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
423 440
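With the constraint accounting now walking the global bp_task_head list instead of the per-task perf context, slot reservation still happens when the breakpoint event is created. A hedged sketch of a caller that consumes one of those pinned task slots via register_user_hw_breakpoint(); the address, handler and task are placeholders, not values from the patch.

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/sched.h>

/* Hypothetical sketch: attach a 4-byte write watchpoint to a task.
 * Error handling is minimal; the returned event is an ERR_PTR on
 * failure (for example when no pinned slot is left). */
static struct perf_event *attach_write_watchpoint(struct task_struct *tsk,
						  unsigned long addr,
						  perf_overflow_handler_t handler)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = addr;
	attr.bp_len = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	/* Consumes one pinned slot in the constraint table. */
	return register_user_hw_breakpoint(&attr, handler, tsk);
}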
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..c3003e9d91a3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 216void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
217{ 217{
218 if (suspend) { 218 if (suspend) {
219 if (!desc->action || (desc->action->flags & IRQF_TIMER)) 219 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
220 return; 220 return;
221 desc->status |= IRQ_SUSPENDED; 221 desc->status |= IRQ_SUSPENDED;
222 } 222 }
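With __disable_irq() now keyed off IRQF_NO_SUSPEND rather than IRQF_TIMER, a driver that must keep its interrupt enabled across suspend requests that explicitly. A minimal sketch; the device name and handler are hypothetical.

#include <linux/interrupt.h>

/* Hypothetical wakeup-capable device: keep its interrupt enabled across
 * suspend by passing IRQF_NO_SUSPEND when requesting the line. */
static irqreturn_t wake_irq_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int wake_irq_setup(unsigned int irq, void *dev)
{
	return request_irq(irq, wake_irq_handler, IRQF_NO_SUSPEND,
			   "wake-source", dev);
}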
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 131b1703936f..c0613f7d6730 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
151 image->nr_segments = nr_segments; 151 image->nr_segments = nr_segments;
152 segment_bytes = nr_segments * sizeof(*segments); 152 segment_bytes = nr_segments * sizeof(*segments);
153 result = copy_from_user(image->segment, segments, segment_bytes); 153 result = copy_from_user(image->segment, segments, segment_bytes);
154 if (result) 154 if (result) {
155 result = -EFAULT;
155 goto out; 156 goto out;
157 }
156 158
157 /* 159 /*
158 * Verify we have good destination addresses. The caller is 160 * Verify we have good destination addresses. The caller is
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image,
827 result = copy_from_user(ptr, buf, uchunk); 829 result = copy_from_user(ptr, buf, uchunk);
828 kunmap(page); 830 kunmap(page);
829 if (result) { 831 if (result) {
830 result = (result < 0) ? result : -EIO; 832 result = -EFAULT;
831 goto out; 833 goto out;
832 } 834 }
833 ubytes -= uchunk; 835 ubytes -= uchunk;
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image,
882 kexec_flush_icache_page(page); 884 kexec_flush_icache_page(page);
883 kunmap(page); 885 kunmap(page);
884 if (result) { 886 if (result) {
885 result = (result < 0) ? result : -EIO; 887 result = -EFAULT;
886 goto out; 888 goto out;
887 } 889 }
888 ubytes -= uchunk; 890 ubytes -= uchunk;
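All three kexec hunks converge on the same convention: copy_from_user() returns the number of bytes it could not copy, and any non-zero remainder is reported as -EFAULT instead of leaking a byte count (or a bogus -EIO) to the caller. A minimal sketch of that idiom; the helper name is illustrative.

#include <linux/uaccess.h>
#include <linux/errno.h>

/* copy_from_user() returns the number of bytes left uncopied, so any
 * non-zero result maps to -EFAULT rather than being passed through. */
static int copy_segment_from_user(void *dst, const void __user *src,
				  size_t len)
{
	if (copy_from_user(dst, src, len))
		return -EFAULT;
	return 0;
}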
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 35edbe22e9a9..01a0700e873f 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * A generic kernel FIFO implementation. 2 * A generic kernel FIFO implementation
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> 4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
6 * 5 *
7 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -11,7 +10,7 @@
11 * 10 *
12 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details. 14 * GNU General Public License for more details.
16 * 15 *
17 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
@@ -24,422 +23,586 @@
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/err.h> 25#include <linux/err.h>
27#include <linux/kfifo.h>
28#include <linux/log2.h> 26#include <linux/log2.h>
29#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
30 29
31static void _kfifo_init(struct kfifo *fifo, void *buffer, 30/*
32 unsigned int size) 31 * internal helper to calculate the unused elements in a fifo
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
39
40/**
41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
43 * @buffer: the preallocated buffer to be used.
44 * @size: the size of the internal buffer, this has to be a power of 2.
45 *
46 */ 32 */
47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) 33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
48{ 34{
49 /* size must be a power of 2 */ 35 return (fifo->mask + 1) - (fifo->in - fifo->out);
50 BUG_ON(!is_power_of_2(size));
51
52 _kfifo_init(fifo, buffer, size);
53} 36}
54EXPORT_SYMBOL(kfifo_init);
55 37
56/** 38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
57 * kfifo_alloc - allocates a new FIFO internal buffer 39 size_t esize, gfp_t gfp_mask)
58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
61 *
62 * This function dynamically allocates a new fifo internal buffer
63 *
64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */
68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 40{
70 unsigned char *buffer;
71
72 /* 41 /*
73 * round up to the next power of 2, since our 'let the indices 42 * round down to the next power of 2, since our 'let the indices
74 * wrap' technique works only in this case. 43 * wrap' technique works only in this case.
75 */ 44 */
76 if (!is_power_of_2(size)) { 45 if (!is_power_of_2(size))
77 BUG_ON(size > 0x80000000); 46 size = rounddown_pow_of_two(size);
78 size = roundup_pow_of_two(size); 47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
79 } 56 }
80 57
81 buffer = kmalloc(size, gfp_mask); 58 fifo->data = kmalloc(size * esize, gfp_mask);
82 if (!buffer) { 59
83 _kfifo_init(fifo, NULL, 0); 60 if (!fifo->data) {
61 fifo->mask = 0;
84 return -ENOMEM; 62 return -ENOMEM;
85 } 63 }
86 64 fifo->mask = size - 1;
87 _kfifo_init(fifo, buffer, size);
88 65
89 return 0; 66 return 0;
90} 67}
91EXPORT_SYMBOL(kfifo_alloc); 68EXPORT_SYMBOL(__kfifo_alloc);
92 69
93/** 70void __kfifo_free(struct __kfifo *fifo)
94 * kfifo_free - frees the FIFO internal buffer
95 * @fifo: the fifo to be freed.
96 */
97void kfifo_free(struct kfifo *fifo)
98{ 71{
99 kfree(fifo->buffer); 72 kfree(fifo->data);
100 _kfifo_init(fifo, NULL, 0); 73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
101} 78}
102EXPORT_SYMBOL(kfifo_free); 79EXPORT_SYMBOL(__kfifo_free);
103 80
104/** 81int __kfifo_init(struct __kfifo *fifo, void *buffer,
105 * kfifo_skip - skip output data 82 unsigned int size, size_t esize)
106 * @fifo: the fifo to be used.
107 * @len: number of bytes to skip
108 */
109void kfifo_skip(struct kfifo *fifo, unsigned int len)
110{ 83{
111 if (len < kfifo_len(fifo)) { 84 size /= esize;
112 __kfifo_add_out(fifo, len); 85
113 return; 86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
114 } 97 }
115 kfifo_reset_out(fifo); 98 fifo->mask = size - 1;
99
100 return 0;
116} 101}
117EXPORT_SYMBOL(kfifo_skip); 102EXPORT_SYMBOL(__kfifo_init);
118 103
119static inline void __kfifo_in_data(struct kfifo *fifo, 104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
120 const void *from, unsigned int len, unsigned int off) 105 unsigned int len, unsigned int off)
121{ 106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
122 unsigned int l; 109 unsigned int l;
123 110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
124 /* 121 /*
125 * Ensure that we sample the fifo->out index -before- we 122 * make sure that the data in the fifo is up to date before
126 * start putting bytes into the kfifo. 123 * incrementing the fifo->in index counter
127 */ 124 */
125 smp_wmb();
126}
128 127
129 smp_mb(); 128unsigned int __kfifo_in(struct __kfifo *fifo,
130 129 const void *buf, unsigned int len)
131 off = __kfifo_off(fifo, fifo->in + off); 130{
131 unsigned int l;
132 132
133 /* first put the data starting from fifo->in to buffer end */ 133 l = kfifo_unused(fifo);
134 l = min(len, fifo->size - off); 134 if (len > l)
135 memcpy(fifo->buffer + off, from, l); 135 len = l;
136 136
137 /* then put the rest (if any) at the beginning of the buffer */ 137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 memcpy(fifo->buffer, from + l, len - l); 138 fifo->in += len;
139 return len;
139} 140}
141EXPORT_SYMBOL(__kfifo_in);
140 142
141static inline void __kfifo_out_data(struct kfifo *fifo, 143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
142 void *to, unsigned int len, unsigned int off) 144 unsigned int len, unsigned int off)
143{ 145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
144 unsigned int l; 148 unsigned int l;
145 149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
146 /* 160 /*
147 * Ensure that we sample the fifo->in index -before- we 161 * make sure that the data is copied before
148 * start removing bytes from the kfifo. 162 * incrementing the fifo->out index counter
149 */ 163 */
164 smp_wmb();
165}
150 166
151 smp_rmb(); 167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
152 171
153 off = __kfifo_off(fifo, fifo->out + off); 172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
154 175
155 /* first get the data from fifo->out until the end of the buffer */ 176 kfifo_copy_out(fifo, buf, len, fifo->out);
156 l = min(len, fifo->size - off); 177 return len;
157 memcpy(to, fifo->buffer + off, l); 178}
179EXPORT_SYMBOL(__kfifo_out_peek);
158 180
159 /* then get the rest (if any) from the beginning of the buffer */ 181unsigned int __kfifo_out(struct __kfifo *fifo,
160 memcpy(to + l, fifo->buffer, len - l); 182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
161} 187}
188EXPORT_SYMBOL(__kfifo_out);
162 189
163static inline int __kfifo_from_user_data(struct kfifo *fifo, 190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off, 191 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout) 192 unsigned int *copied)
166{ 193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
167 unsigned int l; 196 unsigned int l;
168 int ret; 197 unsigned long ret;
169 198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
170 /* 215 /*
171 * Ensure that we sample the fifo->out index -before- we 216 * make sure that the data in the fifo is up to date before
172 * start putting bytes into the kfifo. 217 * incrementing the fifo->in index counter
173 */ 218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
174 224
175 smp_mb(); 225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
176 232
177 off = __kfifo_off(fifo, fifo->in + off); 233 if (esize != 1)
234 len /= esize;
178 235
179 /* first put the data starting from fifo->in to buffer end */ 236 l = kfifo_unused(fifo);
180 l = min(len, fifo->size - off); 237 if (len > l)
181 ret = copy_from_user(fifo->buffer + off, from, l); 238 len = l;
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
187 239
188 /* then put the rest (if any) at the beginning of the buffer */ 240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
189 ret = copy_from_user(fifo->buffer, from + l, len - l); 241 if (unlikely(ret)) {
190 *lenout += ret ? ret : len - l; 242 len -= ret;
191 return ret ? -EFAULT : 0; 243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
192} 248}
249EXPORT_SYMBOL(__kfifo_from_user);
193 250
194static inline int __kfifo_to_user_data(struct kfifo *fifo, 251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout) 252 unsigned int len, unsigned int off, unsigned int *copied)
196{ 253{
197 unsigned int l; 254 unsigned int l;
198 int ret; 255 unsigned long ret;
199 256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
200 /* 275 /*
201 * Ensure that we sample the fifo->in index -before- we 276 * make sure that the data is copied before
202 * start removing bytes from the kfifo. 277 * incrementing the fifo->out index counter
203 */ 278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
204 284
205 smp_rmb(); 285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
206 292
207 off = __kfifo_off(fifo, fifo->out + off); 293 if (esize != 1)
294 len /= esize;
208 295
209 /* first get the data from fifo->out until the end of the buffer */ 296 l = fifo->in - fifo->out;
210 l = min(len, fifo->size - off); 297 if (len > l)
211 ret = copy_to_user(to, fifo->buffer + off, l); 298 len = l;
212 *lenout = l; 299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
213 if (unlikely(ret)) { 300 if (unlikely(ret)) {
214 *lenout -= ret; 301 len -= ret;
215 return -EFAULT; 302 err = -EFAULT;
216 } 303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
217 309
218 /* then get the rest (if any) from the beginning of the buffer */ 310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
219 len -= l; 311 int nents, unsigned int len)
220 ret = copy_to_user(to + l, fifo->buffer, len); 312{
221 if (unlikely(ret)) { 313 int n;
222 *lenout += len - ret; 314 unsigned int l;
223 return -EFAULT; 315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
224 } 344 }
225 *lenout += len; 345 sg_set_page(sgl, page, len, off);
226 return 0; 346 return n + 1;
227} 347}
228 348
229unsigned int __kfifo_in_n(struct kfifo *fifo, 349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
230 const void *from, unsigned int len, unsigned int recsize) 350 int nents, unsigned int len, unsigned int off)
231{ 351{
232 if (kfifo_avail(fifo) < len + recsize) 352 unsigned int size = fifo->mask + 1;
233 return len + 1; 353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
234 356
235 __kfifo_in_data(fifo, from, len, recsize); 357 off &= fifo->mask;
236 return 0; 358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
237} 369}
238EXPORT_SYMBOL(__kfifo_in_n);
239 370
240/** 371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
241 * kfifo_in - puts some data into the FIFO 372 struct scatterlist *sgl, int nents, unsigned int len)
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{ 373{
256 len = min(kfifo_avail(fifo), len); 374 unsigned int l;
257 375
258 __kfifo_in_data(fifo, from, len, 0); 376 l = kfifo_unused(fifo);
259 __kfifo_add_in(fifo, len); 377 if (len > l)
260 return len; 378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
261} 381}
262EXPORT_SYMBOL(kfifo_in); 382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
263 383
264unsigned int __kfifo_in_generic(struct kfifo *fifo, 384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize) 385 struct scatterlist *sgl, int nents, unsigned int len)
266{ 386{
267 return __kfifo_in_rec(fifo, from, len, recsize); 387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
268} 394}
269EXPORT_SYMBOL(__kfifo_in_generic); 395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
270 396
271unsigned int __kfifo_out_n(struct kfifo *fifo, 397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
272 void *to, unsigned int len, unsigned int recsize)
273{ 398{
274 if (kfifo_len(fifo) < len + recsize) 399 unsigned int max = (1 << (recsize << 3)) - 1;
275 return len;
276 400
277 __kfifo_out_data(fifo, to, len, recsize); 401 if (len > max)
278 __kfifo_add_out(fifo, len + recsize); 402 return max;
279 return 0; 403 return len;
280} 404}
281EXPORT_SYMBOL(__kfifo_out_n);
282 405
283/** 406#define __KFIFO_PEEK(data, out, mask) \
284 * kfifo_out - gets some data from the FIFO 407 ((data)[(out) & (mask)])
285 * @fifo: the fifo to be used. 408/*
286 * @to: where the data must be copied. 409 * __kfifo_peek_n internal helper function for determining the length of
287 * @len: the size of the destination buffer. 410 * the next record in the fifo
288 *
289 * This function copies at most @len bytes from the FIFO into the
290 * @to buffer and returns the number of copied bytes.
291 *
292 * Note that with only one concurrent reader and one concurrent
293 * writer, you don't need extra locking to use these functions.
294 */ 411 */
295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) 412static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
296{ 413{
297 len = min(kfifo_len(fifo), len); 414 unsigned int l;
415 unsigned int mask = fifo->mask;
416 unsigned char *data = fifo->data;
298 417
299 __kfifo_out_data(fifo, to, len, 0); 418 l = __KFIFO_PEEK(data, fifo->out, mask);
300 __kfifo_add_out(fifo, len);
301 419
302 return len; 420 if (--recsize)
421 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
422
423 return l;
303} 424}
304EXPORT_SYMBOL(kfifo_out);
305 425
306/** 426#define __KFIFO_POKE(data, in, mask, val) \
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it 427 ( \
308 * @fifo: the fifo to be used. 428 (data)[(in) & (mask)] = (unsigned char)(val) \
309 * @to: where the data must be copied. 429 )
310 * @len: the size of the destination buffer. 430
311 * @offset: offset into the fifo 431/*
312 * 432 * __kfifo_poke_n internal helper function for storing the length of
313 * This function copies at most @len bytes at @offset from the FIFO 433 * the record into the fifo
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */ 434 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, 435static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
318 unsigned offset)
319{ 436{
320 len = min(kfifo_len(fifo), len + offset); 437 unsigned int mask = fifo->mask;
438 unsigned char *data = fifo->data;
321 439
322 __kfifo_out_data(fifo, to, len, offset); 440 __KFIFO_POKE(data, fifo->in, mask, n);
323 return len; 441
442 if (recsize > 1)
443 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
324} 444}
325EXPORT_SYMBOL(kfifo_out_peek);
326 445
327unsigned int __kfifo_out_generic(struct kfifo *fifo, 446unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{ 447{
331 return __kfifo_out_rec(fifo, to, len, recsize, total); 448 return __kfifo_peek_n(fifo, recsize);
332} 449}
333EXPORT_SYMBOL(__kfifo_out_generic); 450EXPORT_SYMBOL(__kfifo_len_r);
334 451
335unsigned int __kfifo_from_user_n(struct kfifo *fifo, 452unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
336 const void __user *from, unsigned int len, unsigned int recsize) 453 unsigned int len, size_t recsize)
337{ 454{
338 unsigned total; 455 if (len + recsize > kfifo_unused(fifo))
456 return 0;
339 457
340 if (kfifo_avail(fifo) < len + recsize) 458 __kfifo_poke_n(fifo, len, recsize);
341 return len + 1;
342 459
343 __kfifo_from_user_data(fifo, from, len, recsize, &total); 460 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
344 return total; 461 fifo->in += len + recsize;
462 return len;
345} 463}
346EXPORT_SYMBOL(__kfifo_from_user_n); 464EXPORT_SYMBOL(__kfifo_in_r);
347 465
348/** 466static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
349 * kfifo_from_user - puts some data from user space into the FIFO 467 void *buf, unsigned int len, size_t recsize, unsigned int *n)
350 * @fifo: the fifo to be used. 468{
351 * @from: pointer to the data to be added. 469 *n = __kfifo_peek_n(fifo, recsize);
352 * @len: the length of the data to be added. 470
353 * @total: the actual returned data length. 471 if (len > *n)
354 * 472 len = *n;
355 * This function copies at most @len bytes from the @from into the 473
356 * FIFO depending and returns -EFAULT/0. 474 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
357 * 475 return len;
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
371} 476}
372EXPORT_SYMBOL(kfifo_from_user);
373 477
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo, 478unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
375 const void __user *from, unsigned int len, unsigned int recsize) 479 unsigned int len, size_t recsize)
376{ 480{
377 return __kfifo_from_user_rec(fifo, from, len, recsize); 481 unsigned int n;
482
483 if (fifo->in == fifo->out)
484 return 0;
485
486 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
378} 487}
379EXPORT_SYMBOL(__kfifo_from_user_generic); 488EXPORT_SYMBOL(__kfifo_out_peek_r);
380 489
381unsigned int __kfifo_to_user_n(struct kfifo *fifo, 490unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
382 void __user *to, unsigned int len, unsigned int reclen, 491 unsigned int len, size_t recsize)
383 unsigned int recsize)
384{ 492{
385 unsigned int ret, total; 493 unsigned int n;
386 494
387 if (kfifo_len(fifo) < reclen + recsize) 495 if (fifo->in == fifo->out)
388 return len; 496 return 0;
389 497
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); 498 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
499 fifo->out += n + recsize;
500 return len;
501}
502EXPORT_SYMBOL(__kfifo_out_r);
391 503
392 if (likely(ret == 0)) 504void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
393 __kfifo_add_out(fifo, reclen + recsize); 505{
506 unsigned int n;
394 507
395 return total; 508 n = __kfifo_peek_n(fifo, recsize);
509 fifo->out += n + recsize;
396} 510}
397EXPORT_SYMBOL(__kfifo_to_user_n); 511EXPORT_SYMBOL(__kfifo_skip_r);
398 512
399/** 513int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
400 * kfifo_to_user - gets data from the FIFO and write it to user space 514 unsigned long len, unsigned int *copied, size_t recsize)
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{ 515{
415 int ret; 516 unsigned long ret;
416 len = min(kfifo_len(fifo), len); 517
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); 518 len = __kfifo_max_r(len, recsize);
418 __kfifo_add_out(fifo, *lenout); 519
419 return ret; 520 if (len + recsize > kfifo_unused(fifo)) {
521 *copied = 0;
522 return 0;
523 }
524
525 __kfifo_poke_n(fifo, len, recsize);
526
527 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
528 if (unlikely(ret)) {
529 *copied = 0;
530 return -EFAULT;
531 }
532 fifo->in += len + recsize;
533 return 0;
420} 534}
421EXPORT_SYMBOL(kfifo_to_user); 535EXPORT_SYMBOL(__kfifo_from_user_r);
422 536
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo, 537int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
424 void __user *to, unsigned int len, unsigned int recsize, 538 unsigned long len, unsigned int *copied, size_t recsize)
425 unsigned int *total)
426{ 539{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total); 540 unsigned long ret;
541 unsigned int n;
542
543 if (fifo->in == fifo->out) {
544 *copied = 0;
545 return 0;
546 }
547
548 n = __kfifo_peek_n(fifo, recsize);
549 if (len > n)
550 len = n;
551
552 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
553 if (unlikely(ret)) {
554 *copied = 0;
555 return -EFAULT;
556 }
557 fifo->out += n + recsize;
558 return 0;
428} 559}
429EXPORT_SYMBOL(__kfifo_to_user_generic); 560EXPORT_SYMBOL(__kfifo_to_user_r);
430 561
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) 562unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
563 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
432{ 564{
433 if (recsize == 0) 565 if (!nents)
434 return kfifo_avail(fifo); 566 BUG();
435 567
436 return __kfifo_peek_n(fifo, recsize); 568 len = __kfifo_max_r(len, recsize);
569
570 if (len + recsize > kfifo_unused(fifo))
571 return 0;
572
573 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
437} 574}
438EXPORT_SYMBOL(__kfifo_peek_generic); 575EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
439 576
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) 577void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
578 unsigned int len, size_t recsize)
441{ 579{
442 __kfifo_skip_rec(fifo, recsize); 580 len = __kfifo_max_r(len, recsize);
581 __kfifo_poke_n(fifo, len, recsize);
582 fifo->in += len + recsize;
443} 583}
444EXPORT_SYMBOL(__kfifo_skip_generic); 584EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
445 585
586unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
587 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
588{
589 if (!nents)
590 BUG();
591
592 len = __kfifo_max_r(len, recsize);
593
594 if (len + recsize > fifo->in - fifo->out)
595 return 0;
596
597 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
598}
599EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
600
601void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
602{
603 unsigned int len;
604
605 len = __kfifo_peek_n(fifo, recsize);
606 fifo->out += len + recsize;
607}
608EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
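The rewritten kfifo core exports untyped __kfifo_* primitives; element typing, record handling and the user-visible calls live in the kfifo.h macro layer built on top of them. A rough sketch of how a driver would use that macro layer, assuming the DEFINE_KFIFO/kfifo_in/kfifo_out macros of this API generation; the sample_* names are illustrative.

#include <linux/kfifo.h>
#include <linux/kernel.h>

/* A fifo of 16 ints, filled and drained with kfifo_in()/kfifo_out().
 * The macro layer works in element counts, not byte counts. */
static DEFINE_KFIFO(sample_fifo, int, 16);

static void sample_fifo_demo(void)
{
	int in[4] = { 1, 2, 3, 4 };
	int out[4];
	unsigned int stored, got;

	stored = kfifo_in(&sample_fifo, in, ARRAY_SIZE(in));
	got = kfifo_out(&sample_fifo, out, ARRAY_SIZE(out));
	pr_info("stored %u, read back %u, first value %d\n",
		stored, got, out[0]);
}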
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6e9b19667a8d..9cd0591c96a2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data)
153 goto fail; 153 goto fail;
154 } 154 }
155 155
156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp);
157 159
158 /* Exec failed? */ 160 /* Exec failed? */
159fail: 161fail:
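The cast above only adapts the argv/envp arrays to kernel_execve()'s const-qualified prototype; most callers go through the usermode-helper front end rather than calling it directly. A small sketch of that calling convention; the path and environment strings are placeholders.

#include <linux/kmod.h>

/* Launch a userspace helper and wait for it to exit.  argv/envp are
 * NULL-terminated arrays of strings, as kernel_execve() expects. */
static int run_sample_helper(void)
{
	char *argv[] = { "/bin/true", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}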
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
18#include <linux/freezer.h>
17#include <trace/events/sched.h> 19#include <trace/events/sched.h>
18 20
19static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
35 37
36struct kthread { 38struct kthread {
37 int should_stop; 39 int should_stop;
40 void *data;
38 struct completion exited; 41 struct completion exited;
39}; 42};
40 43
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
54} 57}
55EXPORT_SYMBOL(kthread_should_stop); 58EXPORT_SYMBOL(kthread_should_stop);
56 59
60/**
61 * kthread_data - return data value specified on kthread creation
62 * @task: kthread task in question
63 *
64 * Return the data value specified when kthread @task was created.
65 * The caller is responsible for ensuring the validity of @task when
66 * calling this function.
67 */
68void *kthread_data(struct task_struct *task)
69{
70 return to_kthread(task)->data;
71}
72
57static int kthread(void *_create) 73static int kthread(void *_create)
58{ 74{
59 /* Copy data: it's on kthread's stack */ 75 /* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
64 int ret; 80 int ret;
65 81
66 self.should_stop = 0; 82 self.should_stop = 0;
83 self.data = data;
67 init_completion(&self.exited); 84 init_completion(&self.exited);
68 current->vfork_done = &self.exited; 85 current->vfork_done = &self.exited;
69 86
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
247 264
248 return 0; 265 return 0;
249} 266}
267
268/**
269 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker
271 *
272 * This function can be used as @threadfn to kthread_create() or
273 * kthread_run() with @worker_ptr argument pointing to an initialized
274 * kthread_worker. The started kthread will process work_list until
275 * it is stopped with kthread_stop(). A kthread can also call
276 * this function directly after extra initialization.
277 *
278 * Different kthreads can be used for the same kthread_worker as long
279 * as there's only one kthread attached to it at any given time. A
280 * kthread_worker without an attached kthread simply collects queued
281 * kthread_works.
282 */
283int kthread_worker_fn(void *worker_ptr)
284{
285 struct kthread_worker *worker = worker_ptr;
286 struct kthread_work *work;
287
288 WARN_ON(worker->task);
289 worker->task = current;
290repeat:
291 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
292
293 if (kthread_should_stop()) {
294 __set_current_state(TASK_RUNNING);
295 spin_lock_irq(&worker->lock);
296 worker->task = NULL;
297 spin_unlock_irq(&worker->lock);
298 return 0;
299 }
300
301 work = NULL;
302 spin_lock_irq(&worker->lock);
303 if (!list_empty(&worker->work_list)) {
304 work = list_first_entry(&worker->work_list,
305 struct kthread_work, node);
306 list_del_init(&work->node);
307 }
308 spin_unlock_irq(&worker->lock);
309
310 if (work) {
311 __set_current_state(TASK_RUNNING);
312 work->func(work);
313 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
314 work->done_seq = work->queue_seq;
315 smp_mb(); /* mb worker-b1 paired with flush-b0 */
316 if (atomic_read(&work->flushing))
317 wake_up_all(&work->done);
318 } else if (!freezing(current))
319 schedule();
320
321 try_to_freeze();
322 goto repeat;
323}
324EXPORT_SYMBOL_GPL(kthread_worker_fn);
325
326/**
327 * queue_kthread_work - queue a kthread_work
328 * @worker: target kthread_worker
329 * @work: kthread_work to queue
330 *
331 * Queue @work to work processor @task for async execution. @task
332 * must have been created with kthread_worker_create(). Returns %true
333 * if @work was successfully queued, %false if it was already pending.
334 */
335bool queue_kthread_work(struct kthread_worker *worker,
336 struct kthread_work *work)
337{
338 bool ret = false;
339 unsigned long flags;
340
341 spin_lock_irqsave(&worker->lock, flags);
342 if (list_empty(&work->node)) {
343 list_add_tail(&work->node, &worker->work_list);
344 work->queue_seq++;
345 if (likely(worker->task))
346 wake_up_process(worker->task);
347 ret = true;
348 }
349 spin_unlock_irqrestore(&worker->lock, flags);
350 return ret;
351}
352EXPORT_SYMBOL_GPL(queue_kthread_work);
353
354/**
355 * flush_kthread_work - flush a kthread_work
356 * @work: work to flush
357 *
358 * If @work is queued or executing, wait for it to finish execution.
359 */
360void flush_kthread_work(struct kthread_work *work)
361{
362 int seq = work->queue_seq;
363
364 atomic_inc(&work->flushing);
365
366 /*
367 * mb flush-b0 paired with worker-b1, to make sure either
368 * worker sees the above increment or we see done_seq update.
369 */
370 smp_mb__after_atomic_inc();
371
372 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
373 wait_event(work->done, seq - work->done_seq <= 0);
374 atomic_dec(&work->flushing);
375
376 /*
377 * rmb flush-b1 paired with worker-b0, to make sure our caller
378 * sees every change made by work->func().
379 */
380 smp_mb__after_atomic_dec();
381}
382EXPORT_SYMBOL_GPL(flush_kthread_work);
383
384struct kthread_flush_work {
385 struct kthread_work work;
386 struct completion done;
387};
388
389static void kthread_flush_work_fn(struct kthread_work *work)
390{
391 struct kthread_flush_work *fwork =
392 container_of(work, struct kthread_flush_work, work);
393 complete(&fwork->done);
394}
395
396/**
397 * flush_kthread_worker - flush all current works on a kthread_worker
398 * @worker: worker to flush
399 *
400 * Wait until all currently executing or pending works on @worker are
401 * finished.
402 */
403void flush_kthread_worker(struct kthread_worker *worker)
404{
405 struct kthread_flush_work fwork = {
406 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
407 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
408 };
409
410 queue_kthread_work(worker, &fwork.work);
411 wait_for_completion(&fwork.done);
412}
413EXPORT_SYMBOL_GPL(flush_kthread_worker);
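Putting the new kthread_worker pieces together: a worker structure gets a dedicated thread running kthread_worker_fn(), and work items are queued and flushed against it. A hedged sketch assuming the companion DEFINE_KTHREAD_WORKER/DEFINE_KTHREAD_WORK initializers from kthread.h; the sample_* names are illustrative.

#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/kernel.h>

static DEFINE_KTHREAD_WORKER(sample_worker);

static void sample_work_fn(struct kthread_work *work)
{
	pr_info("sample work executed\n");
}

static DEFINE_KTHREAD_WORK(sample_work, sample_work_fn);

static struct task_struct *sample_task;

static int sample_start(void)
{
	/* Attach a thread to the worker; it processes queued works. */
	sample_task = kthread_run(kthread_worker_fn, &sample_worker,
				  "sample_worker");
	if (IS_ERR(sample_task))
		return PTR_ERR(sample_task);

	queue_kthread_work(&sample_worker, &sample_work);
	flush_kthread_work(&sample_work);	/* wait for completion */
	/* kthread_stop(sample_task) detaches the thread on teardown. */
	return 0;
}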
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
146 146
147static inline u64 lockstat_clock(void) 147static inline u64 lockstat_clock(void)
148{ 148{
149 return cpu_clock(smp_processor_id()); 149 return local_clock();
150} 150}
151 151
152static int lock_point(unsigned long points[], unsigned long ip) 152static int lock_point(unsigned long points[], unsigned long ip)
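lockstat_clock() now uses local_clock(), a per-cpu nanosecond timestamp that no longer requires the caller to pass smp_processor_id(). A trivial sketch of the same pattern for ad-hoc latency measurement; the helper is illustrative and not part of the patch.

#include <linux/sched.h>
#include <linux/kernel.h>

/* Measure how long fn() takes on the local cpu, in nanoseconds.  The
 * value is only meaningful when start and end are taken on the same cpu,
 * e.g. with preemption disabled by the caller. */
static u64 sample_measure(void (*fn)(void))
{
	u64 start = local_clock();

	fn();
	return local_clock() - start;
}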
diff --git a/kernel/module.c b/kernel/module.c
index 6c562828c85c..ccd641991842 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
1/* 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
@@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb)
110} 110}
111EXPORT_SYMBOL(unregister_module_notifier); 111EXPORT_SYMBOL(unregister_module_notifier);
112 112
113struct load_info {
114 Elf_Ehdr *hdr;
115 unsigned long len;
116 Elf_Shdr *sechdrs;
117 char *secstrings, *strtab;
118 unsigned long *strmap;
119 unsigned long symoffs, stroffs;
120 struct _ddebug *debug;
121 unsigned int num_debug;
122 struct {
123 unsigned int sym, str, mod, vers, info, pcpu;
124 } index;
125};
126
113/* We require a truly strong try_module_get(): 0 means failure due to 127/* We require a truly strong try_module_get(): 0 means failure due to
114 ongoing or failed initialization etc. */ 128 ongoing or failed initialization etc. */
115static inline int strong_try_module_get(struct module *mod) 129static inline int strong_try_module_get(struct module *mod)
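The new struct load_info gathers the state that load_module() previously threaded through long argument lists (ELF header, section headers, string tables, section indices), so the helpers reworked in the following hunks can take a single pointer. A minimal sketch of the resulting calling convention (module_has_modinfo() is a hypothetical in-file helper; find_sec() is the real function as refactored in the next hunk):

/* Hypothetical helper inside module.c: one argument instead of three. */
static bool module_has_modinfo(const struct load_info *info)
{
	/* find_sec() returns 0 when no allocated section has that name. */
	return find_sec(info, ".modinfo") != 0;
}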
@@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code)
140EXPORT_SYMBOL(__module_put_and_exit); 154EXPORT_SYMBOL(__module_put_and_exit);
141 155
142/* Find a module section: 0 means not found. */ 156/* Find a module section: 0 means not found. */
143static unsigned int find_sec(Elf_Ehdr *hdr, 157static unsigned int find_sec(const struct load_info *info, const char *name)
144 Elf_Shdr *sechdrs,
145 const char *secstrings,
146 const char *name)
147{ 158{
148 unsigned int i; 159 unsigned int i;
149 160
150 for (i = 1; i < hdr->e_shnum; i++) 161 for (i = 1; i < info->hdr->e_shnum; i++) {
162 Elf_Shdr *shdr = &info->sechdrs[i];
151 /* Alloc bit cleared means "ignore it." */ 163 /* Alloc bit cleared means "ignore it." */
152 if ((sechdrs[i].sh_flags & SHF_ALLOC) 164 if ((shdr->sh_flags & SHF_ALLOC)
153 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 165 && strcmp(info->secstrings + shdr->sh_name, name) == 0)
154 return i; 166 return i;
167 }
155 return 0; 168 return 0;
156} 169}
157 170
158/* Find a module section, or NULL. */ 171/* Find a module section, or NULL. */
159static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, 172static void *section_addr(const struct load_info *info, const char *name)
160 const char *secstrings, const char *name)
161{ 173{
162 /* Section 0 has sh_addr 0. */ 174 /* Section 0 has sh_addr 0. */
163 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; 175 return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
164} 176}
165 177
166/* Find a module section, or NULL. Fill in number of "objects" in section. */ 178/* Find a module section, or NULL. Fill in number of "objects" in section. */
167static void *section_objs(Elf_Ehdr *hdr, 179static void *section_objs(const struct load_info *info,
168 Elf_Shdr *sechdrs,
169 const char *secstrings,
170 const char *name, 180 const char *name,
171 size_t object_size, 181 size_t object_size,
172 unsigned int *num) 182 unsigned int *num)
173{ 183{
174 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); 184 unsigned int sec = find_sec(info, name);
175 185
176 /* Section 0 has sh_addr 0 and sh_size 0. */ 186 /* Section 0 has sh_addr 0 and sh_size 0. */
177 *num = sechdrs[sec].sh_size / object_size; 187 *num = info->sechdrs[sec].sh_size / object_size;
178 return (void *)sechdrs[sec].sh_addr; 188 return (void *)info->sechdrs[sec].sh_addr;
179} 189}
180 190
181/* Provided by the linker */ 191/* Provided by the linker */
@@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
227 unsigned int symnum, void *data), void *data) 237 unsigned int symnum, void *data), void *data)
228{ 238{
229 struct module *mod; 239 struct module *mod;
230 const struct symsearch arr[] = { 240 static const struct symsearch arr[] = {
231 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 241 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
232 NOT_GPL_ONLY, false }, 242 NOT_GPL_ONLY, false },
233 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 243 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod,
392 mod->percpu = __alloc_reserved_percpu(size, align); 402 mod->percpu = __alloc_reserved_percpu(size, align);
393 if (!mod->percpu) { 403 if (!mod->percpu) {
394 printk(KERN_WARNING 404 printk(KERN_WARNING
395 "Could not allocate %lu bytes percpu data\n", size); 405 "%s: Could not allocate %lu bytes percpu data\n",
406 mod->name, size);
396 return -ENOMEM; 407 return -ENOMEM;
397 } 408 }
398 mod->percpu_size = size; 409 mod->percpu_size = size;
@@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod)
404 free_percpu(mod->percpu); 415 free_percpu(mod->percpu);
405} 416}
406 417
407static unsigned int find_pcpusec(Elf_Ehdr *hdr, 418static unsigned int find_pcpusec(struct load_info *info)
408 Elf_Shdr *sechdrs,
409 const char *secstrings)
410{ 419{
411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); 420 return find_sec(info, ".data..percpu");
412} 421}
413 422
414static void percpu_modcopy(struct module *mod, 423static void percpu_modcopy(struct module *mod,
@@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod,
468static inline void percpu_modfree(struct module *mod) 477static inline void percpu_modfree(struct module *mod)
469{ 478{
470} 479}
471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 480static unsigned int find_pcpusec(struct load_info *info)
472 Elf_Shdr *sechdrs,
473 const char *secstrings)
474{ 481{
475 return 0; 482 return 0;
476} 483}
@@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
524EXPORT_TRACEPOINT_SYMBOL(module_get); 531EXPORT_TRACEPOINT_SYMBOL(module_get);
525 532
526/* Init the unload section of the module. */ 533/* Init the unload section of the module. */
527static void module_unload_init(struct module *mod) 534static int module_unload_init(struct module *mod)
528{ 535{
529 int cpu; 536 mod->refptr = alloc_percpu(struct module_ref);
537 if (!mod->refptr)
538 return -ENOMEM;
530 539
531 INIT_LIST_HEAD(&mod->source_list); 540 INIT_LIST_HEAD(&mod->source_list);
532 INIT_LIST_HEAD(&mod->target_list); 541 INIT_LIST_HEAD(&mod->target_list);
533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
537 542
538 /* Hold reference count during initialization. */ 543 /* Hold reference count during initialization. */
539 __this_cpu_write(mod->refptr->incs, 1); 544 __this_cpu_write(mod->refptr->incs, 1);
540 /* Backwards compatibility macros put refcount during init. */ 545 /* Backwards compatibility macros put refcount during init. */
541 mod->waiter = current; 546 mod->waiter = current;
547
548 return 0;
542} 549}
543 550
544/* Does a already use b? */ 551/* Does a already use b? */
@@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod)
618 kfree(use); 625 kfree(use);
619 } 626 }
620 mutex_unlock(&module_mutex); 627 mutex_unlock(&module_mutex);
628
629 free_percpu(mod->refptr);
621} 630}
622 631
623#ifdef CONFIG_MODULE_FORCE_UNLOAD 632#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -891,8 +900,9 @@ int ref_module(struct module *a, struct module *b)
891} 900}
892EXPORT_SYMBOL_GPL(ref_module); 901EXPORT_SYMBOL_GPL(ref_module);
893 902
894static inline void module_unload_init(struct module *mod) 903static inline int module_unload_init(struct module *mod)
895{ 904{
905 return 0;
896} 906}
897#endif /* CONFIG_MODULE_UNLOAD */ 907#endif /* CONFIG_MODULE_UNLOAD */
898 908
@@ -1051,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1051#endif /* CONFIG_MODVERSIONS */ 1061#endif /* CONFIG_MODVERSIONS */
1052 1062
1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */ 1063/* Resolve a symbol for this module. I.e. if we find one, record usage. */
1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1064static const struct kernel_symbol *resolve_symbol(struct module *mod,
1055 unsigned int versindex, 1065 const struct load_info *info,
1056 const char *name, 1066 const char *name,
1057 struct module *mod,
1058 char ownername[]) 1067 char ownername[])
1059{ 1068{
1060 struct module *owner; 1069 struct module *owner;
@@ -1068,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1068 if (!sym) 1077 if (!sym)
1069 goto unlock; 1078 goto unlock;
1070 1079
1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { 1080 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
1081 owner)) {
1072 sym = ERR_PTR(-EINVAL); 1082 sym = ERR_PTR(-EINVAL);
1073 goto getname; 1083 goto getname;
1074 } 1084 }
@@ -1087,21 +1097,20 @@ unlock:
1087 return sym; 1097 return sym;
1088} 1098}
1089 1099
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, 1100static const struct kernel_symbol *
1091 unsigned int versindex, 1101resolve_symbol_wait(struct module *mod,
1092 const char *name, 1102 const struct load_info *info,
1093 struct module *mod) 1103 const char *name)
1094{ 1104{
1095 const struct kernel_symbol *ksym; 1105 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN]; 1106 char owner[MODULE_NAME_LEN];
1097 1107
1098 if (wait_event_interruptible_timeout(module_wq, 1108 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, 1109 !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
1100 mod, ownername)) || 1110 || PTR_ERR(ksym) != -EBUSY,
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) { 1111 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 1112 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername); 1113 mod->name, owner);
1105 } 1114 }
1106 return ksym; 1115 return ksym;
1107} 1116}
@@ -1110,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1110 * /sys/module/foo/sections stuff 1119 * /sys/module/foo/sections stuff
1111 * J. Corbet <corbet@lwn.net> 1120 * J. Corbet <corbet@lwn.net>
1112 */ 1121 */
1113#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1122#ifdef CONFIG_SYSFS
1114 1123
1124#ifdef CONFIG_KALLSYMS
1115static inline bool sect_empty(const Elf_Shdr *sect) 1125static inline bool sect_empty(const Elf_Shdr *sect)
1116{ 1126{
1117 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1127 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1148,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1148 kfree(sect_attrs); 1158 kfree(sect_attrs);
1149} 1159}
1150 1160
1151static void add_sect_attrs(struct module *mod, unsigned int nsect, 1161static void add_sect_attrs(struct module *mod, const struct load_info *info)
1152 char *secstrings, Elf_Shdr *sechdrs)
1153{ 1162{
1154 unsigned int nloaded = 0, i, size[2]; 1163 unsigned int nloaded = 0, i, size[2];
1155 struct module_sect_attrs *sect_attrs; 1164 struct module_sect_attrs *sect_attrs;
@@ -1157,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1157 struct attribute **gattr; 1166 struct attribute **gattr;
1158 1167
1159 /* Count loaded sections and allocate structures */ 1168 /* Count loaded sections and allocate structures */
1160 for (i = 0; i < nsect; i++) 1169 for (i = 0; i < info->hdr->e_shnum; i++)
1161 if (!sect_empty(&sechdrs[i])) 1170 if (!sect_empty(&info->sechdrs[i]))
1162 nloaded++; 1171 nloaded++;
1163 size[0] = ALIGN(sizeof(*sect_attrs) 1172 size[0] = ALIGN(sizeof(*sect_attrs)
1164 + nloaded * sizeof(sect_attrs->attrs[0]), 1173 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1175,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1175 sect_attrs->nsections = 0; 1184 sect_attrs->nsections = 0;
1176 sattr = &sect_attrs->attrs[0]; 1185 sattr = &sect_attrs->attrs[0];
1177 gattr = &sect_attrs->grp.attrs[0]; 1186 gattr = &sect_attrs->grp.attrs[0];
1178 for (i = 0; i < nsect; i++) { 1187 for (i = 0; i < info->hdr->e_shnum; i++) {
1179 if (sect_empty(&sechdrs[i])) 1188 Elf_Shdr *sec = &info->sechdrs[i];
1189 if (sect_empty(sec))
1180 continue; 1190 continue;
1181 sattr->address = sechdrs[i].sh_addr; 1191 sattr->address = sec->sh_addr;
1182 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1192 sattr->name = kstrdup(info->secstrings + sec->sh_name,
1183 GFP_KERNEL); 1193 GFP_KERNEL);
1184 if (sattr->name == NULL) 1194 if (sattr->name == NULL)
1185 goto out; 1195 goto out;
@@ -1247,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1247 kfree(notes_attrs); 1257 kfree(notes_attrs);
1248} 1258}
1249 1259
1250static void add_notes_attrs(struct module *mod, unsigned int nsect, 1260static void add_notes_attrs(struct module *mod, const struct load_info *info)
1251 char *secstrings, Elf_Shdr *sechdrs)
1252{ 1261{
1253 unsigned int notes, loaded, i; 1262 unsigned int notes, loaded, i;
1254 struct module_notes_attrs *notes_attrs; 1263 struct module_notes_attrs *notes_attrs;
@@ -1260,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1260 1269
1261 /* Count notes sections and allocate structures. */ 1270 /* Count notes sections and allocate structures. */
1262 notes = 0; 1271 notes = 0;
1263 for (i = 0; i < nsect; i++) 1272 for (i = 0; i < info->hdr->e_shnum; i++)
1264 if (!sect_empty(&sechdrs[i]) && 1273 if (!sect_empty(&info->sechdrs[i]) &&
1265 (sechdrs[i].sh_type == SHT_NOTE)) 1274 (info->sechdrs[i].sh_type == SHT_NOTE))
1266 ++notes; 1275 ++notes;
1267 1276
1268 if (notes == 0) 1277 if (notes == 0)
@@ -1276,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1276 1285
1277 notes_attrs->notes = notes; 1286 notes_attrs->notes = notes;
1278 nattr = &notes_attrs->attrs[0]; 1287 nattr = &notes_attrs->attrs[0];
1279 for (loaded = i = 0; i < nsect; ++i) { 1288 for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
1280 if (sect_empty(&sechdrs[i])) 1289 if (sect_empty(&info->sechdrs[i]))
1281 continue; 1290 continue;
1282 if (sechdrs[i].sh_type == SHT_NOTE) { 1291 if (info->sechdrs[i].sh_type == SHT_NOTE) {
1283 sysfs_bin_attr_init(nattr); 1292 sysfs_bin_attr_init(nattr);
1284 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1293 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1285 nattr->attr.mode = S_IRUGO; 1294 nattr->attr.mode = S_IRUGO;
1286 nattr->size = sechdrs[i].sh_size; 1295 nattr->size = info->sechdrs[i].sh_size;
1287 nattr->private = (void *) sechdrs[i].sh_addr; 1296 nattr->private = (void *) info->sechdrs[i].sh_addr;
1288 nattr->read = module_notes_read; 1297 nattr->read = module_notes_read;
1289 ++nattr; 1298 ++nattr;
1290 } 1299 }
@@ -1315,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod)
1315 1324
1316#else 1325#else
1317 1326
1318static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1327static inline void add_sect_attrs(struct module *mod,
1319 char *sectstrings, Elf_Shdr *sechdrs) 1328 const struct load_info *info)
1320{ 1329{
1321} 1330}
1322 1331
@@ -1324,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod)
1324{ 1333{
1325} 1334}
1326 1335
1327static inline void add_notes_attrs(struct module *mod, unsigned int nsect, 1336static inline void add_notes_attrs(struct module *mod,
1328 char *sectstrings, Elf_Shdr *sechdrs) 1337 const struct load_info *info)
1329{ 1338{
1330} 1339}
1331 1340
1332static inline void remove_notes_attrs(struct module *mod) 1341static inline void remove_notes_attrs(struct module *mod)
1333{ 1342{
1334} 1343}
1335#endif 1344#endif /* CONFIG_KALLSYMS */
1336 1345
1337#ifdef CONFIG_SYSFS
1338static void add_usage_links(struct module *mod) 1346static void add_usage_links(struct module *mod)
1339{ 1347{
1340#ifdef CONFIG_MODULE_UNLOAD 1348#ifdef CONFIG_MODULE_UNLOAD
@@ -1439,6 +1447,7 @@ out:
1439} 1447}
1440 1448
1441static int mod_sysfs_setup(struct module *mod, 1449static int mod_sysfs_setup(struct module *mod,
1450 const struct load_info *info,
1442 struct kernel_param *kparam, 1451 struct kernel_param *kparam,
1443 unsigned int num_params) 1452 unsigned int num_params)
1444{ 1453{
@@ -1463,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod,
1463 goto out_unreg_param; 1472 goto out_unreg_param;
1464 1473
1465 add_usage_links(mod); 1474 add_usage_links(mod);
1475 add_sect_attrs(mod, info);
1476 add_notes_attrs(mod, info);
1466 1477
1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1478 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1468 return 0; 1479 return 0;
@@ -1479,33 +1490,26 @@ out:
1479 1490
1480static void mod_sysfs_fini(struct module *mod) 1491static void mod_sysfs_fini(struct module *mod)
1481{ 1492{
1493 remove_notes_attrs(mod);
1494 remove_sect_attrs(mod);
1482 kobject_put(&mod->mkobj.kobj); 1495 kobject_put(&mod->mkobj.kobj);
1483} 1496}
1484 1497
1485#else /* CONFIG_SYSFS */ 1498#else /* !CONFIG_SYSFS */
1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491 1499
1492static inline int mod_sysfs_setup(struct module *mod, 1500static int mod_sysfs_setup(struct module *mod,
1501 const struct load_info *info,
1493 struct kernel_param *kparam, 1502 struct kernel_param *kparam,
1494 unsigned int num_params) 1503 unsigned int num_params)
1495{ 1504{
1496 return 0; 1505 return 0;
1497} 1506}
1498 1507
1499static inline int module_add_modinfo_attrs(struct module *mod) 1508static void mod_sysfs_fini(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{ 1509{
1506} 1510}
1507 1511
1508static void mod_sysfs_fini(struct module *mod) 1512static void module_remove_modinfo_attrs(struct module *mod)
1509{ 1513{
1510} 1514}
1511 1515
@@ -1515,7 +1519,7 @@ static void del_usage_links(struct module *mod)
1515 1519
1516#endif /* CONFIG_SYSFS */ 1520#endif /* CONFIG_SYSFS */
1517 1521
1518static void mod_kobject_remove(struct module *mod) 1522static void mod_sysfs_teardown(struct module *mod)
1519{ 1523{
1520 del_usage_links(mod); 1524 del_usage_links(mod);
1521 module_remove_modinfo_attrs(mod); 1525 module_remove_modinfo_attrs(mod);
@@ -1533,6 +1537,7 @@ static int __unlink_module(void *_mod)
1533{ 1537{
1534 struct module *mod = _mod; 1538 struct module *mod = _mod;
1535 list_del(&mod->list); 1539 list_del(&mod->list);
1540 module_bug_cleanup(mod);
1536 return 0; 1541 return 0;
1537} 1542}
1538 1543
@@ -1545,9 +1550,7 @@ static void free_module(struct module *mod)
1545 mutex_lock(&module_mutex); 1550 mutex_lock(&module_mutex);
1546 stop_machine(__unlink_module, mod, NULL); 1551 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex); 1552 mutex_unlock(&module_mutex);
1548 remove_notes_attrs(mod); 1553 mod_sysfs_teardown(mod);
1549 remove_sect_attrs(mod);
1550 mod_kobject_remove(mod);
1551 1554
1552 /* Remove dynamic debug info */ 1555 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name); 1556 ddebug_remove_module(mod->name);
@@ -1565,10 +1568,7 @@ static void free_module(struct module *mod)
1565 module_free(mod, mod->module_init); 1568 module_free(mod, mod->module_init);
1566 kfree(mod->args); 1569 kfree(mod->args);
1567 percpu_modfree(mod); 1570 percpu_modfree(mod);
1568#if defined(CONFIG_MODULE_UNLOAD) 1571
1569 if (mod->refptr)
1570 free_percpu(mod->refptr);
1571#endif
1572 /* Free lock-classes: */ 1572 /* Free lock-classes: */
1573 lockdep_free_key_range(mod->module_core, mod->core_size); 1573 lockdep_free_key_range(mod->module_core, mod->core_size);
1574 1574
@@ -1634,25 +1634,23 @@ static int verify_export_symbols(struct module *mod)
1634} 1634}
1635 1635
1636/* Change all symbols so that st_value encodes the pointer directly. */ 1636/* Change all symbols so that st_value encodes the pointer directly. */
1637static int simplify_symbols(Elf_Shdr *sechdrs, 1637static int simplify_symbols(struct module *mod, const struct load_info *info)
1638 unsigned int symindex, 1638{
1639 const char *strtab, 1639 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1640 unsigned int versindex, 1640 Elf_Sym *sym = (void *)symsec->sh_addr;
1641 unsigned int pcpuindex,
1642 struct module *mod)
1643{
1644 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1645 unsigned long secbase; 1641 unsigned long secbase;
1646 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1642 unsigned int i;
1647 int ret = 0; 1643 int ret = 0;
1648 const struct kernel_symbol *ksym; 1644 const struct kernel_symbol *ksym;
1649 1645
1650 for (i = 1; i < n; i++) { 1646 for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
1647 const char *name = info->strtab + sym[i].st_name;
1648
1651 switch (sym[i].st_shndx) { 1649 switch (sym[i].st_shndx) {
1652 case SHN_COMMON: 1650 case SHN_COMMON:
1653 /* We compiled with -fno-common. These are not 1651 /* We compiled with -fno-common. These are not
1654 supposed to happen. */ 1652 supposed to happen. */
1655 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1653 DEBUGP("Common symbol: %s\n", name);
1656 printk("%s: please compile with -fno-common\n", 1654 printk("%s: please compile with -fno-common\n",
1657 mod->name); 1655 mod->name);
1658 ret = -ENOEXEC; 1656 ret = -ENOEXEC;
@@ -1665,9 +1663,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1665 break; 1663 break;
1666 1664
1667 case SHN_UNDEF: 1665 case SHN_UNDEF:
1668 ksym = resolve_symbol_wait(sechdrs, versindex, 1666 ksym = resolve_symbol_wait(mod, info, name);
1669 strtab + sym[i].st_name,
1670 mod);
1671 /* Ok if resolved. */ 1667 /* Ok if resolved. */
1672 if (ksym && !IS_ERR(ksym)) { 1668 if (ksym && !IS_ERR(ksym)) {
1673 sym[i].st_value = ksym->value; 1669 sym[i].st_value = ksym->value;
@@ -1679,17 +1675,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1679 break; 1675 break;
1680 1676
1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1677 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1682 mod->name, strtab + sym[i].st_name, 1678 mod->name, name, PTR_ERR(ksym));
1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT; 1679 ret = PTR_ERR(ksym) ?: -ENOENT;
1685 break; 1680 break;
1686 1681
1687 default: 1682 default:
1688 /* Divert to percpu allocation if a percpu var. */ 1683 /* Divert to percpu allocation if a percpu var. */
1689 if (sym[i].st_shndx == pcpuindex) 1684 if (sym[i].st_shndx == info->index.pcpu)
1690 secbase = (unsigned long)mod_percpu(mod); 1685 secbase = (unsigned long)mod_percpu(mod);
1691 else 1686 else
1692 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1687 secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
1693 sym[i].st_value += secbase; 1688 sym[i].st_value += secbase;
1694 break; 1689 break;
1695 } 1690 }
@@ -1698,6 +1693,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1698 return ret; 1693 return ret;
1699} 1694}
1700 1695
1696static int apply_relocations(struct module *mod, const struct load_info *info)
1697{
1698 unsigned int i;
1699 int err = 0;
1700
1701 /* Now do relocations. */
1702 for (i = 1; i < info->hdr->e_shnum; i++) {
1703 unsigned int infosec = info->sechdrs[i].sh_info;
1704
1705 /* Not a valid relocation section? */
1706 if (infosec >= info->hdr->e_shnum)
1707 continue;
1708
1709 /* Don't bother with non-allocated sections */
1710 if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
1711 continue;
1712
1713 if (info->sechdrs[i].sh_type == SHT_REL)
1714 err = apply_relocate(info->sechdrs, info->strtab,
1715 info->index.sym, i, mod);
1716 else if (info->sechdrs[i].sh_type == SHT_RELA)
1717 err = apply_relocate_add(info->sechdrs, info->strtab,
1718 info->index.sym, i, mod);
1719 if (err < 0)
1720 break;
1721 }
1722 return err;
1723}
1724
1701/* Additional bytes needed by arch in front of individual sections */ 1725/* Additional bytes needed by arch in front of individual sections */
1702unsigned int __weak arch_mod_section_prepend(struct module *mod, 1726unsigned int __weak arch_mod_section_prepend(struct module *mod,
1703 unsigned int section) 1727 unsigned int section)
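For reference, apply_relocations() above dispatches on the two standard ELF relocation record formats, which differ only in whether the addend is stored explicitly. A sketch of the generic 64-bit layouts (from the ELF specification, not from this patch; the kernel picks up the Elf_Rel/Elf_Rela typedefs via <linux/elf.h>):

typedef struct {
	Elf64_Addr  r_offset;	/* where in the section to apply the fix-up */
	Elf64_Xword r_info;	/* symbol table index + relocation type */
} Elf64_Rel;			/* SHT_REL: implicit addend read from the target */

typedef struct {
	Elf64_Addr   r_offset;
	Elf64_Xword  r_info;
	Elf64_Sxword r_addend;	/* SHT_RELA: explicit addend */
} Elf64_Rela;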
@@ -1722,10 +1746,7 @@ static long get_offset(struct module *mod, unsigned int *size,
1722 might -- code, read-only data, read-write data, small data. Tally 1746 might -- code, read-only data, read-write data, small data. Tally
1723 sizes, and place the offsets into sh_entsize fields: high bit means it 1747 sizes, and place the offsets into sh_entsize fields: high bit means it
1724 belongs in init. */ 1748 belongs in init. */
1725static void layout_sections(struct module *mod, 1749static void layout_sections(struct module *mod, struct load_info *info)
1726 const Elf_Ehdr *hdr,
1727 Elf_Shdr *sechdrs,
1728 const char *secstrings)
1729{ 1750{
1730 static unsigned long const masks[][2] = { 1751 static unsigned long const masks[][2] = {
1731 /* NOTE: all executable code must be the first section 1752 /* NOTE: all executable code must be the first section
@@ -1738,21 +1759,22 @@ static void layout_sections(struct module *mod,
1738 }; 1759 };
1739 unsigned int m, i; 1760 unsigned int m, i;
1740 1761
1741 for (i = 0; i < hdr->e_shnum; i++) 1762 for (i = 0; i < info->hdr->e_shnum; i++)
1742 sechdrs[i].sh_entsize = ~0UL; 1763 info->sechdrs[i].sh_entsize = ~0UL;
1743 1764
1744 DEBUGP("Core section allocation order:\n"); 1765 DEBUGP("Core section allocation order:\n");
1745 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1766 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1746 for (i = 0; i < hdr->e_shnum; ++i) { 1767 for (i = 0; i < info->hdr->e_shnum; ++i) {
1747 Elf_Shdr *s = &sechdrs[i]; 1768 Elf_Shdr *s = &info->sechdrs[i];
1769 const char *sname = info->secstrings + s->sh_name;
1748 1770
1749 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1771 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1750 || (s->sh_flags & masks[m][1]) 1772 || (s->sh_flags & masks[m][1])
1751 || s->sh_entsize != ~0UL 1773 || s->sh_entsize != ~0UL
1752 || strstarts(secstrings + s->sh_name, ".init")) 1774 || strstarts(sname, ".init"))
1753 continue; 1775 continue;
1754 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1776 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1755 	DEBUGP("\t%s\n", secstrings + s->sh_name); 1777 	DEBUGP("\t%s\n", sname);
1756 } 1778 }
1757 if (m == 0) 1779 if (m == 0)
1758 mod->core_text_size = mod->core_size; 1780 mod->core_text_size = mod->core_size;
@@ -1760,17 +1782,18 @@ static void layout_sections(struct module *mod,
1760 1782
1761 DEBUGP("Init section allocation order:\n"); 1783 DEBUGP("Init section allocation order:\n");
1762 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1784 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1763 for (i = 0; i < hdr->e_shnum; ++i) { 1785 for (i = 0; i < info->hdr->e_shnum; ++i) {
1764 Elf_Shdr *s = &sechdrs[i]; 1786 Elf_Shdr *s = &info->sechdrs[i];
1787 const char *sname = info->secstrings + s->sh_name;
1765 1788
1766 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1789 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1767 || (s->sh_flags & masks[m][1]) 1790 || (s->sh_flags & masks[m][1])
1768 || s->sh_entsize != ~0UL 1791 || s->sh_entsize != ~0UL
1769 || !strstarts(secstrings + s->sh_name, ".init")) 1792 || !strstarts(sname, ".init"))
1770 continue; 1793 continue;
1771 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1794 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1772 | INIT_OFFSET_MASK); 1795 | INIT_OFFSET_MASK);
1773 DEBUGP("\t%s\n", secstrings + s->sh_name); 1796 DEBUGP("\t%s\n", sname);
1774 } 1797 }
1775 if (m == 0) 1798 if (m == 0)
1776 mod->init_text_size = mod->init_size; 1799 mod->init_text_size = mod->init_size;
@@ -1809,33 +1832,28 @@ static char *next_string(char *string, unsigned long *secsize)
1809 return string; 1832 return string;
1810} 1833}
1811 1834
1812static char *get_modinfo(Elf_Shdr *sechdrs, 1835static char *get_modinfo(struct load_info *info, const char *tag)
1813 unsigned int info,
1814 const char *tag)
1815{ 1836{
1816 char *p; 1837 char *p;
1817 unsigned int taglen = strlen(tag); 1838 unsigned int taglen = strlen(tag);
1818 unsigned long size = sechdrs[info].sh_size; 1839 Elf_Shdr *infosec = &info->sechdrs[info->index.info];
1840 unsigned long size = infosec->sh_size;
1819 1841
1820 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1842 for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
1821 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1843 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1822 return p + taglen + 1; 1844 return p + taglen + 1;
1823 } 1845 }
1824 return NULL; 1846 return NULL;
1825} 1847}
1826 1848
1827static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1849static void setup_modinfo(struct module *mod, struct load_info *info)
1828 unsigned int infoindex)
1829{ 1850{
1830 struct module_attribute *attr; 1851 struct module_attribute *attr;
1831 int i; 1852 int i;
1832 1853
1833 for (i = 0; (attr = modinfo_attrs[i]); i++) { 1854 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1834 if (attr->setup) 1855 if (attr->setup)
1835 attr->setup(mod, 1856 attr->setup(mod, get_modinfo(info, attr->attr.name));
1836 get_modinfo(sechdrs,
1837 infoindex,
1838 attr->attr.name));
1839 } 1857 }
1840} 1858}
1841 1859
@@ -1876,11 +1894,10 @@ static int is_exported(const char *name, unsigned long value,
1876} 1894}
1877 1895
1878/* As per nm */ 1896/* As per nm */
1879static char elf_type(const Elf_Sym *sym, 1897static char elf_type(const Elf_Sym *sym, const struct load_info *info)
1880 Elf_Shdr *sechdrs,
1881 const char *secstrings,
1882 struct module *mod)
1883{ 1898{
1899 const Elf_Shdr *sechdrs = info->sechdrs;
1900
1884 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1901 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1885 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1902 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1886 return 'v'; 1903 return 'v';
@@ -1910,8 +1927,10 @@ static char elf_type(const Elf_Sym *sym,
1910 else 1927 else
1911 return 'b'; 1928 return 'b';
1912 } 1929 }
1913 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) 1930 if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
1931 ".debug")) {
1914 return 'n'; 1932 return 'n';
1933 }
1915 return '?'; 1934 return '?';
1916} 1935}
1917 1936
@@ -1936,127 +1955,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1936 return true; 1955 return true;
1937} 1956}
1938 1957
1939static unsigned long layout_symtab(struct module *mod, 1958static void layout_symtab(struct module *mod, struct load_info *info)
1940 Elf_Shdr *sechdrs,
1941 unsigned int symindex,
1942 unsigned int strindex,
1943 const Elf_Ehdr *hdr,
1944 const char *secstrings,
1945 unsigned long *pstroffs,
1946 unsigned long *strmap)
1947{ 1959{
1948 unsigned long symoffs; 1960 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
1949 Elf_Shdr *symsect = sechdrs + symindex; 1961 Elf_Shdr *strsect = info->sechdrs + info->index.str;
1950 Elf_Shdr *strsect = sechdrs + strindex;
1951 const Elf_Sym *src; 1962 const Elf_Sym *src;
1952 const char *strtab;
1953 unsigned int i, nsrc, ndst; 1963 unsigned int i, nsrc, ndst;
1954 1964
1955 /* Put symbol section at end of init part of module. */ 1965 /* Put symbol section at end of init part of module. */
1956 symsect->sh_flags |= SHF_ALLOC; 1966 symsect->sh_flags |= SHF_ALLOC;
1957 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 1967 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1958 symindex) | INIT_OFFSET_MASK; 1968 info->index.sym) | INIT_OFFSET_MASK;
1959 DEBUGP("\t%s\n", secstrings + symsect->sh_name); 1969 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
1960 1970
1961 src = (void *)hdr + symsect->sh_offset; 1971 src = (void *)info->hdr + symsect->sh_offset;
1962 nsrc = symsect->sh_size / sizeof(*src); 1972 nsrc = symsect->sh_size / sizeof(*src);
1963 strtab = (void *)hdr + strsect->sh_offset;
1964 for (ndst = i = 1; i < nsrc; ++i, ++src) 1973 for (ndst = i = 1; i < nsrc; ++i, ++src)
1965 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { 1974 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
1966 unsigned int j = src->st_name; 1975 unsigned int j = src->st_name;
1967 1976
1968 while(!__test_and_set_bit(j, strmap) && strtab[j]) 1977 while (!__test_and_set_bit(j, info->strmap)
1978 && info->strtab[j])
1969 ++j; 1979 ++j;
1970 ++ndst; 1980 ++ndst;
1971 } 1981 }
1972 1982
1973 /* Append room for core symbols at end of core part. */ 1983 /* Append room for core symbols at end of core part. */
1974 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 1984 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1975 mod->core_size = symoffs + ndst * sizeof(Elf_Sym); 1985 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
1976 1986
1977 /* Put string table section at end of init part of module. */ 1987 /* Put string table section at end of init part of module. */
1978 strsect->sh_flags |= SHF_ALLOC; 1988 strsect->sh_flags |= SHF_ALLOC;
1979 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 1989 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1980 strindex) | INIT_OFFSET_MASK; 1990 info->index.str) | INIT_OFFSET_MASK;
1981 DEBUGP("\t%s\n", secstrings + strsect->sh_name); 1991 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
1982 1992
1983 /* Append room for core symbols' strings at end of core part. */ 1993 /* Append room for core symbols' strings at end of core part. */
1984 *pstroffs = mod->core_size; 1994 info->stroffs = mod->core_size;
1985 __set_bit(0, strmap); 1995 __set_bit(0, info->strmap);
1986 mod->core_size += bitmap_weight(strmap, strsect->sh_size); 1996 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
1987
1988 return symoffs;
1989} 1997}
1990 1998
1991static void add_kallsyms(struct module *mod, 1999static void add_kallsyms(struct module *mod, const struct load_info *info)
1992 Elf_Shdr *sechdrs,
1993 unsigned int shnum,
1994 unsigned int symindex,
1995 unsigned int strindex,
1996 unsigned long symoffs,
1997 unsigned long stroffs,
1998 const char *secstrings,
1999 unsigned long *strmap)
2000{ 2000{
2001 unsigned int i, ndst; 2001 unsigned int i, ndst;
2002 const Elf_Sym *src; 2002 const Elf_Sym *src;
2003 Elf_Sym *dst; 2003 Elf_Sym *dst;
2004 char *s; 2004 char *s;
2005 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
2005 2006
2006 mod->symtab = (void *)sechdrs[symindex].sh_addr; 2007 mod->symtab = (void *)symsec->sh_addr;
2007 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 2008 mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2008 mod->strtab = (void *)sechdrs[strindex].sh_addr; 2009 /* Make sure we get permanent strtab: don't use info->strtab. */
2010 mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2009 2011
2010 /* Set types up while we still have access to sections. */ 2012 /* Set types up while we still have access to sections. */
2011 for (i = 0; i < mod->num_symtab; i++) 2013 for (i = 0; i < mod->num_symtab; i++)
2012 mod->symtab[i].st_info 2014 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2013 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
2014 2015
2015 mod->core_symtab = dst = mod->module_core + symoffs; 2016 mod->core_symtab = dst = mod->module_core + info->symoffs;
2016 src = mod->symtab; 2017 src = mod->symtab;
2017 *dst = *src; 2018 *dst = *src;
2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2019 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2019 if (!is_core_symbol(src, sechdrs, shnum)) 2020 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2020 continue; 2021 continue;
2021 dst[ndst] = *src; 2022 dst[ndst] = *src;
2022 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); 2023 dst[ndst].st_name = bitmap_weight(info->strmap,
2024 dst[ndst].st_name);
2023 ++ndst; 2025 ++ndst;
2024 } 2026 }
2025 mod->core_num_syms = ndst; 2027 mod->core_num_syms = ndst;
2026 2028
2027 mod->core_strtab = s = mod->module_core + stroffs; 2029 mod->core_strtab = s = mod->module_core + info->stroffs;
2028 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) 2030 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2029 if (test_bit(i, strmap)) 2031 if (test_bit(i, info->strmap))
2030 *++s = mod->strtab[i]; 2032 *++s = mod->strtab[i];
2031} 2033}
2032#else 2034#else
2033static inline unsigned long layout_symtab(struct module *mod, 2035static inline void layout_symtab(struct module *mod, struct load_info *info)
2034 Elf_Shdr *sechdrs,
2035 unsigned int symindex,
2036 unsigned int strindex,
2037 const Elf_Ehdr *hdr,
2038 const char *secstrings,
2039 unsigned long *pstroffs,
2040 unsigned long *strmap)
2041{ 2036{
2042 return 0;
2043} 2037}
2044 2038
2045static inline void add_kallsyms(struct module *mod, 2039static void add_kallsyms(struct module *mod, struct load_info *info)
2046 Elf_Shdr *sechdrs,
2047 unsigned int shnum,
2048 unsigned int symindex,
2049 unsigned int strindex,
2050 unsigned long symoffs,
2051 unsigned long stroffs,
2052 const char *secstrings,
2053 const unsigned long *strmap)
2054{ 2040{
2055} 2041}
2056#endif /* CONFIG_KALLSYMS */ 2042#endif /* CONFIG_KALLSYMS */
2057 2043
2058static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) 2044static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2059{ 2045{
2046 if (!debug)
2047 return;
2060#ifdef CONFIG_DYNAMIC_DEBUG 2048#ifdef CONFIG_DYNAMIC_DEBUG
2061 if (ddebug_add_module(debug, num, debug->modname)) 2049 if (ddebug_add_module(debug, num, debug->modname))
2062 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2050 printk(KERN_ERR "dynamic debug error adding module: %s\n",
@@ -2087,65 +2075,47 @@ static void *module_alloc_update_bounds(unsigned long size)
2087} 2075}
2088 2076
2089#ifdef CONFIG_DEBUG_KMEMLEAK 2077#ifdef CONFIG_DEBUG_KMEMLEAK
2090static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2078static void kmemleak_load_module(const struct module *mod,
2091 Elf_Shdr *sechdrs, char *secstrings) 2079 const struct load_info *info)
2092{ 2080{
2093 unsigned int i; 2081 unsigned int i;
2094 2082
2095 /* only scan the sections containing data */ 2083 /* only scan the sections containing data */
2096 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2084 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2097 2085
2098 for (i = 1; i < hdr->e_shnum; i++) { 2086 for (i = 1; i < info->hdr->e_shnum; i++) {
2099 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2087 const char *name = info->secstrings + info->sechdrs[i].sh_name;
2088 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
2100 continue; 2089 continue;
2101 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 2090 if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
2102 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2103 continue; 2091 continue;
2104 2092
2105 kmemleak_scan_area((void *)sechdrs[i].sh_addr, 2093 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
2106 sechdrs[i].sh_size, GFP_KERNEL); 2094 info->sechdrs[i].sh_size, GFP_KERNEL);
2107 } 2095 }
2108} 2096}
2109#else 2097#else
2110static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2098static inline void kmemleak_load_module(const struct module *mod,
2111 Elf_Shdr *sechdrs, char *secstrings) 2099 const struct load_info *info)
2112{ 2100{
2113} 2101}
2114#endif 2102#endif
2115 2103
2116/* Allocate and load the module: note that size of section 0 is always 2104/* Sets info->hdr and info->len. */
2117 zero, and we rely on this for optional sections. */ 2105static int copy_and_check(struct load_info *info,
2118static noinline struct module *load_module(void __user *umod, 2106 const void __user *umod, unsigned long len,
2119 unsigned long len, 2107 const char __user *uargs)
2120 const char __user *uargs)
2121{ 2108{
2109 int err;
2122 Elf_Ehdr *hdr; 2110 Elf_Ehdr *hdr;
2123 Elf_Shdr *sechdrs;
2124 char *secstrings, *args, *modmagic, *strtab = NULL;
2125 char *staging;
2126 unsigned int i;
2127 unsigned int symindex = 0;
2128 unsigned int strindex = 0;
2129 unsigned int modindex, versindex, infoindex, pcpuindex;
2130 struct module *mod;
2131 long err = 0;
2132 void *ptr = NULL; /* Stops spurious gcc warning */
2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
2137 2111
2138 mm_segment_t old_fs;
2139
2140 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2141 umod, len, uargs);
2142 if (len < sizeof(*hdr)) 2112 if (len < sizeof(*hdr))
2143 return ERR_PTR(-ENOEXEC); 2113 return -ENOEXEC;
2144 2114
2145 /* Suck in entire file: we'll want most of it. */ 2115 /* Suck in entire file: we'll want most of it. */
2146 /* vmalloc barfs on "unusual" numbers. Check here */ 2116 /* vmalloc barfs on "unusual" numbers. Check here */
2147 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 2117 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2148 return ERR_PTR(-ENOMEM); 2118 return -ENOMEM;
2149 2119
2150 if (copy_from_user(hdr, umod, len) != 0) { 2120 if (copy_from_user(hdr, umod, len) != 0) {
2151 err = -EFAULT; 2121 err = -EFAULT;
@@ -2153,135 +2123,225 @@ static noinline struct module *load_module(void __user *umod,
2153 } 2123 }
2154 2124
2155 /* Sanity checks against insmoding binaries or wrong arch, 2125 /* Sanity checks against insmoding binaries or wrong arch,
2156 weird elf version */ 2126 weird elf version */
2157 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2127 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2158 || hdr->e_type != ET_REL 2128 || hdr->e_type != ET_REL
2159 || !elf_check_arch(hdr) 2129 || !elf_check_arch(hdr)
2160 || hdr->e_shentsize != sizeof(*sechdrs)) { 2130 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2161 err = -ENOEXEC; 2131 err = -ENOEXEC;
2162 goto free_hdr; 2132 goto free_hdr;
2163 } 2133 }
2164 2134
2165 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 2135 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2166 goto truncated; 2136 err = -ENOEXEC;
2137 goto free_hdr;
2138 }
2167 2139
2168 /* Convenience variables */ 2140 info->hdr = hdr;
2169 sechdrs = (void *)hdr + hdr->e_shoff; 2141 info->len = len;
2170 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 2142 return 0;
2171 sechdrs[0].sh_addr = 0;
2172 2143
2173 for (i = 1; i < hdr->e_shnum; i++) { 2144free_hdr:
2174 if (sechdrs[i].sh_type != SHT_NOBITS 2145 vfree(hdr);
2175 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 2146 return err;
2176 goto truncated; 2147}
2148
2149static void free_copy(struct load_info *info)
2150{
2151 vfree(info->hdr);
2152}
2153
2154static int rewrite_section_headers(struct load_info *info)
2155{
2156 unsigned int i;
2157
2158 /* This should always be true, but let's be sure. */
2159 info->sechdrs[0].sh_addr = 0;
2160
2161 for (i = 1; i < info->hdr->e_shnum; i++) {
2162 Elf_Shdr *shdr = &info->sechdrs[i];
2163 if (shdr->sh_type != SHT_NOBITS
2164 && info->len < shdr->sh_offset + shdr->sh_size) {
2165 printk(KERN_ERR "Module len %lu truncated\n",
2166 info->len);
2167 return -ENOEXEC;
2168 }
2177 2169
2178 /* Mark all sections sh_addr with their address in the 2170 /* Mark all sections sh_addr with their address in the
2179 temporary image. */ 2171 temporary image. */
2180 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 2172 shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
2181 2173
2182 /* Internal symbols and strings. */
2183 if (sechdrs[i].sh_type == SHT_SYMTAB) {
2184 symindex = i;
2185 strindex = sechdrs[i].sh_link;
2186 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
2187 }
2188#ifndef CONFIG_MODULE_UNLOAD 2174#ifndef CONFIG_MODULE_UNLOAD
2189 /* Don't load .exit sections */ 2175 /* Don't load .exit sections */
2190 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 2176 if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
2191 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 2177 shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
2192#endif 2178#endif
2193 } 2179 }
2194 2180
2195 modindex = find_sec(hdr, sechdrs, secstrings, 2181 /* Track but don't keep modinfo and version sections. */
2196 ".gnu.linkonce.this_module"); 2182 info->index.vers = find_sec(info, "__versions");
2197 if (!modindex) { 2183 info->index.info = find_sec(info, ".modinfo");
2184 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2185 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2186 return 0;
2187}
2188
2189/*
2190 * Set up our basic convenience variables (pointers to section headers,
2191 * search for module section index etc), and do some basic section
2192 * verification.
2193 *
2194 * Return the temporary module pointer (we'll replace it with the final
2195 * one when we move the module sections around).
2196 */
2197static struct module *setup_load_info(struct load_info *info)
2198{
2199 unsigned int i;
2200 int err;
2201 struct module *mod;
2202
2203 /* Set up the convenience variables */
2204 info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
2205 info->secstrings = (void *)info->hdr
2206 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2207
2208 err = rewrite_section_headers(info);
2209 if (err)
2210 return ERR_PTR(err);
2211
2212 /* Find internal symbols and strings. */
2213 for (i = 1; i < info->hdr->e_shnum; i++) {
2214 if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
2215 info->index.sym = i;
2216 info->index.str = info->sechdrs[i].sh_link;
2217 info->strtab = (char *)info->hdr
2218 + info->sechdrs[info->index.str].sh_offset;
2219 break;
2220 }
2221 }
2222
2223 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2224 if (!info->index.mod) {
2198 printk(KERN_WARNING "No module found in object\n"); 2225 printk(KERN_WARNING "No module found in object\n");
2199 err = -ENOEXEC; 2226 return ERR_PTR(-ENOEXEC);
2200 goto free_hdr;
2201 } 2227 }
2202 /* This is temporary: point mod into copy of data. */ 2228 /* This is temporary: point mod into copy of data. */
2203 mod = (void *)sechdrs[modindex].sh_addr; 2229 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2204 2230
2205 if (symindex == 0) { 2231 if (info->index.sym == 0) {
2206 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2232 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
2207 mod->name); 2233 mod->name);
2208 err = -ENOEXEC; 2234 return ERR_PTR(-ENOEXEC);
2209 goto free_hdr;
2210 } 2235 }
2211 2236
2212 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 2237 info->index.pcpu = find_pcpusec(info);
2213 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
2214 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
2215
2216 /* Don't keep modinfo and version sections. */
2217 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2218 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2219 2238
2220 /* Check module struct version now, before we try to use module. */ 2239 /* Check module struct version now, before we try to use module. */
2221 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2240 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
2222 err = -ENOEXEC; 2241 return ERR_PTR(-ENOEXEC);
2223 goto free_hdr; 2242
2224 } 2243 return mod;
2244}
2245
2246static int check_modinfo(struct module *mod, struct load_info *info)
2247{
2248 const char *modmagic = get_modinfo(info, "vermagic");
2249 int err;
2225 2250
2226 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2227 /* This is allowed: modprobe --force will invalidate it. */ 2251 /* This is allowed: modprobe --force will invalidate it. */
2228 if (!modmagic) { 2252 if (!modmagic) {
2229 err = try_to_force_load(mod, "bad vermagic"); 2253 err = try_to_force_load(mod, "bad vermagic");
2230 if (err) 2254 if (err)
2231 goto free_hdr; 2255 return err;
2232 } else if (!same_magic(modmagic, vermagic, versindex)) { 2256 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2233 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2257 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
2234 mod->name, modmagic, vermagic); 2258 mod->name, modmagic, vermagic);
2235 err = -ENOEXEC; 2259 return -ENOEXEC;
2236 goto free_hdr;
2237 } 2260 }
2238 2261
2239 staging = get_modinfo(sechdrs, infoindex, "staging"); 2262 if (get_modinfo(info, "staging")) {
2240 if (staging) {
2241 add_taint_module(mod, TAINT_CRAP); 2263 add_taint_module(mod, TAINT_CRAP);
2242 printk(KERN_WARNING "%s: module is from the staging directory," 2264 printk(KERN_WARNING "%s: module is from the staging directory,"
2243 " the quality is unknown, you have been warned.\n", 2265 " the quality is unknown, you have been warned.\n",
2244 mod->name); 2266 mod->name);
2245 } 2267 }
2246 2268
2247 /* Now copy in args */ 2269 /* Set up license info based on the info section */
2248 args = strndup_user(uargs, ~0UL >> 1); 2270 set_license(mod, get_modinfo(info, "license"));
2249 if (IS_ERR(args)) {
2250 err = PTR_ERR(args);
2251 goto free_hdr;
2252 }
2253 2271
2254 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) 2272 return 0;
2255 * sizeof(long), GFP_KERNEL); 2273}
2256 if (!strmap) {
2257 err = -ENOMEM;
2258 goto free_mod;
2259 }
2260 2274
2261 mod->state = MODULE_STATE_COMING; 2275static void find_module_sections(struct module *mod, struct load_info *info)
2276{
2277 mod->kp = section_objs(info, "__param",
2278 sizeof(*mod->kp), &mod->num_kp);
2279 mod->syms = section_objs(info, "__ksymtab",
2280 sizeof(*mod->syms), &mod->num_syms);
2281 mod->crcs = section_addr(info, "__kcrctab");
2282 mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
2283 sizeof(*mod->gpl_syms),
2284 &mod->num_gpl_syms);
2285 mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
2286 mod->gpl_future_syms = section_objs(info,
2287 "__ksymtab_gpl_future",
2288 sizeof(*mod->gpl_future_syms),
2289 &mod->num_gpl_future_syms);
2290 mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
2262 2291
2263 /* Allow arches to frob section contents and sizes. */ 2292#ifdef CONFIG_UNUSED_SYMBOLS
2264 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 2293 mod->unused_syms = section_objs(info, "__ksymtab_unused",
2265 if (err < 0) 2294 sizeof(*mod->unused_syms),
2266 goto free_mod; 2295 &mod->num_unused_syms);
2296 mod->unused_crcs = section_addr(info, "__kcrctab_unused");
2297 mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
2298 sizeof(*mod->unused_gpl_syms),
2299 &mod->num_unused_gpl_syms);
2300 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2301#endif
2302#ifdef CONFIG_CONSTRUCTORS
2303 mod->ctors = section_objs(info, ".ctors",
2304 sizeof(*mod->ctors), &mod->num_ctors);
2305#endif
2267 2306
2268 if (pcpuindex) { 2307#ifdef CONFIG_TRACEPOINTS
2269 /* We have a special allocation for this section. */ 2308 mod->tracepoints = section_objs(info, "__tracepoints",
2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, 2309 sizeof(*mod->tracepoints),
2271 sechdrs[pcpuindex].sh_addralign); 2310 &mod->num_tracepoints);
2272 if (err) 2311#endif
2273 goto free_mod; 2312#ifdef CONFIG_EVENT_TRACING
2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2313 mod->trace_events = section_objs(info, "_ftrace_events",
2275 } 2314 sizeof(*mod->trace_events),
2276 /* Keep this around for failure path. */ 2315 &mod->num_trace_events);
2277 percpu = mod_percpu(mod); 2316 /*
2317 * This section contains pointers to allocated objects in the trace
2318 * code and not scanning it leads to false positives.
2319 */
2320 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2321 mod->num_trace_events, GFP_KERNEL);
2322#endif
2323#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2324 /* sechdrs[0].sh_size is always zero */
2325 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
2326 sizeof(*mod->ftrace_callsites),
2327 &mod->num_ftrace_callsites);
2328#endif
2278 2329
2279 /* Determine total sizes, and put offsets in sh_entsize. For now 2330 mod->extable = section_objs(info, "__ex_table",
2280 this is done generically; there doesn't appear to be any 2331 sizeof(*mod->extable), &mod->num_exentries);
2281 special cases for the architectures. */ 2332
2282 layout_sections(mod, hdr, sechdrs, secstrings); 2333 if (section_addr(info, "__obsparm"))
2283 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, 2334 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2284 secstrings, &stroffs, strmap); 2335 mod->name);
2336
2337 info->debug = section_objs(info, "__verbose",
2338 sizeof(*info->debug), &info->num_debug);
2339}
2340
2341static int move_module(struct module *mod, struct load_info *info)
2342{
2343 int i;
2344 void *ptr;
2285 2345
2286 /* Do the allocs. */ 2346 /* Do the allocs. */
2287 ptr = module_alloc_update_bounds(mod->core_size); 2347 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2291,10 +2351,9 @@ static noinline struct module *load_module(void __user *umod,
2291 * leak. 2351 * leak.
2292 */ 2352 */
2293 kmemleak_not_leak(ptr); 2353 kmemleak_not_leak(ptr);
2294 if (!ptr) { 2354 if (!ptr)
2295 err = -ENOMEM; 2355 return -ENOMEM;
2296 goto free_percpu; 2356
2297 }
2298 memset(ptr, 0, mod->core_size); 2357 memset(ptr, 0, mod->core_size);
2299 mod->module_core = ptr; 2358 mod->module_core = ptr;
2300 2359
@@ -2307,50 +2366,40 @@ static noinline struct module *load_module(void __user *umod,
2307 */ 2366 */
2308 kmemleak_ignore(ptr); 2367 kmemleak_ignore(ptr);
2309 if (!ptr && mod->init_size) { 2368 if (!ptr && mod->init_size) {
2310 err = -ENOMEM; 2369 module_free(mod, mod->module_core);
2311 goto free_core; 2370 return -ENOMEM;
2312 } 2371 }
2313 memset(ptr, 0, mod->init_size); 2372 memset(ptr, 0, mod->init_size);
2314 mod->module_init = ptr; 2373 mod->module_init = ptr;
2315 2374
2316 /* Transfer each section which specifies SHF_ALLOC */ 2375 /* Transfer each section which specifies SHF_ALLOC */
2317 DEBUGP("final section addresses:\n"); 2376 DEBUGP("final section addresses:\n");
2318 for (i = 0; i < hdr->e_shnum; i++) { 2377 for (i = 0; i < info->hdr->e_shnum; i++) {
2319 void *dest; 2378 void *dest;
2379 Elf_Shdr *shdr = &info->sechdrs[i];
2320 2380
2321 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2381 if (!(shdr->sh_flags & SHF_ALLOC))
2322 continue; 2382 continue;
2323 2383
2324 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 2384 if (shdr->sh_entsize & INIT_OFFSET_MASK)
2325 dest = mod->module_init 2385 dest = mod->module_init
2326 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 2386 + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
2327 else 2387 else
2328 dest = mod->module_core + sechdrs[i].sh_entsize; 2388 dest = mod->module_core + shdr->sh_entsize;
2329 2389
2330 if (sechdrs[i].sh_type != SHT_NOBITS) 2390 if (shdr->sh_type != SHT_NOBITS)
2331 memcpy(dest, (void *)sechdrs[i].sh_addr, 2391 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2332 sechdrs[i].sh_size);
2333 /* Update sh_addr to point to copy in image. */ 2392 /* Update sh_addr to point to copy in image. */
2334 sechdrs[i].sh_addr = (unsigned long)dest; 2393 shdr->sh_addr = (unsigned long)dest;
2335 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 2394 DEBUGP("\t0x%lx %s\n",
2336 } 2395 shdr->sh_addr, info->secstrings + shdr->sh_name);
2337 /* Module has been moved. */
2338 mod = (void *)sechdrs[modindex].sh_addr;
2339 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2340
2341#if defined(CONFIG_MODULE_UNLOAD)
2342 mod->refptr = alloc_percpu(struct module_ref);
2343 if (!mod->refptr) {
2344 err = -ENOMEM;
2345 goto free_init;
2346 } 2396 }
2347#endif
2348 /* Now we've moved module, initialize linked lists, etc. */
2349 module_unload_init(mod);
2350 2397
2351 /* Set up license info based on the info section */ 2398 return 0;
2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2399}
2353 2400
2401static int check_module_license_and_versions(struct module *mod)
2402{
2354 /* 2403 /*
2355 * ndiswrapper is under GPL by itself, but loads proprietary modules. 2404 * ndiswrapper is under GPL by itself, but loads proprietary modules.
2356 * Don't use add_taint_module(), as it would prevent ndiswrapper from 2405 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2363,77 +2412,6 @@ static noinline struct module *load_module(void __user *umod,
2363 if (strcmp(mod->name, "driverloader") == 0) 2412 if (strcmp(mod->name, "driverloader") == 0)
2364 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2413 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2365 2414
2366 /* Set up MODINFO_ATTR fields */
2367 setup_modinfo(mod, sechdrs, infoindex);
2368
2369 /* Fix up syms, so that st_value is a pointer to location. */
2370 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
2371 mod);
2372 if (err < 0)
2373 goto cleanup;
2374
2375 /* Now we've got everything in the final locations, we can
2376 * find optional sections. */
2377 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2378 sizeof(*mod->kp), &mod->num_kp);
2379 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2380 sizeof(*mod->syms), &mod->num_syms);
2381 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2382 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2383 sizeof(*mod->gpl_syms),
2384 &mod->num_gpl_syms);
2385 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2386 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2387 "__ksymtab_gpl_future",
2388 sizeof(*mod->gpl_future_syms),
2389 &mod->num_gpl_future_syms);
2390 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2391 "__kcrctab_gpl_future");
2392
2393#ifdef CONFIG_UNUSED_SYMBOLS
2394 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2395 "__ksymtab_unused",
2396 sizeof(*mod->unused_syms),
2397 &mod->num_unused_syms);
2398 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2399 "__kcrctab_unused");
2400 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2401 "__ksymtab_unused_gpl",
2402 sizeof(*mod->unused_gpl_syms),
2403 &mod->num_unused_gpl_syms);
2404 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2405 "__kcrctab_unused_gpl");
2406#endif
2407#ifdef CONFIG_CONSTRUCTORS
2408 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2409 sizeof(*mod->ctors), &mod->num_ctors);
2410#endif
2411
2412#ifdef CONFIG_TRACEPOINTS
2413 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2414 "__tracepoints",
2415 sizeof(*mod->tracepoints),
2416 &mod->num_tracepoints);
2417#endif
2418#ifdef CONFIG_EVENT_TRACING
2419 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2420 "_ftrace_events",
2421 sizeof(*mod->trace_events),
2422 &mod->num_trace_events);
2423 /*
2424 * This section contains pointers to allocated objects in the trace
2425 * code and not scanning it leads to false positives.
2426 */
2427 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2428 mod->num_trace_events, GFP_KERNEL);
2429#endif
2430#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2431 /* sechdrs[0].sh_size is always zero */
2432 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2433 "__mcount_loc",
2434 sizeof(*mod->ftrace_callsites),
2435 &mod->num_ftrace_callsites);
2436#endif
2437#ifdef CONFIG_MODVERSIONS 2415#ifdef CONFIG_MODVERSIONS
2438 if ((mod->num_syms && !mod->crcs) 2416 if ((mod->num_syms && !mod->crcs)
2439 || (mod->num_gpl_syms && !mod->gpl_crcs) 2417 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2443,56 +2421,16 @@ static noinline struct module *load_module(void __user *umod,
2443 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2421 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2444#endif 2422#endif
2445 ) { 2423 ) {
2446 err = try_to_force_load(mod, 2424 return try_to_force_load(mod,
2447 "no versions for exported symbols"); 2425 "no versions for exported symbols");
2448 if (err)
2449 goto cleanup;
2450 } 2426 }
2451#endif 2427#endif
2428 return 0;
2429}
2452 2430
2453 /* Now do relocations. */ 2431static void flush_module_icache(const struct module *mod)
2454 for (i = 1; i < hdr->e_shnum; i++) { 2432{
2455 const char *strtab = (char *)sechdrs[strindex].sh_addr; 2433 mm_segment_t old_fs;
2456 unsigned int info = sechdrs[i].sh_info;
2457
2458 /* Not a valid relocation section? */
2459 if (info >= hdr->e_shnum)
2460 continue;
2461
2462 /* Don't bother with non-allocated sections */
2463 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
2464 continue;
2465
2466 if (sechdrs[i].sh_type == SHT_REL)
2467 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
2468 else if (sechdrs[i].sh_type == SHT_RELA)
2469 err = apply_relocate_add(sechdrs, strtab, symindex, i,
2470 mod);
2471 if (err < 0)
2472 goto cleanup;
2473 }
2474
2475 /* Set up and sort exception table */
2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2477 sizeof(*mod->extable), &mod->num_exentries);
2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2479
2480 /* Finally, copy percpu area over. */
2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2482 sechdrs[pcpuindex].sh_size);
2483
2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2485 symoffs, stroffs, secstrings, strmap);
2486 kfree(strmap);
2487 strmap = NULL;
2488
2489 if (!mod->taints)
2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2491 sizeof(*debug), &num_debug);
2492
2493 err = module_finalize(hdr, sechdrs, mod);
2494 if (err < 0)
2495 goto cleanup;
2496 2434
2497 /* flush the icache in correct context */ 2435 /* flush the icache in correct context */
2498 old_fs = get_fs(); 2436 old_fs = get_fs();
@@ -2511,11 +2449,160 @@ static noinline struct module *load_module(void __user *umod,
2511 (unsigned long)mod->module_core + mod->core_size); 2449 (unsigned long)mod->module_core + mod->core_size);
2512 2450
2513 set_fs(old_fs); 2451 set_fs(old_fs);
2452}
2514 2453
2515 mod->args = args; 2454static struct module *layout_and_allocate(struct load_info *info)
2516 if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) 2455{
2517 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2456 /* Module within temporary copy. */
2518 mod->name); 2457 struct module *mod;
2458 Elf_Shdr *pcpusec;
2459 int err;
2460
2461 mod = setup_load_info(info);
2462 if (IS_ERR(mod))
2463 return mod;
2464
2465 err = check_modinfo(mod, info);
2466 if (err)
2467 return ERR_PTR(err);
2468
2469 /* Allow arches to frob section contents and sizes. */
2470 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2471 info->secstrings, mod);
2472 if (err < 0)
2473 goto out;
2474
2475 pcpusec = &info->sechdrs[info->index.pcpu];
2476 if (pcpusec->sh_size) {
2477 /* We have a special allocation for this section. */
2478 err = percpu_modalloc(mod,
2479 pcpusec->sh_size, pcpusec->sh_addralign);
2480 if (err)
2481 goto out;
2482 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2483 }
2484
2485 /* Determine total sizes, and put offsets in sh_entsize. For now
2486 this is done generically; there doesn't appear to be any
2487 special cases for the architectures. */
2488 layout_sections(mod, info);
2489
2490 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2491 * sizeof(long), GFP_KERNEL);
2492 if (!info->strmap) {
2493 err = -ENOMEM;
2494 goto free_percpu;
2495 }
2496 layout_symtab(mod, info);
2497
2498 /* Allocate and move to the final place */
2499 err = move_module(mod, info);
2500 if (err)
2501 goto free_strmap;
2502
2503 /* Module has been copied to its final place now: return it. */
2504 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2505 kmemleak_load_module(mod, info);
2506 return mod;
2507
2508free_strmap:
2509 kfree(info->strmap);
2510free_percpu:
2511 percpu_modfree(mod);
2512out:
2513 return ERR_PTR(err);
2514}
2515
2516/* mod is no longer valid after this! */
2517static void module_deallocate(struct module *mod, struct load_info *info)
2518{
2519 kfree(info->strmap);
2520 percpu_modfree(mod);
2521 module_free(mod, mod->module_init);
2522 module_free(mod, mod->module_core);
2523}
2524
2525static int post_relocation(struct module *mod, const struct load_info *info)
2526{
2527 /* Sort exception table now relocations are done. */
2528 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2529
2530 /* Copy relocated percpu area over. */
2531 percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
2532 info->sechdrs[info->index.pcpu].sh_size);
2533
2534 /* Setup kallsyms-specific fields. */
2535 add_kallsyms(mod, info);
2536
2537 /* Arch-specific module finalizing. */
2538 return module_finalize(info->hdr, info->sechdrs, mod);
2539}
2540
2541/* Allocate and load the module: note that size of section 0 is always
2542 zero, and we rely on this for optional sections. */
2543static struct module *load_module(void __user *umod,
2544 unsigned long len,
2545 const char __user *uargs)
2546{
2547 struct load_info info = { NULL, };
2548 struct module *mod;
2549 long err;
2550
2551 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2552 umod, len, uargs);
2553
2554 /* Copy in the blobs from userspace, check they are vaguely sane. */
2555 err = copy_and_check(&info, umod, len, uargs);
2556 if (err)
2557 return ERR_PTR(err);
2558
2559 /* Figure out module layout, and allocate all the memory. */
2560 mod = layout_and_allocate(&info);
2561 if (IS_ERR(mod)) {
2562 err = PTR_ERR(mod);
2563 goto free_copy;
2564 }
2565
2566 /* Now module is in final location, initialize linked lists, etc. */
2567 err = module_unload_init(mod);
2568 if (err)
2569 goto free_module;
2570
2571 /* Now we've got everything in the final locations, we can
2572 * find optional sections. */
2573 find_module_sections(mod, &info);
2574
2575 err = check_module_license_and_versions(mod);
2576 if (err)
2577 goto free_unload;
2578
2579 /* Set up MODINFO_ATTR fields */
2580 setup_modinfo(mod, &info);
2581
2582 /* Fix up syms, so that st_value is a pointer to location. */
2583 err = simplify_symbols(mod, &info);
2584 if (err < 0)
2585 goto free_modinfo;
2586
2587 err = apply_relocations(mod, &info);
2588 if (err < 0)
2589 goto free_modinfo;
2590
2591 err = post_relocation(mod, &info);
2592 if (err < 0)
2593 goto free_modinfo;
2594
2595 flush_module_icache(mod);
2596
2597 /* Now copy in args */
2598 mod->args = strndup_user(uargs, ~0UL >> 1);
2599 if (IS_ERR(mod->args)) {
2600 err = PTR_ERR(mod->args);
2601 goto free_arch_cleanup;
2602 }
2603
2604 /* Mark state as coming so strong_try_module_get() ignores us. */
2605 mod->state = MODULE_STATE_COMING;
2519 2606
2520 /* Now sew it into the lists so we can get lockdep and oops 2607 /* Now sew it into the lists so we can get lockdep and oops
2521 * info during argument parsing. No one should access us, since 2608
@@ -2530,70 +2617,61 @@ static noinline struct module *load_module(void __user *umod,
2530 goto unlock; 2617 goto unlock;
2531 } 2618 }
2532 2619
2533 if (debug) 2620 /* This has to be done once we're sure module name is unique. */
2534 dynamic_debug_setup(debug, num_debug); 2621 if (!mod->taints)
2622 dynamic_debug_setup(info.debug, info.num_debug);
2535 2623
2536 /* Find duplicate symbols */ 2624 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod); 2625 err = verify_export_symbols(mod);
2538 if (err < 0) 2626 if (err < 0)
2539 goto ddebug; 2627 goto ddebug;
2540 2628
2629 module_bug_finalize(info.hdr, info.sechdrs, mod);
2541 list_add_rcu(&mod->list, &modules); 2630 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex); 2631 mutex_unlock(&module_mutex);
2543 2632
2633 /* Module is ready to execute: parsing args may do that. */
2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2634 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2545 if (err < 0) 2635 if (err < 0)
2546 goto unlink; 2636 goto unlink;
2547 2637
2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2638 /* Link into sysfs. */
2639 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
2549 if (err < 0) 2640 if (err < 0)
2550 goto unlink; 2641 goto unlink;
2551 2642
2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2643 /* Get rid of temporary copy and strmap. */
2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2644 kfree(info.strmap);
2554 2645 free_copy(&info);
2555 /* Get rid of temporary copy */
2556 vfree(hdr);
2557
2558 trace_module_load(mod);
2559 2646
2560 /* Done! */ 2647 /* Done! */
2648 trace_module_load(mod);
2561 return mod; 2649 return mod;
2562 2650
2563 unlink: 2651 unlink:
2564 mutex_lock(&module_mutex); 2652 mutex_lock(&module_mutex);
2565 /* Unlink carefully: kallsyms could be walking list. */ 2653 /* Unlink carefully: kallsyms could be walking list. */
2566 list_del_rcu(&mod->list); 2654 list_del_rcu(&mod->list);
2655 module_bug_cleanup(mod);
2656
2567 ddebug: 2657 ddebug:
2568 dynamic_debug_remove(debug); 2658 if (!mod->taints)
2659 dynamic_debug_remove(info.debug);
2569 unlock: 2660 unlock:
2570 mutex_unlock(&module_mutex); 2661 mutex_unlock(&module_mutex);
2571 synchronize_sched(); 2662 synchronize_sched();
2663 kfree(mod->args);
2664 free_arch_cleanup:
2572 module_arch_cleanup(mod); 2665 module_arch_cleanup(mod);
2573 cleanup: 2666 free_modinfo:
2574 free_modinfo(mod); 2667 free_modinfo(mod);
2668 free_unload:
2575 module_unload_free(mod); 2669 module_unload_free(mod);
2576#if defined(CONFIG_MODULE_UNLOAD) 2670 free_module:
2577 free_percpu(mod->refptr); 2671 module_deallocate(mod, &info);
2578 free_init: 2672 free_copy:
2579#endif 2673 free_copy(&info);
2580 module_free(mod, mod->module_init);
2581 free_core:
2582 module_free(mod, mod->module_core);
2583 /* mod will be freed with core. Don't access it beyond this line! */
2584 free_percpu:
2585 free_percpu(percpu);
2586 free_mod:
2587 kfree(args);
2588 kfree(strmap);
2589 free_hdr:
2590 vfree(hdr);
2591 return ERR_PTR(err); 2674 return ERR_PTR(err);
2592
2593 truncated:
2594 printk(KERN_ERR "Module len %lu truncated\n", len);
2595 err = -ENOEXEC;
2596 goto free_hdr;
2597} 2675}
2598 2676
2599/* Call module constructors. */ 2677/* Call module constructors. */
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4c0b7b3e6d2e..200407c1502f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -36,15 +36,6 @@
36# include <asm/mutex.h> 36# include <asm/mutex.h>
37#endif 37#endif
38 38
39/***
40 * mutex_init - initialize the mutex
41 * @lock: the mutex to be initialized
42 * @key: the lock_class_key for the class; used by mutex lock debugging
43 *
44 * Initialize the mutex to unlocked state.
45 *
46 * It is not allowed to initialize an already locked mutex.
47 */
48void 39void
49__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 40__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
50{ 41{
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
68static __used noinline void __sched 59static __used noinline void __sched
69__mutex_lock_slowpath(atomic_t *lock_count); 60__mutex_lock_slowpath(atomic_t *lock_count);
70 61
71/*** 62/**
72 * mutex_lock - acquire the mutex 63 * mutex_lock - acquire the mutex
73 * @lock: the mutex to be acquired 64 * @lock: the mutex to be acquired
74 * 65 *
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock);
105 96
106static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 97static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
107 98
108/*** 99/**
109 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
110 * @lock: the mutex to be released 101 * @lock: the mutex to be released
111 * 102 *
@@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count);
364static noinline int __sched 355static noinline int __sched
365__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 356__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
366 357
367/*** 358/**
368 * mutex_lock_interruptible - acquire the mutex, interruptable 359 * mutex_lock_interruptible - acquire the mutex, interruptible
369 * @lock: the mutex to be acquired 360 * @lock: the mutex to be acquired
370 * 361 *
371 * Lock the mutex like mutex_lock(), and return 0 if the mutex has 362 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
@@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
456 return prev == 1; 447 return prev == 1;
457} 448}
458 449
459/*** 450/**
460 * mutex_trylock - try acquire the mutex, without waiting 451 * mutex_trylock - try to acquire the mutex, without waiting
461 * @lock: the mutex to be acquired 452 * @lock: the mutex to be acquired
462 * 453 *
463 * Try to acquire the mutex atomically. Returns 1 if the mutex 454 * Try to acquire the mutex atomically. Returns 1 if the mutex
464 * has been acquired successfully, and 0 on contention. 455 * has been acquired successfully, and 0 on contention.
465 * 456 *
466 * NOTE: this function follows the spin_trylock() convention, so 457 * NOTE: this function follows the spin_trylock() convention, so
467 * it is negated to the down_trylock() return values! Be careful 458 * it is negated from the down_trylock() return values! Be careful
468 * about this when converting semaphore users to mutexes. 459 * about this when converting semaphore users to mutexes.
469 * 460 *
470 * This function must not be used in interrupt context. The 461 * This function must not be used in interrupt context. The
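The mutex.c changes are documentation-only: the stray "/***" openers become proper "/**" kernel-doc markers and the trylock wording is tightened. Since the corrected comment describes a convention that regularly trips up semaphore conversions, here is a minimal usage sketch (illustrative lock name, not taken from the patch):

	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_lock);	/* illustrative */

	static void example_trylock_user(void)
	{
		/*
		 * mutex_trylock() follows the spin_trylock() convention: 1 means
		 * the lock was acquired, 0 means contention -- the opposite sense
		 * of down_trylock(), which returns 0 on success.
		 */
		if (mutex_trylock(&example_lock)) {
			/* ... short critical section ... */
			mutex_unlock(&example_lock);
		}
	}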
diff --git a/kernel/panic.c b/kernel/panic.c
index 3b16cd93fa7d..4c13b1a88ebb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,9 @@
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h> 25#include <linux/dmi.h>
26 26
27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18
29
27int panic_on_oops; 30int panic_on_oops;
28static unsigned long tainted_mask; 31static unsigned long tainted_mask;
29static int pause_on_oops; 32static int pause_on_oops;
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 39
37EXPORT_SYMBOL(panic_notifier_list); 40EXPORT_SYMBOL(panic_notifier_list);
38 41
39/* Returns how long it waited in ms */ 42static long no_blink(int state)
40long (*panic_blink)(long time);
41EXPORT_SYMBOL(panic_blink);
42
43static void panic_blink_one_second(void)
44{ 43{
45 static long i = 0, end; 44 return 0;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67} 45}
68 46
47/* Returns how long it waited in ms */
48long (*panic_blink)(int state);
49EXPORT_SYMBOL(panic_blink);
50
69/** 51/**
70 * panic - halt the system 52 * panic - halt the system
71 * @fmt: The text string to print 53 * @fmt: The text string to print
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...)
78{ 60{
79 static char buf[1024]; 61 static char buf[1024];
80 va_list args; 62 va_list args;
81 long i; 63 long i, i_next = 0;
64 int state = 0;
82 65
83 /* 66 /*
84 * It's possible to come here directly from a panic-assertion and 67 * It's possible to come here directly from a panic-assertion and
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...)
117 100
118 bust_spinlocks(0); 101 bust_spinlocks(0);
119 102
103 if (!panic_blink)
104 panic_blink = no_blink;
105
120 if (panic_timeout > 0) { 106 if (panic_timeout > 0) {
121 /* 107 /*
122 * Delay timeout seconds before rebooting the machine. 108 * Delay timeout seconds before rebooting the machine.
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...)
124 */ 110 */
125 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 111 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
126 112
127 for (i = 0; i < panic_timeout; i++) { 113 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
128 touch_nmi_watchdog(); 114 touch_nmi_watchdog();
129 panic_blink_one_second(); 115 if (i >= i_next) {
116 i += panic_blink(state ^= 1);
117 i_next = i + 3600 / PANIC_BLINK_SPD;
118 }
119 mdelay(PANIC_TIMER_STEP);
130 } 120 }
131 /* 121 /*
132 * This will not be a clean reboot, with everything 122 * This will not be a clean reboot, with everything
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...)
152 } 142 }
153#endif 143#endif
154 local_irq_enable(); 144 local_irq_enable();
155 while (1) { 145 for (i = 0; ; i += PANIC_TIMER_STEP) {
156 touch_softlockup_watchdog(); 146 touch_softlockup_watchdog();
157 panic_blink_one_second(); 147 if (i >= i_next) {
148 i += panic_blink(state ^= 1);
149 i_next = i + 3600 / PANIC_BLINK_SPD;
150 }
151 mdelay(PANIC_TIMER_STEP);
158 } 152 }
159} 153}
160 154
@@ -344,7 +338,7 @@ static int init_oops_id(void)
344} 338}
345late_initcall(init_oops_id); 339late_initcall(init_oops_id);
346 340
347static void print_oops_end_marker(void) 341void print_oops_end_marker(void)
348{ 342{
349 init_oops_id(); 343 init_oops_id();
350 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 344 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
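After the panic.c rework, panic() drives the blink hook itself: it toggles 'state' every 3600/PANIC_BLINK_SPD (200) ms while spinning in PANIC_TIMER_STEP (100 ms) mdelay steps, and adds whatever the hook returns to its elapsed-time counter; no_blink() is installed when nothing registered a handler. A hypothetical driver-side handler could look like the sketch below (board_led_set() is a placeholder, and the matching 'long (*panic_blink)(int state)' declaration change in linux/kernel.h is assumed, since it is not part of the hunks shown):

	#include <linux/kernel.h>
	#include <linux/init.h>

	/* Placeholder for whatever really drives the LED on this board. */
	static void board_led_set(int on)
	{
	}

	/* 'state' alternates 0/1 on each call; return the number of extra
	 * milliseconds spent here so panic() can fold it into its timer. */
	static long board_panic_blink(int state)
	{
		board_led_set(state);
		return 0;
	}

	static int __init board_panic_blink_init(void)
	{
		panic_blink = board_panic_blink;
		return 0;
	}
	late_initcall(board_panic_blink_init);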
diff --git a/kernel/params.c b/kernel/params.c
index 0b30ecd53a52..08107d181758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,6 +31,42 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock);
36
37/* This just allows us to keep track of which parameters are kmalloced. */
38struct kmalloced_param {
39 struct list_head list;
40 char val[];
41};
42static LIST_HEAD(kmalloced_params);
43
44static void *kmalloc_parameter(unsigned int size)
45{
46 struct kmalloced_param *p;
47
48 p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
49 if (!p)
50 return NULL;
51
52 list_add(&p->list, &kmalloced_params);
53 return p->val;
54}
55
56/* Does nothing if parameter wasn't kmalloced above. */
57static void maybe_kfree_parameter(void *param)
58{
59 struct kmalloced_param *p;
60
61 list_for_each_entry(p, &kmalloced_params, list) {
62 if (p->val == param) {
63 list_del(&p->list);
64 kfree(p);
65 break;
66 }
67 }
68}
69
34static inline char dash2underscore(char c) 70static inline char dash2underscore(char c)
35{ 71{
36 if (c == '-') 72 if (c == '-')
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname)
49 85
50static int parse_one(char *param, 86static int parse_one(char *param,
51 char *val, 87 char *val,
52 struct kernel_param *params, 88 const struct kernel_param *params,
53 unsigned num_params, 89 unsigned num_params,
54 int (*handle_unknown)(char *param, char *val)) 90 int (*handle_unknown)(char *param, char *val))
55{ 91{
56 unsigned int i; 92 unsigned int i;
93 int err;
57 94
58 /* Find parameter */ 95 /* Find parameter */
59 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
60 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL;
61 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
62 params[i].set); 102 params[i].ops->set);
63 return params[i].set(val, &params[i]); 103 mutex_lock(&param_lock);
104 err = params[i].ops->set(val, &params[i]);
105 mutex_unlock(&param_lock);
106 return err;
64 } 107 }
65 } 108 }
66 109
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val)
128/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 171/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
129int parse_args(const char *name, 172int parse_args(const char *name,
130 char *args, 173 char *args,
131 struct kernel_param *params, 174 const struct kernel_param *params,
132 unsigned num, 175 unsigned num,
133 int (*unknown)(char *param, char *val)) 176 int (*unknown)(char *param, char *val))
134{ 177{
@@ -176,22 +219,29 @@ int parse_args(const char *name,
176 219
177/* Lazy bastard, eh? */ 220/* Lazy bastard, eh? */
178#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 221#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
179 int param_set_##name(const char *val, struct kernel_param *kp) \ 222 int param_set_##name(const char *val, const struct kernel_param *kp) \
180 { \ 223 { \
181 tmptype l; \ 224 tmptype l; \
182 int ret; \ 225 int ret; \
183 \ 226 \
184 if (!val) return -EINVAL; \
185 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
186 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret == -EINVAL || ((type)l != l)) \
187 return -EINVAL; \ 229 return -EINVAL; \
188 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
189 return 0; \ 231 return 0; \
190 } \ 232 } \
191 int param_get_##name(char *buffer, struct kernel_param *kp) \ 233 int param_get_##name(char *buffer, const struct kernel_param *kp) \
192 { \ 234 { \
193 return sprintf(buffer, format, *((type *)kp->arg)); \ 235 return sprintf(buffer, format, *((type *)kp->arg)); \
194 } 236 } \
237 struct kernel_param_ops param_ops_##name = { \
238 .set = param_set_##name, \
239 .get = param_get_##name, \
240 }; \
241 EXPORT_SYMBOL(param_set_##name); \
242 EXPORT_SYMBOL(param_get_##name); \
243 EXPORT_SYMBOL(param_ops_##name)
244
195 245
196STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 246STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
197STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 247STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
201STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 251STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
202STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 252STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
203 253
204int param_set_charp(const char *val, struct kernel_param *kp) 254int param_set_charp(const char *val, const struct kernel_param *kp)
205{ 255{
206 if (!val) {
207 printk(KERN_ERR "%s: string parameter expected\n",
208 kp->name);
209 return -EINVAL;
210 }
211
212 if (strlen(val) > 1024) { 256 if (strlen(val) > 1024) {
213 printk(KERN_ERR "%s: string parameter too long\n", 257 printk(KERN_ERR "%s: string parameter too long\n",
214 kp->name); 258 kp->name);
215 return -ENOSPC; 259 return -ENOSPC;
216 } 260 }
217 261
218 /* This is a hack. We can't need to strdup in early boot, and we 262 maybe_kfree_parameter(*(char **)kp->arg);
263
264 /* This is a hack. We can't kmalloc in early boot, and we
219 * don't need to; this mangled commandline is preserved. */ 265 * don't need to; this mangled commandline is preserved. */
220 if (slab_is_available()) { 266 if (slab_is_available()) {
221 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 267 *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
222 if (!*(char **)kp->arg) 268 if (!*(char **)kp->arg)
223 return -ENOMEM; 269 return -ENOMEM;
270 strcpy(*(char **)kp->arg, val);
224 } else 271 } else
225 *(const char **)kp->arg = val; 272 *(const char **)kp->arg = val;
226 273
227 return 0; 274 return 0;
228} 275}
276EXPORT_SYMBOL(param_set_charp);
229 277
230int param_get_charp(char *buffer, struct kernel_param *kp) 278int param_get_charp(char *buffer, const struct kernel_param *kp)
231{ 279{
232 return sprintf(buffer, "%s", *((char **)kp->arg)); 280 return sprintf(buffer, "%s", *((char **)kp->arg));
233} 281}
282EXPORT_SYMBOL(param_get_charp);
283
284static void param_free_charp(void *arg)
285{
286 maybe_kfree_parameter(*((char **)arg));
287}
288
289struct kernel_param_ops param_ops_charp = {
290 .set = param_set_charp,
291 .get = param_get_charp,
292 .free = param_free_charp,
293};
294EXPORT_SYMBOL(param_ops_charp);
234 295
235/* Actually could be a bool or an int, for historical reasons. */ 296/* Actually could be a bool or an int, for historical reasons. */
236int param_set_bool(const char *val, struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
237{ 298{
238 bool v; 299 bool v;
239 300
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp)
258 *(int *)kp->arg = v; 319 *(int *)kp->arg = v;
259 return 0; 320 return 0;
260} 321}
322EXPORT_SYMBOL(param_set_bool);
261 323
262int param_get_bool(char *buffer, struct kernel_param *kp) 324int param_get_bool(char *buffer, const struct kernel_param *kp)
263{ 325{
264 bool val; 326 bool val;
265 if (kp->flags & KPARAM_ISBOOL) 327 if (kp->flags & KPARAM_ISBOOL)
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
270 /* Y and N chosen as being relatively non-coder friendly */ 332 /* Y and N chosen as being relatively non-coder friendly */
271 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 333 return sprintf(buffer, "%c", val ? 'Y' : 'N');
272} 334}
335EXPORT_SYMBOL(param_get_bool);
336
337struct kernel_param_ops param_ops_bool = {
338 .set = param_set_bool,
339 .get = param_get_bool,
340};
341EXPORT_SYMBOL(param_ops_bool);
273 342
274/* This one must be bool. */ 343/* This one must be bool. */
275int param_set_invbool(const char *val, struct kernel_param *kp) 344int param_set_invbool(const char *val, const struct kernel_param *kp)
276{ 345{
277 int ret; 346 int ret;
278 bool boolval; 347 bool boolval;
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
285 *(bool *)kp->arg = !boolval; 354 *(bool *)kp->arg = !boolval;
286 return ret; 355 return ret;
287} 356}
357EXPORT_SYMBOL(param_set_invbool);
288 358
289int param_get_invbool(char *buffer, struct kernel_param *kp) 359int param_get_invbool(char *buffer, const struct kernel_param *kp)
290{ 360{
291 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); 361 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
292} 362}
363EXPORT_SYMBOL(param_get_invbool);
364
365struct kernel_param_ops param_ops_invbool = {
366 .set = param_set_invbool,
367 .get = param_get_invbool,
368};
369EXPORT_SYMBOL(param_ops_invbool);
293 370
294/* We break the rule and mangle the string. */ 371/* We break the rule and mangle the string. */
295static int param_array(const char *name, 372static int param_array(const char *name,
296 const char *val, 373 const char *val,
297 unsigned int min, unsigned int max, 374 unsigned int min, unsigned int max,
298 void *elem, int elemsize, 375 void *elem, int elemsize,
299 int (*set)(const char *, struct kernel_param *kp), 376 int (*set)(const char *, const struct kernel_param *kp),
300 u16 flags, 377 u16 flags,
301 unsigned int *num) 378 unsigned int *num)
302{ 379{
@@ -309,12 +386,6 @@ static int param_array(const char *name,
309 kp.arg = elem; 386 kp.arg = elem;
310 kp.flags = flags; 387 kp.flags = flags;
311 388
312 /* No equals sign? */
313 if (!val) {
314 printk(KERN_ERR "%s: expects arguments\n", name);
315 return -EINVAL;
316 }
317
318 *num = 0; 389 *num = 0;
319 /* We expect a comma-separated list of values. */ 390 /* We expect a comma-separated list of values. */
320 do { 391 do {
@@ -330,6 +401,7 @@ static int param_array(const char *name,
330 /* nul-terminate and parse */ 401 /* nul-terminate and parse */
331 save = val[len]; 402 save = val[len];
332 ((char *)val)[len] = '\0'; 403 ((char *)val)[len] = '\0';
404 BUG_ON(!mutex_is_locked(&param_lock));
333 ret = set(val, &kp); 405 ret = set(val, &kp);
334 406
335 if (ret != 0) 407 if (ret != 0)
@@ -347,17 +419,17 @@ static int param_array(const char *name,
347 return 0; 419 return 0;
348} 420}
349 421
350int param_array_set(const char *val, struct kernel_param *kp) 422static int param_array_set(const char *val, const struct kernel_param *kp)
351{ 423{
352 const struct kparam_array *arr = kp->arr; 424 const struct kparam_array *arr = kp->arr;
353 unsigned int temp_num; 425 unsigned int temp_num;
354 426
355 return param_array(kp->name, val, 1, arr->max, arr->elem, 427 return param_array(kp->name, val, 1, arr->max, arr->elem,
356 arr->elemsize, arr->set, kp->flags, 428 arr->elemsize, arr->ops->set, kp->flags,
357 arr->num ?: &temp_num); 429 arr->num ?: &temp_num);
358} 430}
359 431
360int param_array_get(char *buffer, struct kernel_param *kp) 432static int param_array_get(char *buffer, const struct kernel_param *kp)
361{ 433{
362 int i, off, ret; 434 int i, off, ret;
363 const struct kparam_array *arr = kp->arr; 435 const struct kparam_array *arr = kp->arr;
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp)
368 if (i) 440 if (i)
369 buffer[off++] = ','; 441 buffer[off++] = ',';
370 p.arg = arr->elem + arr->elemsize * i; 442 p.arg = arr->elem + arr->elemsize * i;
371 ret = arr->get(buffer + off, &p); 443 BUG_ON(!mutex_is_locked(&param_lock));
444 ret = arr->ops->get(buffer + off, &p);
372 if (ret < 0) 445 if (ret < 0)
373 return ret; 446 return ret;
374 off += ret; 447 off += ret;
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp)
377 return off; 450 return off;
378} 451}
379 452
380int param_set_copystring(const char *val, struct kernel_param *kp) 453static void param_array_free(void *arg)
454{
455 unsigned int i;
456 const struct kparam_array *arr = arg;
457
458 if (arr->ops->free)
459 for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
460 arr->ops->free(arr->elem + arr->elemsize * i);
461}
462
463struct kernel_param_ops param_array_ops = {
464 .set = param_array_set,
465 .get = param_array_get,
466 .free = param_array_free,
467};
468EXPORT_SYMBOL(param_array_ops);
469
470int param_set_copystring(const char *val, const struct kernel_param *kp)
381{ 471{
382 const struct kparam_string *kps = kp->str; 472 const struct kparam_string *kps = kp->str;
383 473
384 if (!val) {
385 printk(KERN_ERR "%s: missing param set value\n", kp->name);
386 return -EINVAL;
387 }
388 if (strlen(val)+1 > kps->maxlen) { 474 if (strlen(val)+1 > kps->maxlen) {
389 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 475 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
390 kp->name, kps->maxlen-1); 476 kp->name, kps->maxlen-1);
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
393 strcpy(kps->string, val); 479 strcpy(kps->string, val);
394 return 0; 480 return 0;
395} 481}
482EXPORT_SYMBOL(param_set_copystring);
396 483
397int param_get_string(char *buffer, struct kernel_param *kp) 484int param_get_string(char *buffer, const struct kernel_param *kp)
398{ 485{
399 const struct kparam_string *kps = kp->str; 486 const struct kparam_string *kps = kp->str;
400 return strlcpy(buffer, kps->string, kps->maxlen); 487 return strlcpy(buffer, kps->string, kps->maxlen);
401} 488}
489EXPORT_SYMBOL(param_get_string);
490
491struct kernel_param_ops param_ops_string = {
492 .set = param_set_copystring,
493 .get = param_get_string,
494};
495EXPORT_SYMBOL(param_ops_string);
402 496
403/* sysfs output in /sys/modules/XYZ/parameters/ */ 497/* sysfs output in /sys/modules/XYZ/parameters/ */
404#define to_module_attr(n) container_of(n, struct module_attribute, attr) 498#define to_module_attr(n) container_of(n, struct module_attribute, attr)
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[];
409struct param_attribute 503struct param_attribute
410{ 504{
411 struct module_attribute mattr; 505 struct module_attribute mattr;
412 struct kernel_param *param; 506 const struct kernel_param *param;
413}; 507};
414 508
415struct module_param_attrs 509struct module_param_attrs
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
428 int count; 522 int count;
429 struct param_attribute *attribute = to_param_attr(mattr); 523 struct param_attribute *attribute = to_param_attr(mattr);
430 524
431 if (!attribute->param->get) 525 if (!attribute->param->ops->get)
432 return -EPERM; 526 return -EPERM;
433 527
434 count = attribute->param->get(buf, attribute->param); 528 mutex_lock(&param_lock);
529 count = attribute->param->ops->get(buf, attribute->param);
530 mutex_unlock(&param_lock);
435 if (count > 0) { 531 if (count > 0) {
436 strcat(buf, "\n"); 532 strcat(buf, "\n");
437 ++count; 533 ++count;
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
447 int err; 543 int err;
448 struct param_attribute *attribute = to_param_attr(mattr); 544 struct param_attribute *attribute = to_param_attr(mattr);
449 545
450 if (!attribute->param->set) 546 if (!attribute->param->ops->set)
451 return -EPERM; 547 return -EPERM;
452 548
453 err = attribute->param->set(buf, attribute->param); 549 mutex_lock(&param_lock);
550 err = attribute->param->ops->set(buf, attribute->param);
551 mutex_unlock(&param_lock);
454 if (!err) 552 if (!err)
455 return len; 553 return len;
456 return err; 554 return err;
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
464#endif 562#endif
465 563
466#ifdef CONFIG_SYSFS 564#ifdef CONFIG_SYSFS
565void __kernel_param_lock(void)
566{
567 mutex_lock(&param_lock);
568}
569EXPORT_SYMBOL(__kernel_param_lock);
570
571void __kernel_param_unlock(void)
572{
573 mutex_unlock(&param_lock);
574}
575EXPORT_SYMBOL(__kernel_param_unlock);
576
467/* 577/*
468 * add_sysfs_param - add a parameter to sysfs 578 * add_sysfs_param - add a parameter to sysfs
469 * @mk: struct module_kobject 579 * @mk: struct module_kobject
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
475 * if there's an error. 585 * if there's an error.
476 */ 586 */
477static __modinit int add_sysfs_param(struct module_kobject *mk, 587static __modinit int add_sysfs_param(struct module_kobject *mk,
478 struct kernel_param *kp, 588 const struct kernel_param *kp,
479 const char *name) 589 const char *name)
480{ 590{
481 struct module_param_attrs *new; 591 struct module_param_attrs *new;
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk)
557 * /sys/module/[mod->name]/parameters/ 667 * /sys/module/[mod->name]/parameters/
558 */ 668 */
559int module_param_sysfs_setup(struct module *mod, 669int module_param_sysfs_setup(struct module *mod,
560 struct kernel_param *kparam, 670 const struct kernel_param *kparam,
561 unsigned int num_params) 671 unsigned int num_params)
562{ 672{
563 int i, err; 673 int i, err;
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod)
602 712
603void destroy_params(const struct kernel_param *params, unsigned num) 713void destroy_params(const struct kernel_param *params, unsigned num)
604{ 714{
605 /* FIXME: This should free kmalloced charp parameters. It doesn't. */ 715 unsigned int i;
716
717 for (i = 0; i < num; i++)
718 if (params[i].ops->free)
719 params[i].ops->free(params[i].arg);
606} 720}
607 721
608static void __init kernel_add_sysfs_param(const char *name, 722static void __init kernel_add_sysfs_param(const char *name,
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void)
768subsys_initcall(param_sysfs_init); 882subsys_initcall(param_sysfs_init);
769 883
770#endif /* CONFIG_SYSFS */ 884#endif /* CONFIG_SYSFS */
771
772EXPORT_SYMBOL(param_set_byte);
773EXPORT_SYMBOL(param_get_byte);
774EXPORT_SYMBOL(param_set_short);
775EXPORT_SYMBOL(param_get_short);
776EXPORT_SYMBOL(param_set_ushort);
777EXPORT_SYMBOL(param_get_ushort);
778EXPORT_SYMBOL(param_set_int);
779EXPORT_SYMBOL(param_get_int);
780EXPORT_SYMBOL(param_set_uint);
781EXPORT_SYMBOL(param_get_uint);
782EXPORT_SYMBOL(param_set_long);
783EXPORT_SYMBOL(param_get_long);
784EXPORT_SYMBOL(param_set_ulong);
785EXPORT_SYMBOL(param_get_ulong);
786EXPORT_SYMBOL(param_set_charp);
787EXPORT_SYMBOL(param_get_charp);
788EXPORT_SYMBOL(param_set_bool);
789EXPORT_SYMBOL(param_get_bool);
790EXPORT_SYMBOL(param_set_invbool);
791EXPORT_SYMBOL(param_get_invbool);
792EXPORT_SYMBOL(param_array_set);
793EXPORT_SYMBOL(param_array_get);
794EXPORT_SYMBOL(param_set_copystring);
795EXPORT_SYMBOL(param_get_string);
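The visible payoff of the params.c conversion: every parameter type now publishes a kernel_param_ops (set/get plus an optional free), sysfs access and parse_one() are serialized by param_lock, and destroy_params() finally frees kmalloced charp values -- exactly what the removed FIXME complained about. Existing modules need no source changes for the stock types; a plain string parameter keeps its usual shape and simply stops leaking its duplicated value on unload:

	#include <linux/module.h>
	#include <linux/moduleparam.h>

	/* Illustrative parameter, not taken from the patch. The declaration is
	 * unchanged by the ops conversion; the duplicated string is now released
	 * through param_ops_charp.free when the module is removed. */
	static char *label = "default";
	module_param(label, charp, 0644);
	MODULE_PARM_DESC(label, "example string parameter");

	MODULE_LICENSE("GPL");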
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..db5b56064687 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 214
215static inline u64 perf_clock(void) 215static inline u64 perf_clock(void)
216{ 216{
217 return cpu_clock(raw_smp_processor_id()); 217 return local_clock();
218} 218}
219 219
220/* 220/*
@@ -402,11 +402,31 @@ static void perf_group_detach(struct perf_event *event)
402 } 402 }
403} 403}
404 404
405static inline int
406event_filter_match(struct perf_event *event)
407{
408 return event->cpu == -1 || event->cpu == smp_processor_id();
409}
410
405static void 411static void
406event_sched_out(struct perf_event *event, 412event_sched_out(struct perf_event *event,
407 struct perf_cpu_context *cpuctx, 413 struct perf_cpu_context *cpuctx,
408 struct perf_event_context *ctx) 414 struct perf_event_context *ctx)
409{ 415{
416 u64 delta;
417 /*
418 * An event which could not be activated because of
419 * filter mismatch still needs to have its timings
420 * maintained, otherwise bogus information is returned
421 * via read() for time_enabled, time_running:
422 */
423 if (event->state == PERF_EVENT_STATE_INACTIVE
424 && !event_filter_match(event)) {
425 delta = ctx->time - event->tstamp_stopped;
426 event->tstamp_running += delta;
427 event->tstamp_stopped = ctx->time;
428 }
429
410 if (event->state != PERF_EVENT_STATE_ACTIVE) 430 if (event->state != PERF_EVENT_STATE_ACTIVE)
411 return; 431 return;
412 432
@@ -432,9 +452,7 @@ group_sched_out(struct perf_event *group_event,
432 struct perf_event_context *ctx) 452 struct perf_event_context *ctx)
433{ 453{
434 struct perf_event *event; 454 struct perf_event *event;
435 455 int state = group_event->state;
436 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
437 return;
438 456
439 event_sched_out(group_event, cpuctx, ctx); 457 event_sched_out(group_event, cpuctx, ctx);
440 458
@@ -444,7 +462,7 @@ group_sched_out(struct perf_event *group_event,
444 list_for_each_entry(event, &group_event->sibling_list, group_entry) 462 list_for_each_entry(event, &group_event->sibling_list, group_entry)
445 event_sched_out(event, cpuctx, ctx); 463 event_sched_out(event, cpuctx, ctx);
446 464
447 if (group_event->attr.exclusive) 465 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
448 cpuctx->exclusive = 0; 466 cpuctx->exclusive = 0;
449} 467}
450 468
@@ -675,7 +693,6 @@ group_sched_in(struct perf_event *group_event,
675 struct perf_event *event, *partial_group = NULL; 693 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 694 const struct pmu *pmu = group_event->pmu;
677 bool txn = false; 695 bool txn = false;
678 int ret;
679 696
680 if (group_event->state == PERF_EVENT_STATE_OFF) 697 if (group_event->state == PERF_EVENT_STATE_OFF)
681 return 0; 698 return 0;
@@ -703,14 +720,8 @@ group_sched_in(struct perf_event *group_event,
703 } 720 }
704 } 721 }
705 722
706 if (!txn) 723 if (!txn || !pmu->commit_txn(pmu))
707 return 0;
708
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0; 724 return 0;
713 }
714 725
715group_error: 726group_error:
716 /* 727 /*
@@ -1155,9 +1166,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
1155 * In order to keep per-task stats reliable we need to flip the event 1166 * In order to keep per-task stats reliable we need to flip the event
1156 * values when we flip the contexts. 1167 * values when we flip the contexts.
1157 */ 1168 */
1158 value = atomic64_read(&next_event->count); 1169 value = local64_read(&next_event->count);
1159 value = atomic64_xchg(&event->count, value); 1170 value = local64_xchg(&event->count, value);
1160 atomic64_set(&next_event->count, value); 1171 local64_set(&next_event->count, value);
1161 1172
1162 swap(event->total_time_enabled, next_event->total_time_enabled); 1173 swap(event->total_time_enabled, next_event->total_time_enabled);
1163 swap(event->total_time_running, next_event->total_time_running); 1174 swap(event->total_time_running, next_event->total_time_running);
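This hunk and the ones that follow swap atomic64_* for local64_* on event->count and hw.period_left. The counters are only manipulated from the CPU that owns the event, so local64_t (atomic with respect to interrupts on that CPU, and a plain unlocked add on x86-64) is enough; the accessor names mirror the atomic64 API one for one. A toy illustration, assuming asm/local64.h as the include this series uses from perf_event.h:

	#include <linux/types.h>
	#include <asm/local64.h>

	static local64_t demo_count;

	static void demo_local64(void)
	{
		u64 v;

		local64_set(&demo_count, 0);
		local64_add(42, &demo_count);
		v = local64_xchg(&demo_count, 7);	/* swap, as __perf_event_sync_stat does */
		(void)v;
	}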
@@ -1547,10 +1558,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1547 1558
1548 hwc->sample_period = sample_period; 1559 hwc->sample_period = sample_period;
1549 1560
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1561 if (local64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable(); 1562 perf_disable();
1552 perf_event_stop(event); 1563 perf_event_stop(event);
1553 atomic64_set(&hwc->period_left, 0); 1564 local64_set(&hwc->period_left, 0);
1554 perf_event_start(event); 1565 perf_event_start(event);
1555 perf_enable(); 1566 perf_enable();
1556 } 1567 }
@@ -1591,7 +1602,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1591 1602
1592 perf_disable(); 1603 perf_disable();
1593 event->pmu->read(event); 1604 event->pmu->read(event);
1594 now = atomic64_read(&event->count); 1605 now = local64_read(&event->count);
1595 delta = now - hwc->freq_count_stamp; 1606 delta = now - hwc->freq_count_stamp;
1596 hwc->freq_count_stamp = now; 1607 hwc->freq_count_stamp = now;
1597 1608
@@ -1743,6 +1754,11 @@ static void __perf_event_read(void *info)
1743 event->pmu->read(event); 1754 event->pmu->read(event);
1744} 1755}
1745 1756
1757static inline u64 perf_event_count(struct perf_event *event)
1758{
1759 return local64_read(&event->count) + atomic64_read(&event->child_count);
1760}
1761
1746static u64 perf_event_read(struct perf_event *event) 1762static u64 perf_event_read(struct perf_event *event)
1747{ 1763{
1748 /* 1764 /*
@@ -1762,7 +1778,7 @@ static u64 perf_event_read(struct perf_event *event)
1762 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1778 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1763 } 1779 }
1764 1780
1765 return atomic64_read(&event->count); 1781 return perf_event_count(event);
1766} 1782}
1767 1783
1768/* 1784/*
@@ -1883,7 +1899,7 @@ static void free_event_rcu(struct rcu_head *head)
1883} 1899}
1884 1900
1885static void perf_pending_sync(struct perf_event *event); 1901static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data); 1902static void perf_buffer_put(struct perf_buffer *buffer);
1887 1903
1888static void free_event(struct perf_event *event) 1904static void free_event(struct perf_event *event)
1889{ 1905{
@@ -1891,7 +1907,7 @@ static void free_event(struct perf_event *event)
1891 1907
1892 if (!event->parent) { 1908 if (!event->parent) {
1893 atomic_dec(&nr_events); 1909 atomic_dec(&nr_events);
1894 if (event->attr.mmap) 1910 if (event->attr.mmap || event->attr.mmap_data)
1895 atomic_dec(&nr_mmap_events); 1911 atomic_dec(&nr_mmap_events);
1896 if (event->attr.comm) 1912 if (event->attr.comm)
1897 atomic_dec(&nr_comm_events); 1913 atomic_dec(&nr_comm_events);
@@ -1899,9 +1915,9 @@ static void free_event(struct perf_event *event)
1899 atomic_dec(&nr_task_events); 1915 atomic_dec(&nr_task_events);
1900 } 1916 }
1901 1917
1902 if (event->data) { 1918 if (event->buffer) {
1903 perf_mmap_data_put(event->data); 1919 perf_buffer_put(event->buffer);
1904 event->data = NULL; 1920 event->buffer = NULL;
1905 } 1921 }
1906 1922
1907 if (event->destroy) 1923 if (event->destroy)
@@ -2126,13 +2142,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2126static unsigned int perf_poll(struct file *file, poll_table *wait) 2142static unsigned int perf_poll(struct file *file, poll_table *wait)
2127{ 2143{
2128 struct perf_event *event = file->private_data; 2144 struct perf_event *event = file->private_data;
2129 struct perf_mmap_data *data; 2145 struct perf_buffer *buffer;
2130 unsigned int events = POLL_HUP; 2146 unsigned int events = POLL_HUP;
2131 2147
2132 rcu_read_lock(); 2148 rcu_read_lock();
2133 data = rcu_dereference(event->data); 2149 buffer = rcu_dereference(event->buffer);
2134 if (data) 2150 if (buffer)
2135 events = atomic_xchg(&data->poll, 0); 2151 events = atomic_xchg(&buffer->poll, 0);
2136 rcu_read_unlock(); 2152 rcu_read_unlock();
2137 2153
2138 poll_wait(file, &event->waitq, wait); 2154 poll_wait(file, &event->waitq, wait);
@@ -2143,7 +2159,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
2143static void perf_event_reset(struct perf_event *event) 2159static void perf_event_reset(struct perf_event *event)
2144{ 2160{
2145 (void)perf_event_read(event); 2161 (void)perf_event_read(event);
2146 atomic64_set(&event->count, 0); 2162 local64_set(&event->count, 0);
2147 perf_event_update_userpage(event); 2163 perf_event_update_userpage(event);
2148} 2164}
2149 2165
@@ -2342,14 +2358,14 @@ static int perf_event_index(struct perf_event *event)
2342void perf_event_update_userpage(struct perf_event *event) 2358void perf_event_update_userpage(struct perf_event *event)
2343{ 2359{
2344 struct perf_event_mmap_page *userpg; 2360 struct perf_event_mmap_page *userpg;
2345 struct perf_mmap_data *data; 2361 struct perf_buffer *buffer;
2346 2362
2347 rcu_read_lock(); 2363 rcu_read_lock();
2348 data = rcu_dereference(event->data); 2364 buffer = rcu_dereference(event->buffer);
2349 if (!data) 2365 if (!buffer)
2350 goto unlock; 2366 goto unlock;
2351 2367
2352 userpg = data->user_page; 2368 userpg = buffer->user_page;
2353 2369
2354 /* 2370 /*
2355 * Disable preemption so as to not let the corresponding user-space 2371 * Disable preemption so as to not let the corresponding user-space
@@ -2359,9 +2375,9 @@ void perf_event_update_userpage(struct perf_event *event)
2359 ++userpg->lock; 2375 ++userpg->lock;
2360 barrier(); 2376 barrier();
2361 userpg->index = perf_event_index(event); 2377 userpg->index = perf_event_index(event);
2362 userpg->offset = atomic64_read(&event->count); 2378 userpg->offset = perf_event_count(event);
2363 if (event->state == PERF_EVENT_STATE_ACTIVE) 2379 if (event->state == PERF_EVENT_STATE_ACTIVE)
2364 userpg->offset -= atomic64_read(&event->hw.prev_count); 2380 userpg->offset -= local64_read(&event->hw.prev_count);
2365 2381
2366 userpg->time_enabled = event->total_time_enabled + 2382 userpg->time_enabled = event->total_time_enabled +
2367 atomic64_read(&event->child_total_time_enabled); 2383 atomic64_read(&event->child_total_time_enabled);
@@ -2376,6 +2392,25 @@ unlock:
2376 rcu_read_unlock(); 2392 rcu_read_unlock();
2377} 2393}
2378 2394
2395static unsigned long perf_data_size(struct perf_buffer *buffer);
2396
2397static void
2398perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2399{
2400 long max_size = perf_data_size(buffer);
2401
2402 if (watermark)
2403 buffer->watermark = min(max_size, watermark);
2404
2405 if (!buffer->watermark)
2406 buffer->watermark = max_size / 2;
2407
2408 if (flags & PERF_BUFFER_WRITABLE)
2409 buffer->writable = 1;
2410
2411 atomic_set(&buffer->refcount, 1);
2412}
2413
2379#ifndef CONFIG_PERF_USE_VMALLOC 2414#ifndef CONFIG_PERF_USE_VMALLOC
2380 2415
2381/* 2416/*
@@ -2383,15 +2418,15 @@ unlock:
2383 */ 2418 */
2384 2419
2385static struct page * 2420static struct page *
2386perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2421perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2387{ 2422{
2388 if (pgoff > data->nr_pages) 2423 if (pgoff > buffer->nr_pages)
2389 return NULL; 2424 return NULL;
2390 2425
2391 if (pgoff == 0) 2426 if (pgoff == 0)
2392 return virt_to_page(data->user_page); 2427 return virt_to_page(buffer->user_page);
2393 2428
2394 return virt_to_page(data->data_pages[pgoff - 1]); 2429 return virt_to_page(buffer->data_pages[pgoff - 1]);
2395} 2430}
2396 2431
2397static void *perf_mmap_alloc_page(int cpu) 2432static void *perf_mmap_alloc_page(int cpu)
@@ -2407,42 +2442,44 @@ static void *perf_mmap_alloc_page(int cpu)
2407 return page_address(page); 2442 return page_address(page);
2408} 2443}
2409 2444
2410static struct perf_mmap_data * 2445static struct perf_buffer *
2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2446perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2412{ 2447{
2413 struct perf_mmap_data *data; 2448 struct perf_buffer *buffer;
2414 unsigned long size; 2449 unsigned long size;
2415 int i; 2450 int i;
2416 2451
2417 size = sizeof(struct perf_mmap_data); 2452 size = sizeof(struct perf_buffer);
2418 size += nr_pages * sizeof(void *); 2453 size += nr_pages * sizeof(void *);
2419 2454
2420 data = kzalloc(size, GFP_KERNEL); 2455 buffer = kzalloc(size, GFP_KERNEL);
2421 if (!data) 2456 if (!buffer)
2422 goto fail; 2457 goto fail;
2423 2458
2424 data->user_page = perf_mmap_alloc_page(event->cpu); 2459 buffer->user_page = perf_mmap_alloc_page(cpu);
2425 if (!data->user_page) 2460 if (!buffer->user_page)
2426 goto fail_user_page; 2461 goto fail_user_page;
2427 2462
2428 for (i = 0; i < nr_pages; i++) { 2463 for (i = 0; i < nr_pages; i++) {
2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu); 2464 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2430 if (!data->data_pages[i]) 2465 if (!buffer->data_pages[i])
2431 goto fail_data_pages; 2466 goto fail_data_pages;
2432 } 2467 }
2433 2468
2434 data->nr_pages = nr_pages; 2469 buffer->nr_pages = nr_pages;
2435 2470
2436 return data; 2471 perf_buffer_init(buffer, watermark, flags);
2472
2473 return buffer;
2437 2474
2438fail_data_pages: 2475fail_data_pages:
2439 for (i--; i >= 0; i--) 2476 for (i--; i >= 0; i--)
2440 free_page((unsigned long)data->data_pages[i]); 2477 free_page((unsigned long)buffer->data_pages[i]);
2441 2478
2442 free_page((unsigned long)data->user_page); 2479 free_page((unsigned long)buffer->user_page);
2443 2480
2444fail_user_page: 2481fail_user_page:
2445 kfree(data); 2482 kfree(buffer);
2446 2483
2447fail: 2484fail:
2448 return NULL; 2485 return NULL;
@@ -2456,17 +2493,17 @@ static void perf_mmap_free_page(unsigned long addr)
2456 __free_page(page); 2493 __free_page(page);
2457} 2494}
2458 2495
2459static void perf_mmap_data_free(struct perf_mmap_data *data) 2496static void perf_buffer_free(struct perf_buffer *buffer)
2460{ 2497{
2461 int i; 2498 int i;
2462 2499
2463 perf_mmap_free_page((unsigned long)data->user_page); 2500 perf_mmap_free_page((unsigned long)buffer->user_page);
2464 for (i = 0; i < data->nr_pages; i++) 2501 for (i = 0; i < buffer->nr_pages; i++)
2465 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2502 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2466 kfree(data); 2503 kfree(buffer);
2467} 2504}
2468 2505
2469static inline int page_order(struct perf_mmap_data *data) 2506static inline int page_order(struct perf_buffer *buffer)
2470{ 2507{
2471 return 0; 2508 return 0;
2472} 2509}
@@ -2479,18 +2516,18 @@ static inline int page_order(struct perf_mmap_data *data)
2479 * Required for architectures that have d-cache aliasing issues. 2516 * Required for architectures that have d-cache aliasing issues.
2480 */ 2517 */
2481 2518
2482static inline int page_order(struct perf_mmap_data *data) 2519static inline int page_order(struct perf_buffer *buffer)
2483{ 2520{
2484 return data->page_order; 2521 return buffer->page_order;
2485} 2522}
2486 2523
2487static struct page * 2524static struct page *
2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2525perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2489{ 2526{
2490 if (pgoff > (1UL << page_order(data))) 2527 if (pgoff > (1UL << page_order(buffer)))
2491 return NULL; 2528 return NULL;
2492 2529
2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2530 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2494} 2531}
2495 2532
2496static void perf_mmap_unmark_page(void *addr) 2533static void perf_mmap_unmark_page(void *addr)
@@ -2500,57 +2537,59 @@ static void perf_mmap_unmark_page(void *addr)
2500 page->mapping = NULL; 2537 page->mapping = NULL;
2501} 2538}
2502 2539
2503static void perf_mmap_data_free_work(struct work_struct *work) 2540static void perf_buffer_free_work(struct work_struct *work)
2504{ 2541{
2505 struct perf_mmap_data *data; 2542 struct perf_buffer *buffer;
2506 void *base; 2543 void *base;
2507 int i, nr; 2544 int i, nr;
2508 2545
2509 data = container_of(work, struct perf_mmap_data, work); 2546 buffer = container_of(work, struct perf_buffer, work);
2510 nr = 1 << page_order(data); 2547 nr = 1 << page_order(buffer);
2511 2548
2512 base = data->user_page; 2549 base = buffer->user_page;
2513 for (i = 0; i < nr + 1; i++) 2550 for (i = 0; i < nr + 1; i++)
2514 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2551 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2515 2552
2516 vfree(base); 2553 vfree(base);
2517 kfree(data); 2554 kfree(buffer);
2518} 2555}
2519 2556
2520static void perf_mmap_data_free(struct perf_mmap_data *data) 2557static void perf_buffer_free(struct perf_buffer *buffer)
2521{ 2558{
2522 schedule_work(&data->work); 2559 schedule_work(&buffer->work);
2523} 2560}
2524 2561
2525static struct perf_mmap_data * 2562static struct perf_buffer *
2526perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2563perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2527{ 2564{
2528 struct perf_mmap_data *data; 2565 struct perf_buffer *buffer;
2529 unsigned long size; 2566 unsigned long size;
2530 void *all_buf; 2567 void *all_buf;
2531 2568
2532 size = sizeof(struct perf_mmap_data); 2569 size = sizeof(struct perf_buffer);
2533 size += sizeof(void *); 2570 size += sizeof(void *);
2534 2571
2535 data = kzalloc(size, GFP_KERNEL); 2572 buffer = kzalloc(size, GFP_KERNEL);
2536 if (!data) 2573 if (!buffer)
2537 goto fail; 2574 goto fail;
2538 2575
2539 INIT_WORK(&data->work, perf_mmap_data_free_work); 2576 INIT_WORK(&buffer->work, perf_buffer_free_work);
2540 2577
2541 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2578 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2542 if (!all_buf) 2579 if (!all_buf)
2543 goto fail_all_buf; 2580 goto fail_all_buf;
2544 2581
2545 data->user_page = all_buf; 2582 buffer->user_page = all_buf;
2546 data->data_pages[0] = all_buf + PAGE_SIZE; 2583 buffer->data_pages[0] = all_buf + PAGE_SIZE;
2547 data->page_order = ilog2(nr_pages); 2584 buffer->page_order = ilog2(nr_pages);
2548 data->nr_pages = 1; 2585 buffer->nr_pages = 1;
2586
2587 perf_buffer_init(buffer, watermark, flags);
2549 2588
2550 return data; 2589 return buffer;
2551 2590
2552fail_all_buf: 2591fail_all_buf:
2553 kfree(data); 2592 kfree(buffer);
2554 2593
2555fail: 2594fail:
2556 return NULL; 2595 return NULL;
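
The CONFIG_PERF_USE_VMALLOC fallback above backs the whole ring buffer with a single vmalloc_user() area: user_page is the first page, data_pages[0] points one page in, and page_order records how many data pages that one allocation spans, which is what lets perf_mmap_to_page() turn a pgoff into an address with plain pointer arithmetic. A minimal userspace sketch of that layout (malloc() stands in for vmalloc_user(); all names and sizes here are illustrative):

/* sketch: one contiguous allocation treated as a control page + 2^order data pages */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

struct buf {
	void *user_page;     /* control page, like perf's user_page      */
	void *data_page0;    /* first data page, one PAGE_SIZE further   */
	int   page_order;    /* ilog2(nr_pages) of the single allocation */
};

static int ilog2_ul(unsigned long v)      /* assumes v is a power of two */
{
	int order = 0;
	while (v >>= 1)
		order++;
	return order;
}

static void *page_at(struct buf *b, unsigned long pgoff)
{
	if (pgoff > (1UL << b->page_order))   /* same bound check as perf_mmap_to_page() */
		return NULL;
	return (char *)b->user_page + pgoff * PAGE_SIZE;
}

int main(void)
{
	unsigned long nr_pages = 8;           /* must be a power of two */
	struct buf b;

	b.user_page  = malloc((nr_pages + 1) * PAGE_SIZE);  /* stands in for vmalloc_user() */
	b.data_page0 = (char *)b.user_page + PAGE_SIZE;
	b.page_order = ilog2_ul(nr_pages);

	assert(page_at(&b, 1) == b.data_page0);
	assert(page_at(&b, nr_pages + 1) == NULL);
	printf("order=%d, last valid pgoff=%lu\n", b.page_order, 1UL << b.page_order);
	free(b.user_page);
	return 0;
}
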
@@ -2558,15 +2597,15 @@ fail:
2558 2597
2559#endif 2598#endif
2560 2599
2561static unsigned long perf_data_size(struct perf_mmap_data *data) 2600static unsigned long perf_data_size(struct perf_buffer *buffer)
2562{ 2601{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data)); 2602 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2564} 2603}
2565 2604
2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2605static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2567{ 2606{
2568 struct perf_event *event = vma->vm_file->private_data; 2607 struct perf_event *event = vma->vm_file->private_data;
2569 struct perf_mmap_data *data; 2608 struct perf_buffer *buffer;
2570 int ret = VM_FAULT_SIGBUS; 2609 int ret = VM_FAULT_SIGBUS;
2571 2610
2572 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2611 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2576,14 +2615,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2576 } 2615 }
2577 2616
2578 rcu_read_lock(); 2617 rcu_read_lock();
2579 data = rcu_dereference(event->data); 2618 buffer = rcu_dereference(event->buffer);
2580 if (!data) 2619 if (!buffer)
2581 goto unlock; 2620 goto unlock;
2582 2621
2583 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2622 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2584 goto unlock; 2623 goto unlock;
2585 2624
2586 vmf->page = perf_mmap_to_page(data, vmf->pgoff); 2625 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2587 if (!vmf->page) 2626 if (!vmf->page)
2588 goto unlock; 2627 goto unlock;
2589 2628
@@ -2598,52 +2637,35 @@ unlock:
2598 return ret; 2637 return ret;
2599} 2638}
2600 2639
2601static void 2640static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2602perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2603{ 2641{
2604 long max_size = perf_data_size(data); 2642 struct perf_buffer *buffer;
2605 2643
2606 if (event->attr.watermark) { 2644 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
2607 data->watermark = min_t(long, max_size, 2645 perf_buffer_free(buffer);
2608 event->attr.wakeup_watermark);
2609 }
2610
2611 if (!data->watermark)
2612 data->watermark = max_size / 2;
2613
2614 atomic_set(&data->refcount, 1);
2615 rcu_assign_pointer(event->data, data);
2616} 2646}
2617 2647
2618static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) 2648static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2619{ 2649{
2620 struct perf_mmap_data *data; 2650 struct perf_buffer *buffer;
2621
2622 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2623 perf_mmap_data_free(data);
2624}
2625
2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
2627{
2628 struct perf_mmap_data *data;
2629 2651
2630 rcu_read_lock(); 2652 rcu_read_lock();
2631 data = rcu_dereference(event->data); 2653 buffer = rcu_dereference(event->buffer);
2632 if (data) { 2654 if (buffer) {
2633 if (!atomic_inc_not_zero(&data->refcount)) 2655 if (!atomic_inc_not_zero(&buffer->refcount))
2634 data = NULL; 2656 buffer = NULL;
2635 } 2657 }
2636 rcu_read_unlock(); 2658 rcu_read_unlock();
2637 2659
2638 return data; 2660 return buffer;
2639} 2661}
2640 2662
2641static void perf_mmap_data_put(struct perf_mmap_data *data) 2663static void perf_buffer_put(struct perf_buffer *buffer)
2642{ 2664{
2643 if (!atomic_dec_and_test(&data->refcount)) 2665 if (!atomic_dec_and_test(&buffer->refcount))
2644 return; 2666 return;
2645 2667
2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2668 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2647} 2669}
2648 2670
2649static void perf_mmap_open(struct vm_area_struct *vma) 2671static void perf_mmap_open(struct vm_area_struct *vma)
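
perf_buffer_get()/perf_buffer_put() above combine an RCU-protected pointer with a reference count: a reader may only take a reference while the count is still non-zero, and the final put defers the actual free through call_rcu(). A small userspace sketch of just the refcounting half, using C11 atomics; the RCU grace period is replaced by an immediate free here, which is exactly the part that cannot be dropped in the kernel:

/* sketch: atomic_inc_not_zero / atomic_dec_and_test style refcounting with C11 atomics */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct buffer {
	atomic_int refcount;
	/* ... ring-buffer pages would live here ... */
};

static int get_buffer(struct buffer *b)          /* like atomic_inc_not_zero() */
{
	int old = atomic_load(&b->refcount);
	do {
		if (old == 0)
			return 0;                /* already on its way to being freed */
	} while (!atomic_compare_exchange_weak(&b->refcount, &old, old + 1));
	return 1;
}

static void put_buffer(struct buffer *b)         /* like atomic_dec_and_test() */
{
	if (atomic_fetch_sub(&b->refcount, 1) == 1) {
		/* kernel: call_rcu(&buffer->rcu_head, perf_buffer_free_rcu) */
		free(b);
	}
}

int main(void)
{
	struct buffer *b = malloc(sizeof(*b));
	atomic_init(&b->refcount, 1);            /* creator holds the first reference */

	if (get_buffer(b))
		puts("got a second reference");
	put_buffer(b);                           /* drop the extra reference */
	put_buffer(b);                           /* last put frees             */
	return 0;
}
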
@@ -2658,16 +2680,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2658 struct perf_event *event = vma->vm_file->private_data; 2680 struct perf_event *event = vma->vm_file->private_data;
2659 2681
2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2682 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2661 unsigned long size = perf_data_size(event->data); 2683 unsigned long size = perf_data_size(event->buffer);
2662 struct user_struct *user = event->mmap_user; 2684 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data; 2685 struct perf_buffer *buffer = event->buffer;
2664 2686
2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2687 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2666 vma->vm_mm->locked_vm -= event->mmap_locked; 2688 vma->vm_mm->locked_vm -= event->mmap_locked;
2667 rcu_assign_pointer(event->data, NULL); 2689 rcu_assign_pointer(event->buffer, NULL);
2668 mutex_unlock(&event->mmap_mutex); 2690 mutex_unlock(&event->mmap_mutex);
2669 2691
2670 perf_mmap_data_put(data); 2692 perf_buffer_put(buffer);
2671 free_uid(user); 2693 free_uid(user);
2672 } 2694 }
2673} 2695}
@@ -2685,11 +2707,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2685 unsigned long user_locked, user_lock_limit; 2707 unsigned long user_locked, user_lock_limit;
2686 struct user_struct *user = current_user(); 2708 struct user_struct *user = current_user();
2687 unsigned long locked, lock_limit; 2709 unsigned long locked, lock_limit;
2688 struct perf_mmap_data *data; 2710 struct perf_buffer *buffer;
2689 unsigned long vma_size; 2711 unsigned long vma_size;
2690 unsigned long nr_pages; 2712 unsigned long nr_pages;
2691 long user_extra, extra; 2713 long user_extra, extra;
2692 int ret = 0; 2714 int ret = 0, flags = 0;
2693 2715
2694 /* 2716 /*
2695 * Don't allow mmap() of inherited per-task counters. This would 2717 * Don't allow mmap() of inherited per-task counters. This would
@@ -2706,7 +2728,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2706 nr_pages = (vma_size / PAGE_SIZE) - 1; 2728 nr_pages = (vma_size / PAGE_SIZE) - 1;
2707 2729
2708 /* 2730 /*
2709 * If we have data pages ensure they're a power-of-two number, so we 2731 * If we have buffer pages ensure they're a power-of-two number, so we
2710 * can do bitmasks instead of modulo. 2732 * can do bitmasks instead of modulo.
2711 */ 2733 */
2712 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 2734 if (nr_pages != 0 && !is_power_of_2(nr_pages))
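
The power-of-two restriction exists so the ring buffer can wrap positions with a bitmask instead of a division, the same trick perf_output_begin() and perf_output_copy() rely on further down. A tiny sketch with illustrative numbers:

/* sketch: wrapping an offset with a mask instead of '%', valid only for power-of-two sizes */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long nr_pages = 8, page_size = 4096;
	unsigned long size = nr_pages * page_size;   /* power of two */
	unsigned long head = 5 * size + 123;         /* ever-growing write position */

	assert((size & (size - 1)) == 0);            /* the is_power_of_2() check   */
	assert((head & (size - 1)) == head % size);  /* mask and modulo agree       */
	printf("offset in buffer: %lu\n", head & (size - 1));
	return 0;
}
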
@@ -2720,9 +2742,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2720 2742
2721 WARN_ON_ONCE(event->ctx->parent_ctx); 2743 WARN_ON_ONCE(event->ctx->parent_ctx);
2722 mutex_lock(&event->mmap_mutex); 2744 mutex_lock(&event->mmap_mutex);
2723 if (event->data) { 2745 if (event->buffer) {
2724 if (event->data->nr_pages == nr_pages) 2746 if (event->buffer->nr_pages == nr_pages)
2725 atomic_inc(&event->data->refcount); 2747 atomic_inc(&event->buffer->refcount);
2726 else 2748 else
2727 ret = -EINVAL; 2749 ret = -EINVAL;
2728 goto unlock; 2750 goto unlock;
@@ -2752,17 +2774,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2752 goto unlock; 2774 goto unlock;
2753 } 2775 }
2754 2776
2755 WARN_ON(event->data); 2777 WARN_ON(event->buffer);
2778
2779 if (vma->vm_flags & VM_WRITE)
2780 flags |= PERF_BUFFER_WRITABLE;
2756 2781
2757 data = perf_mmap_data_alloc(event, nr_pages); 2782 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
2758 if (!data) { 2783 event->cpu, flags);
2784 if (!buffer) {
2759 ret = -ENOMEM; 2785 ret = -ENOMEM;
2760 goto unlock; 2786 goto unlock;
2761 } 2787 }
2762 2788 rcu_assign_pointer(event->buffer, buffer);
2763 perf_mmap_data_init(event, data);
2764 if (vma->vm_flags & VM_WRITE)
2765 event->data->writable = 1;
2766 2789
2767 atomic_long_add(user_extra, &user->locked_vm); 2790 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra; 2791 event->mmap_locked = extra;
@@ -2941,11 +2964,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2941 return NULL; 2964 return NULL;
2942} 2965}
2943 2966
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949 2967
2950/* 2968/*
2951 * We assume there is only KVM supporting the callbacks. 2969 * We assume there is only KVM supporting the callbacks.
@@ -2971,15 +2989,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2971/* 2989/*
2972 * Output 2990 * Output
2973 */ 2991 */
2974static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2992static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
2975 unsigned long offset, unsigned long head) 2993 unsigned long offset, unsigned long head)
2976{ 2994{
2977 unsigned long mask; 2995 unsigned long mask;
2978 2996
2979 if (!data->writable) 2997 if (!buffer->writable)
2980 return true; 2998 return true;
2981 2999
2982 mask = perf_data_size(data) - 1; 3000 mask = perf_data_size(buffer) - 1;
2983 3001
2984 offset = (offset - tail) & mask; 3002 offset = (offset - tail) & mask;
2985 head = (head - tail) & mask; 3003 head = (head - tail) & mask;
@@ -2992,7 +3010,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2992 3010
2993static void perf_output_wakeup(struct perf_output_handle *handle) 3011static void perf_output_wakeup(struct perf_output_handle *handle)
2994{ 3012{
2995 atomic_set(&handle->data->poll, POLL_IN); 3013 atomic_set(&handle->buffer->poll, POLL_IN);
2996 3014
2997 if (handle->nmi) { 3015 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 3016 handle->event->pending_wakeup = 1;
@@ -3012,45 +3030,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 */ 3030 */
3013static void perf_output_get_handle(struct perf_output_handle *handle) 3031static void perf_output_get_handle(struct perf_output_handle *handle)
3014{ 3032{
3015 struct perf_mmap_data *data = handle->data; 3033 struct perf_buffer *buffer = handle->buffer;
3016 3034
3017 preempt_disable(); 3035 preempt_disable();
3018 local_inc(&data->nest); 3036 local_inc(&buffer->nest);
3019 handle->wakeup = local_read(&data->wakeup); 3037 handle->wakeup = local_read(&buffer->wakeup);
3020} 3038}
3021 3039
3022static void perf_output_put_handle(struct perf_output_handle *handle) 3040static void perf_output_put_handle(struct perf_output_handle *handle)
3023{ 3041{
3024 struct perf_mmap_data *data = handle->data; 3042 struct perf_buffer *buffer = handle->buffer;
3025 unsigned long head; 3043 unsigned long head;
3026 3044
3027again: 3045again:
3028 head = local_read(&data->head); 3046 head = local_read(&buffer->head);
3029 3047
3030 /* 3048 /*
3031 * IRQ/NMI can happen here, which means we can miss a head update. 3049 * IRQ/NMI can happen here, which means we can miss a head update.
3032 */ 3050 */
3033 3051
3034 if (!local_dec_and_test(&data->nest)) 3052 if (!local_dec_and_test(&buffer->nest))
3035 goto out; 3053 goto out;
3036 3054
3037 /* 3055 /*
3038 * Publish the known good head. Rely on the full barrier implied 3056 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() order the data->head read and this 3057 * by atomic_dec_and_test() order the buffer->head read and this
3040 * write. 3058 * write.
3041 */ 3059 */
3042 data->user_page->data_head = head; 3060 buffer->user_page->data_head = head;
3043 3061
3044 /* 3062 /*
3045 * Now check if we missed an update, rely on the (compiler) 3063 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head. 3064 * barrier in atomic_dec_and_test() to re-read buffer->head.
3047 */ 3065 */
3048 if (unlikely(head != local_read(&data->head))) { 3066 if (unlikely(head != local_read(&buffer->head))) {
3049 local_inc(&data->nest); 3067 local_inc(&buffer->nest);
3050 goto again; 3068 goto again;
3051 } 3069 }
3052 3070
3053 if (handle->wakeup != local_read(&data->wakeup)) 3071 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3072 perf_output_wakeup(handle);
3055 3073
3056 out: 3074 out:
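
The nest counter above is what allows an NMI-context writer to interrupt another writer between "copy the record" and "publish data_head": only the outermost writer stores the user-visible head, and the re-check after the store catches a head that advanced while it was being published. A single-threaded userspace sketch of that protocol (the local_t operations are approximated with C11 atomics and the NMI is simulated by a direct call; names are illustrative):

/* sketch: outermost-writer-publishes protocol, as in perf_output_{get,put}_handle() */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long nest;          /* like buffer->nest            */
static atomic_long head;          /* private head, always current */
static long user_head;            /* what user space gets to see  */

static void output_get(void)
{
	atomic_fetch_add(&nest, 1);
}

static void output_put(void)
{
	long h;
again:
	h = atomic_load(&head);
	if (atomic_fetch_sub(&nest, 1) != 1)
		return;                       /* nested writer: the outer one will publish */
	user_head = h;                        /* publish the known-good head               */
	if (h != atomic_load(&head)) {        /* did someone advance it meanwhile?         */
		atomic_fetch_add(&nest, 1);
		goto again;
	}
}

static void nested_write(long bytes)          /* stands in for an NMI hitting mid-write */
{
	output_get();
	atomic_fetch_add(&head, bytes);
	output_put();                         /* does not publish: nest was > 1 */
}

int main(void)
{
	output_get();
	atomic_fetch_add(&head, 100);         /* outer writer reserves/writes 100 bytes */
	nested_write(50);                     /* "NMI" adds 50 more                     */
	output_put();                         /* outermost put publishes everything     */
	assert(user_head == 150);
	printf("published head = %ld\n", user_head);
	return 0;
}
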
@@ -3070,12 +3088,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3070 buf += size; 3088 buf += size;
3071 handle->size -= size; 3089 handle->size -= size;
3072 if (!handle->size) { 3090 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data; 3091 struct perf_buffer *buffer = handle->buffer;
3074 3092
3075 handle->page++; 3093 handle->page++;
3076 handle->page &= data->nr_pages - 1; 3094 handle->page &= buffer->nr_pages - 1;
3077 handle->addr = data->data_pages[handle->page]; 3095 handle->addr = buffer->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data); 3096 handle->size = PAGE_SIZE << page_order(buffer);
3079 } 3097 }
3080 } while (len); 3098 } while (len);
3081} 3099}
@@ -3084,7 +3102,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3084 struct perf_event *event, unsigned int size, 3102 struct perf_event *event, unsigned int size,
3085 int nmi, int sample) 3103 int nmi, int sample)
3086{ 3104{
3087 struct perf_mmap_data *data; 3105 struct perf_buffer *buffer;
3088 unsigned long tail, offset, head; 3106 unsigned long tail, offset, head;
3089 int have_lost; 3107 int have_lost;
3090 struct { 3108 struct {
@@ -3100,19 +3118,19 @@ int perf_output_begin(struct perf_output_handle *handle,
3100 if (event->parent) 3118 if (event->parent)
3101 event = event->parent; 3119 event = event->parent;
3102 3120
3103 data = rcu_dereference(event->data); 3121 buffer = rcu_dereference(event->buffer);
3104 if (!data) 3122 if (!buffer)
3105 goto out; 3123 goto out;
3106 3124
3107 handle->data = data; 3125 handle->buffer = buffer;
3108 handle->event = event; 3126 handle->event = event;
3109 handle->nmi = nmi; 3127 handle->nmi = nmi;
3110 handle->sample = sample; 3128 handle->sample = sample;
3111 3129
3112 if (!data->nr_pages) 3130 if (!buffer->nr_pages)
3113 goto out; 3131 goto out;
3114 3132
3115 have_lost = local_read(&data->lost); 3133 have_lost = local_read(&buffer->lost);
3116 if (have_lost) 3134 if (have_lost)
3117 size += sizeof(lost_event); 3135 size += sizeof(lost_event);
3118 3136
@@ -3124,30 +3142,30 @@ int perf_output_begin(struct perf_output_handle *handle,
3124 * tail pointer. So that all reads will be completed before the 3142 * tail pointer. So that all reads will be completed before the
3125 * write is issued. 3143 * write is issued.
3126 */ 3144 */
3127 tail = ACCESS_ONCE(data->user_page->data_tail); 3145 tail = ACCESS_ONCE(buffer->user_page->data_tail);
3128 smp_rmb(); 3146 smp_rmb();
3129 offset = head = local_read(&data->head); 3147 offset = head = local_read(&buffer->head);
3130 head += size; 3148 head += size;
3131 if (unlikely(!perf_output_space(data, tail, offset, head))) 3149 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3132 goto fail; 3150 goto fail;
3133 } while (local_cmpxchg(&data->head, offset, head) != offset); 3151 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3134 3152
3135 if (head - local_read(&data->wakeup) > data->watermark) 3153 if (head - local_read(&buffer->wakeup) > buffer->watermark)
3136 local_add(data->watermark, &data->wakeup); 3154 local_add(buffer->watermark, &buffer->wakeup);
3137 3155
3138 handle->page = offset >> (PAGE_SHIFT + page_order(data)); 3156 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3139 handle->page &= data->nr_pages - 1; 3157 handle->page &= buffer->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); 3158 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3141 handle->addr = data->data_pages[handle->page]; 3159 handle->addr = buffer->data_pages[handle->page];
3142 handle->addr += handle->size; 3160 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size; 3161 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3144 3162
3145 if (have_lost) { 3163 if (have_lost) {
3146 lost_event.header.type = PERF_RECORD_LOST; 3164 lost_event.header.type = PERF_RECORD_LOST;
3147 lost_event.header.misc = 0; 3165 lost_event.header.misc = 0;
3148 lost_event.header.size = sizeof(lost_event); 3166 lost_event.header.size = sizeof(lost_event);
3149 lost_event.id = event->id; 3167 lost_event.id = event->id;
3150 lost_event.lost = local_xchg(&data->lost, 0); 3168 lost_event.lost = local_xchg(&buffer->lost, 0);
3151 3169
3152 perf_output_put(handle, lost_event); 3170 perf_output_put(handle, lost_event);
3153 } 3171 }
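
perf_output_begin() reserves buffer space without taking a lock: it snapshots the user-visible tail, then advances buffer->head in a cmpxchg loop so every (possibly nested) writer ends up with a private [offset, head) window. A userspace sketch of that reservation step, with the space check reduced to "do not overwrite unread data" (the kernel only enforces it for writable buffers, and uses masked offsets):

/* sketch: lock-free ring-buffer reservation via compare-and-swap on head */
#include <stdatomic.h>
#include <stdio.h>

#define BUF_SIZE 4096UL                       /* power of two */

static atomic_ulong head;                     /* next free byte (monotonic)          */
static unsigned long tail;                    /* last byte consumed by the reader    */

/* returns the start offset of the reserved region, or -1 if the buffer is full */
static long reserve(unsigned long size)
{
	unsigned long offset, new_head;

	do {
		offset   = atomic_load(&head);
		new_head = offset + size;
		/* would this overwrite data the reader has not seen yet? */
		if (new_head - tail > BUF_SIZE)
			return -1;
	} while (!atomic_compare_exchange_weak(&head, &offset, new_head));

	return (long)offset;
}

int main(void)
{
	long a = reserve(100), b = reserve(200);
	printf("first record at %ld, second at %ld\n", a, b);
	return 0;
}
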
@@ -3155,7 +3173,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3155 return 0; 3173 return 0;
3156 3174
3157fail: 3175fail:
3158 local_inc(&data->lost); 3176 local_inc(&buffer->lost);
3159 perf_output_put_handle(handle); 3177 perf_output_put_handle(handle);
3160out: 3178out:
3161 rcu_read_unlock(); 3179 rcu_read_unlock();
@@ -3166,15 +3184,15 @@ out:
3166void perf_output_end(struct perf_output_handle *handle) 3184void perf_output_end(struct perf_output_handle *handle)
3167{ 3185{
3168 struct perf_event *event = handle->event; 3186 struct perf_event *event = handle->event;
3169 struct perf_mmap_data *data = handle->data; 3187 struct perf_buffer *buffer = handle->buffer;
3170 3188
3171 int wakeup_events = event->attr.wakeup_events; 3189 int wakeup_events = event->attr.wakeup_events;
3172 3190
3173 if (handle->sample && wakeup_events) { 3191 if (handle->sample && wakeup_events) {
3174 int events = local_inc_return(&data->events); 3192 int events = local_inc_return(&buffer->events);
3175 if (events >= wakeup_events) { 3193 if (events >= wakeup_events) {
3176 local_sub(wakeup_events, &data->events); 3194 local_sub(wakeup_events, &buffer->events);
3177 local_inc(&data->wakeup); 3195 local_inc(&buffer->wakeup);
3178 } 3196 }
3179 } 3197 }
3180 3198
@@ -3211,7 +3229,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3211 u64 values[4]; 3229 u64 values[4];
3212 int n = 0; 3230 int n = 0;
3213 3231
3214 values[n++] = atomic64_read(&event->count); 3232 values[n++] = perf_event_count(event);
3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3233 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3216 values[n++] = event->total_time_enabled + 3234 values[n++] = event->total_time_enabled +
3217 atomic64_read(&event->child_total_time_enabled); 3235 atomic64_read(&event->child_total_time_enabled);
@@ -3248,7 +3266,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3248 if (leader != event) 3266 if (leader != event)
3249 leader->pmu->read(leader); 3267 leader->pmu->read(leader);
3250 3268
3251 values[n++] = atomic64_read(&leader->count); 3269 values[n++] = perf_event_count(leader);
3252 if (read_format & PERF_FORMAT_ID) 3270 if (read_format & PERF_FORMAT_ID)
3253 values[n++] = primary_event_id(leader); 3271 values[n++] = primary_event_id(leader);
3254 3272
@@ -3260,7 +3278,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3260 if (sub != event) 3278 if (sub != event)
3261 sub->pmu->read(sub); 3279 sub->pmu->read(sub);
3262 3280
3263 values[n++] = atomic64_read(&sub->count); 3281 values[n++] = perf_event_count(sub);
3264 if (read_format & PERF_FORMAT_ID) 3282 if (read_format & PERF_FORMAT_ID)
3265 values[n++] = primary_event_id(sub); 3283 values[n++] = primary_event_id(sub);
3266 3284
@@ -3491,7 +3509,7 @@ perf_event_read_event(struct perf_event *event,
3491/* 3509/*
3492 * task tracking -- fork/exit 3510 * task tracking -- fork/exit
3493 * 3511 *
3494 * enabled by: attr.comm | attr.mmap | attr.task 3512 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3495 */ 3513 */
3496 3514
3497struct perf_task_event { 3515struct perf_task_event {
@@ -3541,7 +3559,8 @@ static int perf_event_task_match(struct perf_event *event)
3541 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3559 if (event->cpu != -1 && event->cpu != smp_processor_id())
3542 return 0; 3560 return 0;
3543 3561
3544 if (event->attr.comm || event->attr.mmap || event->attr.task) 3562 if (event->attr.comm || event->attr.mmap ||
3563 event->attr.mmap_data || event->attr.task)
3545 return 1; 3564 return 1;
3546 3565
3547 return 0; 3566 return 0;
@@ -3766,7 +3785,8 @@ static void perf_event_mmap_output(struct perf_event *event,
3766} 3785}
3767 3786
3768static int perf_event_mmap_match(struct perf_event *event, 3787static int perf_event_mmap_match(struct perf_event *event,
3769 struct perf_mmap_event *mmap_event) 3788 struct perf_mmap_event *mmap_event,
3789 int executable)
3770{ 3790{
3771 if (event->state < PERF_EVENT_STATE_INACTIVE) 3791 if (event->state < PERF_EVENT_STATE_INACTIVE)
3772 return 0; 3792 return 0;
@@ -3774,19 +3794,21 @@ static int perf_event_mmap_match(struct perf_event *event,
3774 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3794 if (event->cpu != -1 && event->cpu != smp_processor_id())
3775 return 0; 3795 return 0;
3776 3796
3777 if (event->attr.mmap) 3797 if ((!executable && event->attr.mmap_data) ||
3798 (executable && event->attr.mmap))
3778 return 1; 3799 return 1;
3779 3800
3780 return 0; 3801 return 0;
3781} 3802}
3782 3803
3783static void perf_event_mmap_ctx(struct perf_event_context *ctx, 3804static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3784 struct perf_mmap_event *mmap_event) 3805 struct perf_mmap_event *mmap_event,
3806 int executable)
3785{ 3807{
3786 struct perf_event *event; 3808 struct perf_event *event;
3787 3809
3788 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3810 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3789 if (perf_event_mmap_match(event, mmap_event)) 3811 if (perf_event_mmap_match(event, mmap_event, executable))
3790 perf_event_mmap_output(event, mmap_event); 3812 perf_event_mmap_output(event, mmap_event);
3791 } 3813 }
3792} 3814}
@@ -3830,6 +3852,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3830 if (!vma->vm_mm) { 3852 if (!vma->vm_mm) {
3831 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3853 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3832 goto got_name; 3854 goto got_name;
3855 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
3856 vma->vm_end >= vma->vm_mm->brk) {
3857 name = strncpy(tmp, "[heap]", sizeof(tmp));
3858 goto got_name;
3859 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
3860 vma->vm_end >= vma->vm_mm->start_stack) {
3861 name = strncpy(tmp, "[stack]", sizeof(tmp));
3862 goto got_name;
3833 } 3863 }
3834 3864
3835 name = strncpy(tmp, "//anon", sizeof(tmp)); 3865 name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3846,17 +3876,17 @@ got_name:
3846 3876
3847 rcu_read_lock(); 3877 rcu_read_lock();
3848 cpuctx = &get_cpu_var(perf_cpu_context); 3878 cpuctx = &get_cpu_var(perf_cpu_context);
3849 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3879 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
3850 ctx = rcu_dereference(current->perf_event_ctxp); 3880 ctx = rcu_dereference(current->perf_event_ctxp);
3851 if (ctx) 3881 if (ctx)
3852 perf_event_mmap_ctx(ctx, mmap_event); 3882 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
3853 put_cpu_var(perf_cpu_context); 3883 put_cpu_var(perf_cpu_context);
3854 rcu_read_unlock(); 3884 rcu_read_unlock();
3855 3885
3856 kfree(buf); 3886 kfree(buf);
3857} 3887}
3858 3888
3859void __perf_event_mmap(struct vm_area_struct *vma) 3889void perf_event_mmap(struct vm_area_struct *vma)
3860{ 3890{
3861 struct perf_mmap_event mmap_event; 3891 struct perf_mmap_event mmap_event;
3862 3892
@@ -4018,14 +4048,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
4018 hwc->last_period = hwc->sample_period; 4048 hwc->last_period = hwc->sample_period;
4019 4049
4020again: 4050again:
4021 old = val = atomic64_read(&hwc->period_left); 4051 old = val = local64_read(&hwc->period_left);
4022 if (val < 0) 4052 if (val < 0)
4023 return 0; 4053 return 0;
4024 4054
4025 nr = div64_u64(period + val, period); 4055 nr = div64_u64(period + val, period);
4026 offset = nr * period; 4056 offset = nr * period;
4027 val -= offset; 4057 val -= offset;
4028 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) 4058 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4029 goto again; 4059 goto again;
4030 4060
4031 return nr; 4061 return nr;
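
perf_swevent_set_period() keeps period_left as a countdown that normally sits in (-period, 0]; once an event pushes it to zero or above, the number of whole periods that elapsed is (period + val) / period and the countdown is rewound by that many periods in one cmpxchg. The arithmetic, with made-up numbers:

/* sketch: how many sample periods elapsed when the countdown crosses zero */
#include <stdio.h>

int main(void)
{
	long long period = 1000;    /* attr.sample_period                     */
	long long val    = 2500;    /* period_left after it went non-negative */

	long long nr     = (period + val) / period;  /* whole periods that elapsed */
	long long offset = nr * period;
	val -= offset;                               /* rewind the countdown       */

	printf("nr=%lld, new period_left=%lld\n", nr, val);  /* nr=3, -500 */
	return 0;
}
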
@@ -4064,7 +4094,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4064{ 4094{
4065 struct hw_perf_event *hwc = &event->hw; 4095 struct hw_perf_event *hwc = &event->hw;
4066 4096
4067 atomic64_add(nr, &event->count); 4097 local64_add(nr, &event->count);
4068 4098
4069 if (!regs) 4099 if (!regs)
4070 return; 4100 return;
@@ -4075,7 +4105,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4075 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4105 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4076 return perf_swevent_overflow(event, 1, nmi, data, regs); 4106 return perf_swevent_overflow(event, 1, nmi, data, regs);
4077 4107
4078 if (atomic64_add_negative(nr, &hwc->period_left)) 4108 if (local64_add_negative(nr, &hwc->period_left))
4079 return; 4109 return;
4080 4110
4081 perf_swevent_overflow(event, 0, nmi, data, regs); 4111 perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4213,14 +4243,12 @@ int perf_swevent_get_recursion_context(void)
4213} 4243}
4214EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4244EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4215 4245
4216void perf_swevent_put_recursion_context(int rctx) 4246void inline perf_swevent_put_recursion_context(int rctx)
4217{ 4247{
4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4219 barrier(); 4249 barrier();
4220 cpuctx->recursion[rctx]--; 4250 cpuctx->recursion[rctx]--;
4221} 4251}
4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4223
4224 4252
4225void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4253void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4226 struct pt_regs *regs, u64 addr) 4254 struct pt_regs *regs, u64 addr)
@@ -4368,8 +4396,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4368 u64 now; 4396 u64 now;
4369 4397
4370 now = cpu_clock(cpu); 4398 now = cpu_clock(cpu);
4371 prev = atomic64_xchg(&event->hw.prev_count, now); 4399 prev = local64_xchg(&event->hw.prev_count, now);
4372 atomic64_add(now - prev, &event->count); 4400 local64_add(now - prev, &event->count);
4373} 4401}
4374 4402
4375static int cpu_clock_perf_event_enable(struct perf_event *event) 4403static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4377,7 +4405,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
4377 struct hw_perf_event *hwc = &event->hw; 4405 struct hw_perf_event *hwc = &event->hw;
4378 int cpu = raw_smp_processor_id(); 4406 int cpu = raw_smp_processor_id();
4379 4407
4380 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4408 local64_set(&hwc->prev_count, cpu_clock(cpu));
4381 perf_swevent_start_hrtimer(event); 4409 perf_swevent_start_hrtimer(event);
4382 4410
4383 return 0; 4411 return 0;
@@ -4409,9 +4437,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4409 u64 prev; 4437 u64 prev;
4410 s64 delta; 4438 s64 delta;
4411 4439
4412 prev = atomic64_xchg(&event->hw.prev_count, now); 4440 prev = local64_xchg(&event->hw.prev_count, now);
4413 delta = now - prev; 4441 delta = now - prev;
4414 atomic64_add(delta, &event->count); 4442 local64_add(delta, &event->count);
4415} 4443}
4416 4444
4417static int task_clock_perf_event_enable(struct perf_event *event) 4445static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4421,7 +4449,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
4421 4449
4422 now = event->ctx->time; 4450 now = event->ctx->time;
4423 4451
4424 atomic64_set(&hwc->prev_count, now); 4452 local64_set(&hwc->prev_count, now);
4425 4453
4426 perf_swevent_start_hrtimer(event); 4454 perf_swevent_start_hrtimer(event);
4427 4455
@@ -4601,7 +4629,7 @@ static int perf_tp_event_match(struct perf_event *event,
4601} 4629}
4602 4630
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 4631void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head) 4632 struct pt_regs *regs, struct hlist_head *head, int rctx)
4605{ 4633{
4606 struct perf_sample_data data; 4634 struct perf_sample_data data;
4607 struct perf_event *event; 4635 struct perf_event *event;
@@ -4615,12 +4643,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4615 perf_sample_data_init(&data, addr); 4643 perf_sample_data_init(&data, addr);
4616 data.raw = &raw; 4644 data.raw = &raw;
4617 4645
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4646 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs)) 4647 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs); 4648 perf_swevent_add(event, count, 1, &data, regs);
4622 } 4649 }
4623 rcu_read_unlock(); 4650
4651 perf_swevent_put_recursion_context(rctx);
4624} 4652}
4625EXPORT_SYMBOL_GPL(perf_tp_event); 4653EXPORT_SYMBOL_GPL(perf_tp_event);
4626 4654
@@ -4864,7 +4892,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 hwc->sample_period = 1; 4892 hwc->sample_period = 1;
4865 hwc->last_period = hwc->sample_period; 4893 hwc->last_period = hwc->sample_period;
4866 4894
4867 atomic64_set(&hwc->period_left, hwc->sample_period); 4895 local64_set(&hwc->period_left, hwc->sample_period);
4868 4896
4869 /* 4897 /*
4870 * we currently do not support PERF_FORMAT_GROUP on inherited events 4898 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4913,7 +4941,7 @@ done:
4913 4941
4914 if (!event->parent) { 4942 if (!event->parent) {
4915 atomic_inc(&nr_events); 4943 atomic_inc(&nr_events);
4916 if (event->attr.mmap) 4944 if (event->attr.mmap || event->attr.mmap_data)
4917 atomic_inc(&nr_mmap_events); 4945 atomic_inc(&nr_mmap_events);
4918 if (event->attr.comm) 4946 if (event->attr.comm)
4919 atomic_inc(&nr_comm_events); 4947 atomic_inc(&nr_comm_events);
@@ -5007,7 +5035,7 @@ err_size:
5007static int 5035static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5036perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5009{ 5037{
5010 struct perf_mmap_data *data = NULL, *old_data = NULL; 5038 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5011 int ret = -EINVAL; 5039 int ret = -EINVAL;
5012 5040
5013 if (!output_event) 5041 if (!output_event)
@@ -5037,19 +5065,19 @@ set:
5037 5065
5038 if (output_event) { 5066 if (output_event) {
5039 /* get the buffer we want to redirect to */ 5067 /* get the buffer we want to redirect to */
5040 data = perf_mmap_data_get(output_event); 5068 buffer = perf_buffer_get(output_event);
5041 if (!data) 5069 if (!buffer)
5042 goto unlock; 5070 goto unlock;
5043 } 5071 }
5044 5072
5045 old_data = event->data; 5073 old_buffer = event->buffer;
5046 rcu_assign_pointer(event->data, data); 5074 rcu_assign_pointer(event->buffer, buffer);
5047 ret = 0; 5075 ret = 0;
5048unlock: 5076unlock:
5049 mutex_unlock(&event->mmap_mutex); 5077 mutex_unlock(&event->mmap_mutex);
5050 5078
5051 if (old_data) 5079 if (old_buffer)
5052 perf_mmap_data_put(old_data); 5080 perf_buffer_put(old_buffer);
5053out: 5081out:
5054 return ret; 5082 return ret;
5055} 5083}
@@ -5298,7 +5326,7 @@ inherit_event(struct perf_event *parent_event,
5298 hwc->sample_period = sample_period; 5326 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period; 5327 hwc->last_period = sample_period;
5300 5328
5301 atomic64_set(&hwc->period_left, sample_period); 5329 local64_set(&hwc->period_left, sample_period);
5302 } 5330 }
5303 5331
5304 child_event->overflow_handler = parent_event->overflow_handler; 5332 child_event->overflow_handler = parent_event->overflow_handler;
@@ -5359,12 +5387,12 @@ static void sync_child_event(struct perf_event *child_event,
5359 if (child_event->attr.inherit_stat) 5387 if (child_event->attr.inherit_stat)
5360 perf_event_read_event(child_event, child); 5388 perf_event_read_event(child_event, child);
5361 5389
5362 child_val = atomic64_read(&child_event->count); 5390 child_val = perf_event_count(child_event);
5363 5391
5364 /* 5392 /*
5365 * Add back the child's count to the parent's count: 5393 * Add back the child's count to the parent's count:
5366 */ 5394 */
5367 atomic64_add(child_val, &parent_event->count); 5395 atomic64_add(child_val, &parent_event->child_count);
5368 atomic64_add(child_event->total_time_enabled, 5396 atomic64_add(child_event->total_time_enabled,
5369 &parent_event->child_total_time_enabled); 5397 &parent_event->child_total_time_enabled);
5370 atomic64_add(child_event->total_time_running, 5398 atomic64_add(child_event->total_time_running,
@@ -5733,15 +5761,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5733{ 5761{
5734 unsigned int cpu = (long)hcpu; 5762 unsigned int cpu = (long)hcpu;
5735 5763
5736 switch (action) { 5764 switch (action & ~CPU_TASKS_FROZEN) {
5737 5765
5738 case CPU_UP_PREPARE: 5766 case CPU_UP_PREPARE:
5739 case CPU_UP_PREPARE_FROZEN: 5767 case CPU_DOWN_FAILED:
5740 perf_event_init_cpu(cpu); 5768 perf_event_init_cpu(cpu);
5741 break; 5769 break;
5742 5770
5771 case CPU_UP_CANCELED:
5743 case CPU_DOWN_PREPARE: 5772 case CPU_DOWN_PREPARE:
5744 case CPU_DOWN_PREPARE_FROZEN:
5745 perf_event_exit_cpu(cpu); 5773 perf_event_exit_cpu(cpu);
5746 break; 5774 break;
5747 5775
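
Masking with ~CPU_TASKS_FROZEN folds the *_FROZEN hotplug notifications sent during suspend/resume onto their normal counterparts, so a single case label now covers both. A sketch of the encoding; the constant values below are taken from the hotplug headers of this era and should be treated as illustrative:

/* sketch: FROZEN hotplug events are the normal event with one extra bit set */
#include <stdio.h>

#define CPU_UP_PREPARE        0x0003
#define CPU_TASKS_FROZEN      0x0010
#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN)

static const char *decode(unsigned long action)
{
	switch (action & ~CPU_TASKS_FROZEN) {   /* same masking as perf_cpu_notify() */
	case CPU_UP_PREPARE:
		return "up-prepare";
	default:
		return "other";
	}
}

int main(void)
{
	printf("%s / %s\n", decode(CPU_UP_PREPARE), decode(CPU_UP_PREPARE_FROZEN));
	return 0;
}
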
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c132d26..d55c6fb8d087 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid)
122 atomic_inc(&map->nr_free); 122 atomic_inc(&map->nr_free);
123} 123}
124 124
125/*
126 * If we started walking pids at 'base', is 'a' seen before 'b'?
127 */
128static int pid_before(int base, int a, int b)
129{
130 /*
131 * This is the same as saying
132 *
133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
134 * and that mapping orders 'a' and 'b' with respect to 'base'.
135 */
136 return (unsigned)(a - base) < (unsigned)(b - base);
137}
138
139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid.
141 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately.
143 *
144 * Since pids rollover, it is not sufficient to just pick the bigger
145 * value. We have to consider where we started counting from.
146 *
147 * 'base' is the value of pid_ns->last_pid that we observed when
148 * we started looking for a pid.
149 *
150 * 'pid' is the pid that we eventually found.
151 */
152static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
153{
154 int prev;
155 int last_write = base;
156 do {
157 prev = last_write;
158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
159 } while ((prev != last_write) && (pid_before(base, last_write, pid)));
160}
161
125static int alloc_pidmap(struct pid_namespace *pid_ns) 162static int alloc_pidmap(struct pid_namespace *pid_ns)
126{ 163{
127 int i, offset, max_scan, pid, last = pid_ns->last_pid; 164 int i, offset, max_scan, pid, last = pid_ns->last_pid;
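
pid_before() orders two candidate pids relative to where the walk started, so wraparound is handled by unsigned arithmetic, and set_last_pid() retries its cmpxchg only while the value it lost to is still "earlier" than its own. A userspace sketch of both helpers, with cmpxchg() emulated via C11 compare-and-exchange (names mirror the hunk above):

/* sketch: wraparound-aware "later pid wins" update */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int last_pid;

/* starting the walk at 'base', is 'a' reached before 'b'? (unsigned wrap does the modulo) */
static int pid_before(int base, int a, int b)
{
	return (unsigned)(a - base) < (unsigned)(b - base);
}

static int cmpxchg_int(atomic_int *p, int old, int new)   /* returns the prior value, like cmpxchg() */
{
	atomic_compare_exchange_strong(p, &old, new);
	return old;
}

static void set_last_pid(int base, int pid)
{
	int prev, last_write = base;
	do {
		prev = last_write;
		last_write = cmpxchg_int(&last_pid, prev, pid);
	} while (prev != last_write && pid_before(base, last_write, pid));
}

int main(void)
{
	atomic_init(&last_pid, 300);
	/* two racing allocators that both started from base 300 */
	set_last_pid(300, 305);               /* winner so far: 305                              */
	set_last_pid(300, 302);               /* loses: 302 is before 305 relative to base 300   */
	printf("last_pid = %d\n", atomic_load(&last_pid));   /* prints 305 */
	return 0;
}
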
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
132 pid = RESERVED_PIDS; 169 pid = RESERVED_PIDS;
133 offset = pid & BITS_PER_PAGE_MASK; 170 offset = pid & BITS_PER_PAGE_MASK;
134 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
135 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 172 /*
173 * If last_pid points into the middle of the map->page we
174 * want to scan this bitmap block twice, the second time
175 * we start with offset == 0 (or RESERVED_PIDS).
176 */
177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
136 for (i = 0; i <= max_scan; ++i) { 178 for (i = 0; i <= max_scan; ++i) {
137 if (unlikely(!map->page)) { 179 if (unlikely(!map->page)) {
138 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
154 do { 196 do {
155 if (!test_and_set_bit(offset, map->page)) { 197 if (!test_and_set_bit(offset, map->page)) {
156 atomic_dec(&map->nr_free); 198 atomic_dec(&map->nr_free);
157 pid_ns->last_pid = pid; 199 set_last_pid(pid_ns, last, pid);
158 return pid; 200 return pid;
159 } 201 }
160 offset = find_next_offset(map, offset); 202 offset = find_next_offset(map, offset);
161 pid = mk_pid(pid_ns, map, offset); 203 pid = mk_pid(pid_ns, map, offset);
162 /* 204 } while (offset < BITS_PER_PAGE && pid < pid_max);
163 * find_next_offset() found a bit, the pid from it
164 * is in-bounds, and if we fell back to the last
165 * bitmap block and the final block was the same
166 * as the starting point, pid is before last_pid.
167 */
168 } while (offset < BITS_PER_PAGE && pid < pid_max &&
169 (i != max_scan || pid < last ||
170 !((last+1) & BITS_PER_PAGE_MASK)));
171 } 205 }
 172 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
 173 ++map; 207 ++map;
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 996a4dec5f96..645e541a45f6 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -212,15 +212,17 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
212 212
213/** 213/**
214 * pm_qos_add_request - inserts new qos request into the list 214 * pm_qos_add_request - inserts new qos request into the list
215 * @pm_qos_class: identifies which list of qos request to us 215 * @dep: pointer to a preallocated handle
216 * @pm_qos_class: identifies which list of qos request to use
216 * @value: defines the qos request 217 * @value: defines the qos request
217 * 218 *
218 * This function inserts a new entry in the pm_qos_class list of requested qos 219 * This function inserts a new entry in the pm_qos_class list of requested qos
219 * performance characteristics. It recomputes the aggregate QoS expectations 220 * performance characteristics. It recomputes the aggregate QoS expectations
220 * for the pm_qos_class of parameters, and returns the pm_qos_request list 221 * for the pm_qos_class of parameters and initializes the pm_qos_request_list
221 * element as a handle for use in updating and removal. Call needs to save 222 * handle. Caller needs to save this handle for later use in updates and
222 * this handle for later use. 223 * removal.
223 */ 224 */
225
224void pm_qos_add_request(struct pm_qos_request_list *dep, 226void pm_qos_add_request(struct pm_qos_request_list *dep,
225 int pm_qos_class, s32 value) 227 int pm_qos_class, s32 value)
226{ 228{
@@ -348,7 +350,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
348 350
349 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 351 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
350 if (pm_qos_class >= 0) { 352 if (pm_qos_class >= 0) {
351 struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); 353 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
352 if (!req) 354 if (!req)
353 return -ENOMEM; 355 return -ENOMEM;
354 356
@@ -387,10 +389,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
387 } else if (count == 11) { /* len('0x12345678/0') */ 389 } else if (count == 11) { /* len('0x12345678/0') */
388 if (copy_from_user(ascii_value, buf, 11)) 390 if (copy_from_user(ascii_value, buf, 11))
389 return -EFAULT; 391 return -EFAULT;
392 if (strlen(ascii_value) != 10)
393 return -EINVAL;
390 x = sscanf(ascii_value, "%x", &value); 394 x = sscanf(ascii_value, "%x", &value);
391 if (x != 1) 395 if (x != 1)
392 return -EINVAL; 396 return -EINVAL;
393 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); 397 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
394 } else 398 } else
395 return -EINVAL; 399 return -EINVAL;
396 400
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..6842eeba5879 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -16,13 +16,13 @@
16 * siglock protection since other code may update expiration cache as 16 * siglock protection since other code may update expiration cache as
17 * well. 17 * well.
18 */ 18 */
19void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
20{ 20{
21 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
22 22
23 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&task->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&task->sighand->siglock);
26} 26}
27 27
28static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
232 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{ 234{
235 struct sighand_struct *sighand; 235 struct signal_struct *sig = tsk->signal;
236 struct signal_struct *sig;
237 struct task_struct *t; 236 struct task_struct *t;
238 237
239 *times = INIT_CPUTIME; 238 times->utime = sig->utime;
239 times->stime = sig->stime;
240 times->sum_exec_runtime = sig->sum_sched_runtime;
240 241
241 rcu_read_lock(); 242 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand); 243 /* make sure we can trust tsk->thread_group list */
243 if (!sighand) 244 if (!likely(pid_alive(tsk)))
244 goto out; 245 goto out;
245 246
246 sig = tsk->signal;
247
248 t = tsk; 247 t = tsk;
249 do { 248 do {
250 times->utime = cputime_add(times->utime, t->utime); 249 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime); 250 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime; 251 times->sum_exec_runtime += t->se.sum_exec_runtime;
253 252 } while_each_thread(tsk, t);
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out: 253out:
261 rcu_read_unlock(); 254 rcu_read_unlock();
262} 255}
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1279{ 1272{
1280 struct signal_struct *sig; 1273 struct signal_struct *sig;
1281 1274
1282 /* tsk == current, ensure it is safe to use ->signal/sighand */
1283 if (unlikely(tsk->exit_state))
1284 return 0;
1285
1286 if (!task_cputime_zero(&tsk->cputime_expires)) { 1275 if (!task_cputime_zero(&tsk->cputime_expires)) {
1287 struct task_cputime task_sample = { 1276 struct task_cputime task_sample = {
1288 .utime = tsk->utime, 1277 .utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1298 if (sig->cputimer.running) { 1287 if (sig->cputimer.running) {
1299 struct task_cputime group_sample; 1288 struct task_cputime group_sample;
1300 1289
1301 thread_group_cputimer(tsk, &group_sample); 1290 spin_lock(&sig->cputimer.lock);
1291 group_sample = sig->cputimer.cputime;
1292 spin_unlock(&sig->cputimer.lock);
1293
1302 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1294 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1303 return 1; 1295 return 1;
1304 } 1296 }
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1315{ 1307{
1316 LIST_HEAD(firing); 1308 LIST_HEAD(firing);
1317 struct k_itimer *timer, *next; 1309 struct k_itimer *timer, *next;
1310 unsigned long flags;
1318 1311
1319 BUG_ON(!irqs_disabled()); 1312 BUG_ON(!irqs_disabled());
1320 1313
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1325 if (!fastpath_timer_check(tsk)) 1318 if (!fastpath_timer_check(tsk))
1326 return; 1319 return;
1327 1320
1328 spin_lock(&tsk->sighand->siglock); 1321 if (!lock_task_sighand(tsk, &flags))
1322 return;
1329 /* 1323 /*
1330 * Here we take off tsk->signal->cpu_timers[N] and 1324 * Here we take off tsk->signal->cpu_timers[N] and
1331 * tsk->cpu_timers[N] all the timers that are firing, and 1325 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1347 * that gets the timer lock before we do will give it up and 1341 * that gets the timer lock before we do will give it up and
1348 * spin until we've taken care of that timer below. 1342 * spin until we've taken care of that timer below.
1349 */ 1343 */
1350 spin_unlock(&tsk->sighand->siglock); 1344 unlock_task_sighand(tsk, &flags);
1351 1345
1352 /* 1346 /*
1353 * Now that all the timers on our list have the firing flag, 1347 * Now that all the timers on our list have the firing flag,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad723420acc3..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 562
563 if (copy_to_user(created_timer_id,
564 &new_timer_id, sizeof (new_timer_id))) {
565 error = -EFAULT;
566 goto out;
567 }
568 if (timer_event_spec) { 563 if (timer_event_spec) {
569 if (copy_from_user(&event, timer_event_spec, sizeof (event))) { 564 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
570 error = -EFAULT; 565 error = -EFAULT;
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
590 new_timer->sigq->info.si_tid = new_timer->it_id; 585 new_timer->sigq->info.si_tid = new_timer->it_id;
591 new_timer->sigq->info.si_code = SI_TIMER; 586 new_timer->sigq->info.si_code = SI_TIMER;
592 587
588 if (copy_to_user(created_timer_id,
589 &new_timer_id, sizeof (new_timer_id))) {
590 error = -EFAULT;
591 goto out;
592 }
593
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error) 595 if (error)
595 goto out; 596 goto out;
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 97024fd40cd5..83bbc7c02df9 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index e8b337006276..d52359374e85 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy)
24 24
25static DECLARE_WORK(poweroff_work, do_poweroff); 25static DECLARE_WORK(poweroff_work, do_poweroff);
26 26
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..028a99598f49 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h>
18 19
19/* 20/*
20 * Timeout for stopping processes 21 * Timeout for stopping processes
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only)
35 struct task_struct *g, *p; 36 struct task_struct *g, *p;
36 unsigned long end_time; 37 unsigned long end_time;
37 unsigned int todo; 38 unsigned int todo;
39 bool wq_busy = false;
38 struct timeval start, end; 40 struct timeval start, end;
39 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
40 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only)
42 do_gettimeofday(&start); 44 do_gettimeofday(&start);
43 45
44 end_time = jiffies + TIMEOUT; 46 end_time = jiffies + TIMEOUT;
47
48 if (!sig_only)
49 freeze_workqueues_begin();
50
45 while (true) { 51 while (true) {
46 todo = 0; 52 todo = 0;
47 read_lock(&tasklist_lock); 53 read_lock(&tasklist_lock);
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only)
63 todo++; 69 todo++;
64 } while_each_thread(g, p); 70 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 71 read_unlock(&tasklist_lock);
72
73 if (!sig_only) {
74 wq_busy = freeze_workqueues_busy();
75 todo += wq_busy;
76 }
77
66 if (!todo || time_after(jiffies, end_time)) 78 if (!todo || time_after(jiffies, end_time))
67 break; 79 break;
68 80
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only)
86 */ 98 */
87 printk("\n"); 99 printk("\n");
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 101 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 102 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy);
104
105 thaw_workqueues();
106
91 read_lock(&tasklist_lock); 107 read_lock(&tasklist_lock);
92 do_each_thread(g, p) { 108 do_each_thread(g, p) {
93 task_lock(p); 109 task_lock(p);
@@ -157,6 +173,7 @@ void thaw_processes(void)
157 oom_killer_enable(); 173 oom_killer_enable();
158 174
159 printk("Restarting tasks ... "); 175 printk("Restarting tasks ... ");
176 thaw_workqueues();
160 thaw_tasks(true); 177 thaw_tasks(true);
161 thaw_tasks(false); 178 thaw_tasks(false);
162 schedule(); 179 schedule();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f6cd6faf84fd..d3f795f01bbc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1121,9 +1121,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1121 return nr_alloc; 1121 return nr_alloc;
1122} 1122}
1123 1123
1124static unsigned long preallocate_image_memory(unsigned long nr_pages) 1124static unsigned long preallocate_image_memory(unsigned long nr_pages,
1125 unsigned long avail_normal)
1125{ 1126{
1126 return preallocate_image_pages(nr_pages, GFP_IMAGE); 1127 unsigned long alloc;
1128
1129 if (avail_normal <= alloc_normal)
1130 return 0;
1131
1132 alloc = avail_normal - alloc_normal;
1133 if (nr_pages < alloc)
1134 alloc = nr_pages;
1135
1136 return preallocate_image_pages(alloc, GFP_IMAGE);
1127} 1137}
1128 1138
1129#ifdef CONFIG_HIGHMEM 1139#ifdef CONFIG_HIGHMEM
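
The reworked preallocate_image_memory() clamps every request to the normal-zone budget: nothing if alloc_normal has already consumed avail_normal, otherwise at most the remaining headroom. A tiny sketch of just that clamping, with illustrative page counts:

/* sketch: clamp a page request to the remaining normal-zone budget */
#include <stdio.h>

static unsigned long clamp_request(unsigned long nr_pages,
				   unsigned long avail_normal,
				   unsigned long alloc_normal)
{
	unsigned long alloc;

	if (avail_normal <= alloc_normal)
		return 0;                     /* budget already spent */
	alloc = avail_normal - alloc_normal;  /* remaining headroom   */
	if (nr_pages < alloc)
		alloc = nr_pages;
	return alloc;                         /* what preallocate_image_pages() would be asked for */
}

int main(void)
{
	printf("%lu %lu %lu\n",
	       clamp_request(500, 1000, 800),   /* 200: limited by headroom      */
	       clamp_request(100, 1000, 800),   /* 100: request fits             */
	       clamp_request(100, 1000, 1000)); /* 0: nothing left in the budget */
	return 0;
}
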
@@ -1169,15 +1179,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1169 */ 1179 */
1170static void free_unnecessary_pages(void) 1180static void free_unnecessary_pages(void)
1171{ 1181{
1172 unsigned long save_highmem, to_free_normal, to_free_highmem; 1182 unsigned long save, to_free_normal, to_free_highmem;
1173 1183
1174 to_free_normal = alloc_normal - count_data_pages(); 1184 save = count_data_pages();
1175 save_highmem = count_highmem_pages(); 1185 if (alloc_normal >= save) {
1176 if (alloc_highmem > save_highmem) { 1186 to_free_normal = alloc_normal - save;
1177 to_free_highmem = alloc_highmem - save_highmem; 1187 save = 0;
1188 } else {
1189 to_free_normal = 0;
1190 save -= alloc_normal;
1191 }
1192 save += count_highmem_pages();
1193 if (alloc_highmem >= save) {
1194 to_free_highmem = alloc_highmem - save;
1178 } else { 1195 } else {
1179 to_free_highmem = 0; 1196 to_free_highmem = 0;
1180 to_free_normal -= save_highmem - alloc_highmem; 1197 to_free_normal -= save - alloc_highmem;
1181 } 1198 }
1182 1199
1183 memory_bm_position_reset(&copy_bm); 1200 memory_bm_position_reset(&copy_bm);
@@ -1258,7 +1275,7 @@ int hibernate_preallocate_memory(void)
1258{ 1275{
1259 struct zone *zone; 1276 struct zone *zone;
1260 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1277 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1261 unsigned long alloc, save_highmem, pages_highmem; 1278 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1262 struct timeval start, stop; 1279 struct timeval start, stop;
1263 int error; 1280 int error;
1264 1281
@@ -1295,6 +1312,7 @@ int hibernate_preallocate_memory(void)
1295 else 1312 else
1296 count += zone_page_state(zone, NR_FREE_PAGES); 1313 count += zone_page_state(zone, NR_FREE_PAGES);
1297 } 1314 }
1315 avail_normal = count;
1298 count += highmem; 1316 count += highmem;
1299 count -= totalreserve_pages; 1317 count -= totalreserve_pages;
1300 1318
@@ -1309,12 +1327,21 @@ int hibernate_preallocate_memory(void)
1309 */ 1327 */
1310 if (size >= saveable) { 1328 if (size >= saveable) {
1311 pages = preallocate_image_highmem(save_highmem); 1329 pages = preallocate_image_highmem(save_highmem);
1312 pages += preallocate_image_memory(saveable - pages); 1330 pages += preallocate_image_memory(saveable - pages, avail_normal);
1313 goto out; 1331 goto out;
1314 } 1332 }
1315 1333
1316 /* Estimate the minimum size of the image. */ 1334 /* Estimate the minimum size of the image. */
1317 pages = minimum_image_size(saveable); 1335 pages = minimum_image_size(saveable);
1336 /*
1337 * To avoid excessive pressure on the normal zone, leave room in it to
1338 * accommodate an image of the minimum size (unless it's already too
1339 * small, in which case don't preallocate pages from it at all).
1340 */
1341 if (avail_normal > pages)
1342 avail_normal -= pages;
1343 else
1344 avail_normal = 0;
1318 if (size < pages) 1345 if (size < pages)
1319 size = min_t(unsigned long, pages, max_size); 1346 size = min_t(unsigned long, pages, max_size);
1320 1347
@@ -1335,16 +1362,34 @@ int hibernate_preallocate_memory(void)
1335 */ 1362 */
1336 pages_highmem = preallocate_image_highmem(highmem / 2); 1363 pages_highmem = preallocate_image_highmem(highmem / 2);
1337 alloc = (count - max_size) - pages_highmem; 1364 alloc = (count - max_size) - pages_highmem;
1338 pages = preallocate_image_memory(alloc); 1365 pages = preallocate_image_memory(alloc, avail_normal);
1339 if (pages < alloc) 1366 if (pages < alloc) {
1340 goto err_out; 1367 /* We have exhausted non-highmem pages, try highmem. */
1341 size = max_size - size; 1368 alloc -= pages;
1342 alloc = size; 1369 pages += pages_highmem;
1343 size = preallocate_highmem_fraction(size, highmem, count); 1370 pages_highmem = preallocate_image_highmem(alloc);
1344 pages_highmem += size; 1371 if (pages_highmem < alloc)
1345 alloc -= size; 1372 goto err_out;
1346 pages += preallocate_image_memory(alloc); 1373 pages += pages_highmem;
1347 pages += pages_highmem; 1374 /*
1375 * size is the desired number of saveable pages to leave in
1376 * memory, so try to preallocate (all memory - size) pages.
1377 */
1378 alloc = (count - pages) - size;
1379 pages += preallocate_image_highmem(alloc);
1380 } else {
1381 /*
1382 * There are approximately max_size saveable pages at this point
1383 * and we want to reduce this number down to size.
1384 */
1385 alloc = max_size - size;
1386 size = preallocate_highmem_fraction(alloc, highmem, count);
1387 pages_highmem += size;
1388 alloc -= size;
1389 size = preallocate_image_memory(alloc, avail_normal);
1390 pages_highmem += preallocate_image_highmem(alloc - size);
1391 pages += pages_highmem + size;
1392 }
1348 1393
1349 /* 1394 /*
1350 * We only need as many page frames for the image as there are saveable 1395 * We only need as many page frames for the image as there are saveable
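[Editor's note: a minimal user-space sketch of the new avail_normal clamp introduced above. The page counts are made up and minimum_image_size() is replaced by a constant; only the arithmetic mirrors the hunk.]

#include <stdio.h>

int main(void)
{
	/* Made-up page counts, only to walk the clamp in the patch. */
	unsigned long avail_normal = 120000;	/* free lowmem pages */
	unsigned long minimum = 30000;		/* stand-in for minimum_image_size() */

	/*
	 * Leave room in the normal zone for an image of the minimum size,
	 * unless the zone is already too small to hold even that.
	 */
	if (avail_normal > minimum)
		avail_normal -= minimum;
	else
		avail_normal = 0;

	printf("lowmem pages we may still preallocate: %lu\n", avail_normal);
	return 0;
}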
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..8fe465ac008a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,8 @@
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42 44
@@ -985,6 +987,32 @@ void resume_console(void)
985} 987}
986 988
987/** 989/**
990 * console_cpu_notify - print deferred console messages after CPU hotplug
991 * @self: notifier struct
992 * @action: CPU hotplug event
993 * @hcpu: unused
994 *
995 * If printk() is called from a CPU that is not online yet, the messages
996 * will be spooled but will not show up on the console. This function is
997 * called when a new CPU comes online (or fails to come up), and ensures
998 * that any such output gets printed.
999 */
1000static int __cpuinit console_cpu_notify(struct notifier_block *self,
1001 unsigned long action, void *hcpu)
1002{
1003 switch (action) {
1004 case CPU_ONLINE:
1005 case CPU_DEAD:
1006 case CPU_DYING:
1007 case CPU_DOWN_FAILED:
1008 case CPU_UP_CANCELED:
1009 acquire_console_sem();
1010 release_console_sem();
1011 }
1012 return NOTIFY_OK;
1013}
1014
1015/**
988 * acquire_console_sem - lock the console system for exclusive use. 1016 * acquire_console_sem - lock the console system for exclusive use.
989 * 1017 *
990 * Acquires a semaphore which guarantees that the caller has 1018 * Acquires a semaphore which guarantees that the caller has
@@ -1371,7 +1399,7 @@ int unregister_console(struct console *console)
1371} 1399}
1372EXPORT_SYMBOL(unregister_console); 1400EXPORT_SYMBOL(unregister_console);
1373 1401
1374static int __init disable_boot_consoles(void) 1402static int __init printk_late_init(void)
1375{ 1403{
1376 struct console *con; 1404 struct console *con;
1377 1405
@@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void)
1382 unregister_console(con); 1410 unregister_console(con);
1383 } 1411 }
1384 } 1412 }
1413 hotcpu_notifier(console_cpu_notify, 0);
1385 return 0; 1414 return 0;
1386} 1415}
1387late_initcall(disable_boot_consoles); 1416late_initcall(printk_late_init);
1388 1417
1389#if defined CONFIG_PRINTK 1418#if defined CONFIG_PRINTK
1390 1419
@@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1520 chars = logged_chars; 1549 chars = logged_chars;
1521 spin_unlock_irqrestore(&logbuf_lock, flags); 1550 spin_unlock_irqrestore(&logbuf_lock, flags);
1522 1551
1523 if (logged_chars > end) { 1552 if (chars > end) {
1524 s1 = log_buf + log_buf_len - logged_chars + end; 1553 s1 = log_buf + log_buf_len - chars + end;
1525 l1 = logged_chars - end; 1554 l1 = chars - end;
1526 1555
1527 s2 = log_buf; 1556 s2 = log_buf;
1528 l2 = end; 1557 l2 = end;
@@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1530 s1 = ""; 1559 s1 = "";
1531 l1 = 0; 1560 l1 = 0;
1532 1561
1533 s2 = log_buf + end - logged_chars; 1562 s2 = log_buf + end - chars;
1534 l2 = logged_chars; 1563 l2 = chars;
1535 } 1564 }
1536 1565
1537 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1566 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
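[Editor's note: the s1/l1/s2/l2 arithmetic in the kmsg_dump() hunk above is the usual split of the last 'chars' bytes of a circular buffer into at most two contiguous segments. A self-contained restatement follows; the names are local to this sketch, not kernel API.]

#include <stdio.h>
#include <stddef.h>

/*
 * Split the last 'chars' bytes of a circular buffer of size 'len' whose
 * write position is 'end' into two contiguous segments, mirroring the
 * fixed kmsg_dump() logic that now uses the 'chars' snapshot throughout.
 */
static void split_ring(const char *buf, size_t len, size_t end, size_t chars,
		       const char **s1, size_t *l1,
		       const char **s2, size_t *l2)
{
	if (chars > end) {
		/* Wrapped: the older part lives at the tail of the buffer. */
		*s1 = buf + len - (chars - end);
		*l1 = chars - end;
		*s2 = buf;
		*l2 = end;
	} else {
		/* Not wrapped: everything sits just before 'end'. */
		*s1 = "";
		*l1 = 0;
		*s2 = buf + end - chars;
		*l2 = chars;
	}
}

int main(void)
{
	char buf[] = "ABCDEFGH";
	const char *s1, *s2;
	size_t l1, l2;

	split_ring(buf, sizeof(buf) - 1, 2, 5, &s1, &l1, &s2, &l2);
	printf("%.*s|%.*s\n", (int)l1, s1, (int)l2, s2);	/* prints "FGH|AB" */
	return 0;
}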
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 74a3d693c196..f34d798ef4a2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -324,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
324} 324}
325 325
326/* 326/*
327 * Detach all tasks we were using ptrace on. 327 * Detach all tasks we were using ptrace on. Called with tasklist held
328 * for writing, and returns with it held too. But note it can release
329 * and reacquire the lock.
328 */ 330 */
329void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
330{ 332{
331 struct task_struct *p, *n; 333 struct task_struct *p, *n;
332 LIST_HEAD(ptrace_dead); 334 LIST_HEAD(ptrace_dead);
333 335
334 write_lock_irq(&tasklist_lock); 336 if (likely(list_empty(&tracer->ptraced)))
337 return;
338
335 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 339 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
336 if (__ptrace_detach(tracer, p)) 340 if (__ptrace_detach(tracer, p))
337 list_add(&p->ptrace_entry, &ptrace_dead); 341 list_add(&p->ptrace_entry, &ptrace_dead);
338 } 342 }
339 write_unlock_irq(&tasklist_lock);
340 343
344 write_unlock_irq(&tasklist_lock);
341 BUG_ON(!list_empty(&tracer->ptraced)); 345 BUG_ON(!list_empty(&tracer->ptraced));
342 346
343 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { 347 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
344 list_del_init(&p->ptrace_entry); 348 list_del_init(&p->ptrace_entry);
345 release_task(p); 349 release_task(p);
346 } 350 }
351
352 write_lock_irq(&tasklist_lock);
347} 353}
348 354
349int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 355int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
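[Editor's note: the new comment above documents a calling contract that is easy to get wrong -- the function is entered and exited with the write lock held, yet the lock is not held continuously. A purely illustrative user-space analogue under pthreads; none of these names are kernel API.]

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t biglock = PTHREAD_RWLOCK_INITIALIZER;

/* Entered with 'biglock' write-held; may drop and retake it internally,
 * but always returns with it held again -- the same shape as the
 * exit_ptrace() contract described in the hunk above. */
static void cleanup_locked(void)
{
	/* ... work that must stay under the lock ... */

	pthread_rwlock_unlock(&biglock);	/* heavy work may block, drop the lock */
	puts("doing release_task()-style work without the lock");
	pthread_rwlock_wrlock(&biglock);	/* reacquire before returning */
}

int main(void)
{
	pthread_rwlock_wrlock(&biglock);
	cleanup_locked();			/* lock dropped and retaken inside */
	pthread_rwlock_unlock(&biglock);
	return 0;
}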
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
7 7
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{ 11{
16 if (start >= end) 12 if (start >= end)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void)
114} 114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); 115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */ 116#endif /* #ifdef CONFIG_PROVE_RCU */
117
118#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
119static inline void debug_init_rcu_head(struct rcu_head *head)
120{
121 debug_object_init(head, &rcuhead_debug_descr);
122}
123
124static inline void debug_rcu_head_free(struct rcu_head *head)
125{
126 debug_object_free(head, &rcuhead_debug_descr);
127}
128
129/*
130 * fixup_init is called when:
131 * - an active object is initialized
132 */
133static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
134{
135 struct rcu_head *head = addr;
136
137 switch (state) {
138 case ODEBUG_STATE_ACTIVE:
139 /*
140 * Ensure that queued callbacks are all executed.
141 * If we detect that we are nested in a RCU read-side critical
142 * section, we should simply fail, otherwise we would deadlock.
143 */
144 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
145 irqs_disabled()) {
146 WARN_ON(1);
147 return 0;
148 }
149 rcu_barrier();
150 rcu_barrier_sched();
151 rcu_barrier_bh();
152 debug_object_init(head, &rcuhead_debug_descr);
153 return 1;
154 default:
155 return 0;
156 }
157}
158
159/*
160 * fixup_activate is called when:
161 * - an active object is activated
162 * - an unknown object is activated (might be a statically initialized object)
163 * Activation is performed internally by call_rcu().
164 */
165static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
166{
167 struct rcu_head *head = addr;
168
169 switch (state) {
170
171 case ODEBUG_STATE_NOTAVAILABLE:
172 /*
173 * This is not really a fixup. We just make sure that it is
174 * tracked in the object tracker.
175 */
176 debug_object_init(head, &rcuhead_debug_descr);
177 debug_object_activate(head, &rcuhead_debug_descr);
178 return 0;
179
180 case ODEBUG_STATE_ACTIVE:
181 /*
182 * Ensure that queued callbacks are all executed.
183 * If we detect that we are nested in a RCU read-side critical
184 * section, we should simply fail, otherwise we would deadlock.
185 */
186 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
187 irqs_disabled()) {
188 WARN_ON(1);
189 return 0;
190 }
191 rcu_barrier();
192 rcu_barrier_sched();
193 rcu_barrier_bh();
194 debug_object_activate(head, &rcuhead_debug_descr);
195 return 1;
196 default:
197 return 0;
198 }
199}
200
201/*
202 * fixup_free is called when:
203 * - an active object is freed
204 */
205static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
206{
207 struct rcu_head *head = addr;
208
209 switch (state) {
210 case ODEBUG_STATE_ACTIVE:
211 /*
212 * Ensure that queued callbacks are all executed.
213 * If we detect that we are nested in a RCU read-side critical
214 * section, we should simply fail, otherwise we would deadlock.
215 */
216#ifndef CONFIG_PREEMPT
217 WARN_ON(1);
218 return 0;
219#else
220 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
221 irqs_disabled()) {
222 WARN_ON(1);
223 return 0;
224 }
225 rcu_barrier();
226 rcu_barrier_sched();
227 rcu_barrier_bh();
228 debug_object_free(head, &rcuhead_debug_descr);
229 return 1;
230#endif
231 default:
232 return 0;
233 }
234}
235
236/**
237 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
238 * @head: pointer to rcu_head structure to be initialized
239 *
240 * This function informs debugobjects of a new rcu_head structure that
241 * has been allocated as an auto variable on the stack. This function
242 * is not required for rcu_head structures that are statically defined or
243 * that are dynamically allocated on the heap. This function has no
244 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
245 */
246void init_rcu_head_on_stack(struct rcu_head *head)
247{
248 debug_object_init_on_stack(head, &rcuhead_debug_descr);
249}
250EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
251
252/**
253 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
254 * @head: pointer to rcu_head structure to be initialized
255 *
256 * This function informs debugobjects that an on-stack rcu_head structure
257 * is about to go out of scope. As with init_rcu_head_on_stack(), this
258 * function is not required for rcu_head structures that are statically
259 * defined or that are dynamically allocated on the heap. Also as with
260 * init_rcu_head_on_stack(), this function has no effect for
261 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
262 */
263void destroy_rcu_head_on_stack(struct rcu_head *head)
264{
265 debug_object_free(head, &rcuhead_debug_descr);
266}
267EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
268
269struct debug_obj_descr rcuhead_debug_descr = {
270 .name = "rcu_head",
271 .fixup_init = rcuhead_fixup_init,
272 .fixup_activate = rcuhead_fixup_activate,
273 .fixup_free = rcuhead_fixup_free,
274};
275EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
276#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
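[Editor's note: a hedged usage sketch for the two new helpers. Only init_rcu_head_on_stack(), destroy_rcu_head_on_stack(), call_rcu() and the completion API are real interfaces here; the wrapper struct and callback are hypothetical.]

#include <linux/rcupdate.h>
#include <linux/completion.h>
#include <linux/kernel.h>

struct stack_wait {
	struct rcu_head head;
	struct completion done;
};

static void stack_wait_cb(struct rcu_head *head)
{
	struct stack_wait *w = container_of(head, struct stack_wait, head);

	complete(&w->done);
}

static void wait_one_grace_period(void)
{
	struct stack_wait w;

	init_completion(&w.done);
	init_rcu_head_on_stack(&w.head);	/* tell debugobjects about the on-stack head */
	call_rcu(&w.head, stack_wait_cb);
	wait_for_completion(&w.done);		/* callback has run, the head is idle again */
	destroy_rcu_head_on_stack(&w.head);	/* head is about to go out of scope */
}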
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 while (list) { 169 while (list) {
170 next = list->next; 170 next = list->next;
171 prefetch(next); 171 prefetch(next);
172 debug_rcu_head_unqueue(list);
172 list->func(list); 173 list->func(list);
173 list = next; 174 list = next;
174 } 175 }
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
211{ 212{
212 unsigned long flags; 213 unsigned long flags;
213 214
215 debug_rcu_head_queue(head);
214 head->func = func; 216 head->func = func;
215 head->next = NULL; 217 head->next = NULL;
216 218
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
239rcu_random(struct rcu_random_state *rrsp) 239rcu_random(struct rcu_random_state *rrsp)
240{ 240{
241 if (--rrsp->rrs_count < 0) { 241 if (--rrsp->rrs_count < 0) {
242 rrsp->rrs_state += 242 rrsp->rrs_state += (unsigned long)local_clock();
243 (unsigned long)cpu_clock(raw_smp_processor_id());
244 rrsp->rrs_count = RCU_RANDOM_REFRESH; 243 rrsp->rrs_count = RCU_RANDOM_REFRESH;
245 } 244 }
246 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 245 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1112 while (list) { 1112 while (list) {
1113 next = list->next; 1113 next = list->next;
1114 prefetch(next); 1114 prefetch(next);
1115 debug_rcu_head_unqueue(list);
1115 list->func(list); 1116 list->func(list);
1116 list = next; 1117 list = next;
1117 if (++count >= rdp->blimit) 1118 if (++count >= rdp->blimit)
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1388 unsigned long flags; 1389 unsigned long flags;
1389 struct rcu_data *rdp; 1390 struct rcu_data *rdp;
1390 1391
1392 debug_rcu_head_queue(head);
1391 head->func = func; 1393 head->func = func;
1392 head->next = NULL; 1394 head->next = NULL;
1393 1395
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..dc85ceb90832 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
456 unsigned long nr_running; 457 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 458 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 459 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
460 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 461#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 462 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 463 unsigned char nohz_balance_kick;
462#endif 464#endif
463 unsigned int skip_clock_update; 465 unsigned int skip_clock_update;
464 466
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
1193 1195
1194#ifdef CONFIG_NO_HZ 1196#ifdef CONFIG_NO_HZ
1195/* 1197/*
1198 * In the semi idle case, use the nearest busy cpu for migrating timers
1199 * from an idle cpu. This is good for power-savings.
1200 *
1201 * We don't do similar optimization for completely idle system, as
1202 * selecting an idle cpu will add more delays to the timers than intended
1203 * (as that cpu's timer base may not be up to date with respect to jiffies etc.)
1204 */
1205int get_nohz_timer_target(void)
1206{
1207 int cpu = smp_processor_id();
1208 int i;
1209 struct sched_domain *sd;
1210
1211 for_each_domain(cpu, sd) {
1212 for_each_cpu(i, sched_domain_span(sd))
1213 if (!idle_cpu(i))
1214 return i;
1215 }
1216 return cpu;
1217}
1218/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1219 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1220 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1221 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1255 smp_send_reschedule(cpu);
1233} 1256}
1234 1257
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1258#endif /* CONFIG_NO_HZ */
1246 1259
1247static u64 sched_avg_period(void) 1260static u64 sched_avg_period(void)
@@ -1281,6 +1294,10 @@ static void resched_task(struct task_struct *p)
1281static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1294static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1282{ 1295{
1283} 1296}
1297
1298static void sched_avg_update(struct rq *rq)
1299{
1300}
1284#endif /* CONFIG_SMP */ 1301#endif /* CONFIG_SMP */
1285 1302
1286#if BITS_PER_LONG == 32 1303#if BITS_PER_LONG == 32
@@ -1652,7 +1669,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1669 if (root_task_group_empty())
1653 return; 1670 return;
1654 1671
1655 now = cpu_clock(raw_smp_processor_id()); 1672 now = local_clock();
1656 elapsed = now - sd->last_update; 1673 elapsed = now - sd->last_update;
1657 1674
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1822,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1822static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1823static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1824static int get_update_sysctl_factor(void);
1825static void update_cpu_load(struct rq *this_rq);
1808 1826
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1827static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1828{
@@ -2267,11 +2285,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2285}
2268#endif 2286#endif
2269 2287
2270/*** 2288static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2289 bool is_sync, bool is_migrate, bool is_local,
2290 unsigned long en_flags)
2291{
2292 schedstat_inc(p, se.statistics.nr_wakeups);
2293 if (is_sync)
2294 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2295 if (is_migrate)
2296 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2297 if (is_local)
2298 schedstat_inc(p, se.statistics.nr_wakeups_local);
2299 else
2300 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2301
2302 activate_task(rq, p, en_flags);
2303}
2304
2305static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2306 int wake_flags, bool success)
2307{
2308 trace_sched_wakeup(p, success);
2309 check_preempt_curr(rq, p, wake_flags);
2310
2311 p->state = TASK_RUNNING;
2312#ifdef CONFIG_SMP
2313 if (p->sched_class->task_woken)
2314 p->sched_class->task_woken(rq, p);
2315
2316 if (unlikely(rq->idle_stamp)) {
2317 u64 delta = rq->clock - rq->idle_stamp;
2318 u64 max = 2*sysctl_sched_migration_cost;
2319
2320 if (delta > max)
2321 rq->avg_idle = max;
2322 else
2323 update_avg(&rq->avg_idle, delta);
2324 rq->idle_stamp = 0;
2325 }
2326#endif
2327 /* if a worker is waking up, notify workqueue */
2328 if ((p->flags & PF_WQ_WORKER) && success)
2329 wq_worker_waking_up(p, cpu_of(rq));
2330}
2331
2332/**
2271 * try_to_wake_up - wake up a thread 2333 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2334 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2335 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2336 * @wake_flags: wake modifier flags (WF_*)
2275 * 2337 *
2276 * Put it on the run-queue if it's not already there. The "current" 2338 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2339 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2341,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2341 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2342 * runnable without the overhead of this.
2281 * 2343 *
2282 * returns failure only if the task is already active. 2344 * Returns %true if @p was woken up, %false if it was already running
2345 * or @state didn't match @p's state.
2283 */ 2346 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2347static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2348 int wake_flags)
@@ -2359,38 +2422,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2422
2360out_activate: 2423out_activate:
2361#endif /* CONFIG_SMP */ 2424#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2425 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2426 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2427 success = 1;
2373
2374out_running: 2428out_running:
2375 trace_sched_wakeup(p, success); 2429 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2430out:
2395 task_rq_unlock(rq, &flags); 2431 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2432 put_cpu();
@@ -2399,6 +2435,37 @@ out:
2399} 2435}
2400 2436
2401/** 2437/**
2438 * try_to_wake_up_local - try to wake up a local task with rq lock held
2439 * @p: the thread to be awakened
2440 *
 2441 * Put @p on the run-queue if it's not already there. The caller must
2442 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2443 * the current task. this_rq() stays locked over invocation.
2444 */
2445static void try_to_wake_up_local(struct task_struct *p)
2446{
2447 struct rq *rq = task_rq(p);
2448 bool success = false;
2449
2450 BUG_ON(rq != this_rq());
2451 BUG_ON(p == current);
2452 lockdep_assert_held(&rq->lock);
2453
2454 if (!(p->state & TASK_NORMAL))
2455 return;
2456
2457 if (!p->se.on_rq) {
2458 if (likely(!task_running(rq, p))) {
2459 schedstat_inc(rq, ttwu_count);
2460 schedstat_inc(rq, ttwu_local);
2461 }
2462 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2463 success = true;
2464 }
2465 ttwu_post_activation(p, rq, 0, success);
2466}
2467
2468/**
2402 * wake_up_process - Wake up a specific process 2469 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2470 * @p: The process to be woken up.
2404 * 2471 *
@@ -3012,23 +3079,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3079}
3013 3080
3014/* 3081/*
3082 * The exact cpuload at various idx values, calculated at every tick would be
3083 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3084 *
3085 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3086 * on nth tick when cpu may be busy, then we have:
3087 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 3088 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3089 *
3090 * decay_load_missed() below does efficient calculation of
3091 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3092 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3093 *
3094 * The calculation is approximated on a 128 point scale.
3095 * degrade_zero_ticks is the number of ticks after which load at any
3096 * particular idx is approximated to be zero.
3097 * degrade_factor is a precomputed table, a row for each load idx.
3098 * Each column corresponds to degradation factor for a power of two ticks,
3099 * based on 128 point scale.
3100 * Example:
3101 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3102 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3103 *
3104 * With this power of 2 load factors, we can degrade the load n times
3105 * by looking at 1 bits in n and doing as many mult/shift instead of
3106 * n mult/shifts needed by the exact degradation.
3107 */
3108#define DEGRADE_SHIFT 7
3109static const unsigned char
3110 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3111static const unsigned char
3112 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3113 {0, 0, 0, 0, 0, 0, 0, 0},
3114 {64, 32, 8, 0, 0, 0, 0, 0},
3115 {96, 72, 40, 12, 1, 0, 0},
3116 {112, 98, 75, 43, 15, 1, 0},
3117 {120, 112, 98, 76, 45, 16, 2} };
3118
3119/*
3120 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3121 * would be when CPU is idle and so we just decay the old load without
3122 * adding any new load.
3123 */
3124static unsigned long
3125decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3126{
3127 int j = 0;
3128
3129 if (!missed_updates)
3130 return load;
3131
3132 if (missed_updates >= degrade_zero_ticks[idx])
3133 return 0;
3134
3135 if (idx == 1)
3136 return load >> missed_updates;
3137
3138 while (missed_updates) {
3139 if (missed_updates % 2)
3140 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3141
3142 missed_updates >>= 1;
3143 j++;
3144 }
3145 return load;
3146}
3147
3148/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3149 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3150 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3151 * every tick. We fix it up based on jiffies.
3017 */ 3152 */
3018static void update_cpu_load(struct rq *this_rq) 3153static void update_cpu_load(struct rq *this_rq)
3019{ 3154{
3020 unsigned long this_load = this_rq->load.weight; 3155 unsigned long this_load = this_rq->load.weight;
3156 unsigned long curr_jiffies = jiffies;
3157 unsigned long pending_updates;
3021 int i, scale; 3158 int i, scale;
3022 3159
3023 this_rq->nr_load_updates++; 3160 this_rq->nr_load_updates++;
3024 3161
3162 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3163 if (curr_jiffies == this_rq->last_load_update_tick)
3164 return;
3165
3166 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3167 this_rq->last_load_update_tick = curr_jiffies;
3168
3025 /* Update our load: */ 3169 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3170 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3171 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3172 unsigned long old_load, new_load;
3028 3173
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3174 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3175
3031 old_load = this_rq->cpu_load[i]; 3176 old_load = this_rq->cpu_load[i];
3177 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3178 new_load = this_load;
3033 /* 3179 /*
3034 * Round up the averaging division if load is increasing. This 3180 * Round up the averaging division if load is increasing. This
@@ -3036,10 +3182,18 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3182 * example.
3037 */ 3183 */
3038 if (new_load > old_load) 3184 if (new_load > old_load)
3039 new_load += scale-1; 3185 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3186
3187 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3188 }
3042 3189
3190 sched_avg_update(this_rq);
3191}
3192
3193static void update_cpu_load_active(struct rq *this_rq)
3194{
3195 update_cpu_load(this_rq);
3196
3043 calc_load_account_active(this_rq); 3197 calc_load_account_active(this_rq);
3044} 3198}
3045 3199
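[Editor's note: the degrade_factor table is compact enough to sanity-check outside the kernel. A small user-space re-derivation follows -- the table and bit-walking loop are copied from the hunk above, the exact-value comparison is only for illustration, and the degrade_zero_ticks early-out is omitted.]

#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT	7
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2} };

static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
	int j = 0;

	if (idx == 1)
		return load >> missed;

	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1024, missed = 8;
	int idx = 2;
	double exact = load * pow(3.0 / 4.0, (double)missed);	/* ((2^2-1)/2^2)^8 */

	/* Prints "table: 96, exact: 102.5" -- i.e. 12/128 vs 3^8/4^8 of the load. */
	printf("table: %lu, exact: %.1f\n", decay(load, missed, idx), exact);
	return 0;
}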
@@ -3359,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3359 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3513 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3360 3514
3361 if (total) { 3515 if (total) {
3362 u64 temp; 3516 u64 temp = rtime;
3363 3517
3364 temp = (u64)(rtime * utime); 3518 temp *= utime;
3365 do_div(temp, total); 3519 do_div(temp, total);
3366 utime = (cputime_t)temp; 3520 utime = (cputime_t)temp;
3367 } else 3521 } else
@@ -3392,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3392 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3546 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3393 3547
3394 if (total) { 3548 if (total) {
3395 u64 temp; 3549 u64 temp = rtime;
3396 3550
3397 temp = (u64)(rtime * cputime.utime); 3551 temp *= cputime.utime;
3398 do_div(temp, total); 3552 do_div(temp, total);
3399 utime = (cputime_t)temp; 3553 utime = (cputime_t)temp;
3400 } else 3554 } else
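[Editor's note: the 'temp = rtime; temp *= utime;' reordering above matters because the old '(u64)(rtime * utime)' multiplied in cputime_t width before widening, which can overflow on 32-bit. An illustrative user-space analogue; the types and numbers are made up and plain '/' stands in for do_div().]

#include <stdio.h>
#include <stdint.h>

/* Scale utime by rtime/total, widening to 64 bits *before* multiplying. */
static uint64_t scale_utime(uint32_t utime, uint32_t total, uint64_t rtime)
{
	uint64_t temp = rtime;

	temp *= utime;		/* 64-bit multiply, cannot truncate */
	return temp / total;
}

int main(void)
{
	/* In 32-bit arithmetic rtime * utime here would overflow (1.2e13 > 2^32). */
	printf("%llu\n",
	       (unsigned long long)scale_utime(3000000u, 5000000u, 4000000u));
	return 0;
}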
@@ -3426,7 +3580,7 @@ void scheduler_tick(void)
3426 3580
3427 raw_spin_lock(&rq->lock); 3581 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3582 update_rq_clock(rq);
3429 update_cpu_load(rq); 3583 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3584 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3585 raw_spin_unlock(&rq->lock);
3432 3586
@@ -3598,7 +3752,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3752 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3753 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3754 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3755
3603 release_kernel_lock(prev); 3756 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3757need_resched_nonpreemptible:
@@ -3611,11 +3764,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3764 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3765 clear_tsk_need_resched(prev);
3613 3766
3767 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3768 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3769 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3770 prev->state = TASK_RUNNING;
3617 else 3771 } else {
3772 /*
3773 * If a worker is going to sleep, notify and
3774 * ask workqueue whether it wants to wake up a
3775 * task to maintain concurrency. If so, wake
3776 * up the task.
3777 */
3778 if (prev->flags & PF_WQ_WORKER) {
3779 struct task_struct *to_wakeup;
3780
3781 to_wakeup = wq_worker_sleeping(prev, cpu);
3782 if (to_wakeup)
3783 try_to_wake_up_local(to_wakeup);
3784 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3785 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3786 }
3619 switch_count = &prev->nvcsw; 3787 switch_count = &prev->nvcsw;
3620 } 3788 }
3621 3789
@@ -3637,8 +3805,10 @@ need_resched_nonpreemptible:
3637 3805
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3806 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3807 /*
 3640 * the context switch might have flipped the stack from under 3808 * The context switch has flipped the stack from under us
3641 * us, hence refresh the local variables. 3809 * and restored the local variables which were saved when
3810 * this task called schedule() in the past. prev == current
3811 * is still correct, but it can be moved to another cpu/rq.
3642 */ 3812 */
3643 cpu = smp_processor_id(); 3813 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3814 rq = cpu_rq(cpu);
@@ -3647,11 +3817,8 @@ need_resched_nonpreemptible:
3647 3817
3648 post_schedule(rq); 3818 post_schedule(rq);
3649 3819
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3820 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3821 goto need_resched_nonpreemptible;
3654 }
3655 3822
3656 preempt_enable_no_resched(); 3823 preempt_enable_no_resched();
3657 if (need_resched()) 3824 if (need_resched())
@@ -3704,8 +3871,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3704 /* 3871 /*
3705 * Owner changed, break to re-assess state. 3872 * Owner changed, break to re-assess state.
3706 */ 3873 */
3707 if (lock->owner != owner) 3874 if (lock->owner != owner) {
3875 /*
3876 * If the lock has switched to a different owner,
3877 * we likely have heavy contention. Return 0 to quit
3878 * optimistic spinning and not contend further:
3879 */
3880 if (lock->owner)
3881 return 0;
3708 break; 3882 break;
3883 }
3709 3884
3710 /* 3885 /*
3711 * Is that owner really running on that cpu? 3886 * Is that owner really running on that cpu?
@@ -3726,7 +3901,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 3901 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 3902 * occur there and call schedule directly.
3728 */ 3903 */
3729asmlinkage void __sched preempt_schedule(void) 3904asmlinkage void __sched notrace preempt_schedule(void)
3730{ 3905{
3731 struct thread_info *ti = current_thread_info(); 3906 struct thread_info *ti = current_thread_info();
3732 3907
@@ -3738,9 +3913,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 3913 return;
3739 3914
3740 do { 3915 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 3916 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 3917 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 3918 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 3919
3745 /* 3920 /*
3746 * Check again in case we missed a preemption opportunity 3921 * Check again in case we missed a preemption opportunity
@@ -4441,12 +4616,8 @@ recheck:
4441 */ 4616 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4617 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4618 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4619 unsigned long rlim_rtprio =
4445 4620 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4621
4451 /* can't set/change the rt policy */ 4622 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4623 if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5987,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 5987 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 5988static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 5989 .notifier_call = migration_call,
5819 .priority = 10 5990 .priority = CPU_PRI_MIGRATION,
5820}; 5991};
5821 5992
5993static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5994 unsigned long action, void *hcpu)
5995{
5996 switch (action & ~CPU_TASKS_FROZEN) {
5997 case CPU_ONLINE:
5998 case CPU_DOWN_FAILED:
5999 set_cpu_active((long)hcpu, true);
6000 return NOTIFY_OK;
6001 default:
6002 return NOTIFY_DONE;
6003 }
6004}
6005
6006static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6007 unsigned long action, void *hcpu)
6008{
6009 switch (action & ~CPU_TASKS_FROZEN) {
6010 case CPU_DOWN_PREPARE:
6011 set_cpu_active((long)hcpu, false);
6012 return NOTIFY_OK;
6013 default:
6014 return NOTIFY_DONE;
6015 }
6016}
6017
5822static int __init migration_init(void) 6018static int __init migration_init(void)
5823{ 6019{
5824 void *cpu = (void *)(long)smp_processor_id(); 6020 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6021 int err;
5826 6022
5827 /* Start one for the boot CPU: */ 6023 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6024 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6025 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6026 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6027 register_cpu_notifier(&migration_notifier);
5832 6028
6029 /* Register cpu active notifiers */
6030 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6031 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6032
5833 return 0; 6033 return 0;
5834} 6034}
5835early_initcall(migration_init); 6035early_initcall(migration_init);
@@ -6064,23 +6264,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6264 free_rootdomain(old_rd);
6065} 6265}
6066 6266
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6267static int init_rootdomain(struct root_domain *rd)
6068{ 6268{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6269 memset(rd, 0, sizeof(*rd));
6072 6270
6073 if (bootmem) 6271 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6272 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6273 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6274 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6275 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6276 goto free_online;
6082 6277
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6278 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6279 goto free_rto_mask;
6085 return 0; 6280 return 0;
6086 6281
@@ -6096,7 +6291,7 @@ out:
6096 6291
6097static void init_defrootdomain(void) 6292static void init_defrootdomain(void)
6098{ 6293{
6099 init_rootdomain(&def_root_domain, true); 6294 init_rootdomain(&def_root_domain);
6100 6295
6101 atomic_set(&def_root_domain.refcount, 1); 6296 atomic_set(&def_root_domain.refcount, 1);
6102} 6297}
@@ -6109,7 +6304,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6304 if (!rd)
6110 return NULL; 6305 return NULL;
6111 6306
6112 if (init_rootdomain(rd, false) != 0) { 6307 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6308 kfree(rd);
6114 return NULL; 6309 return NULL;
6115 } 6310 }
@@ -7288,29 +7483,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7483}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7484#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7485
7291#ifndef CONFIG_CPUSETS
7292/* 7486/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7487 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7488 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7489 * around partition_sched_domains().
7295 */ 7490 */
7296static int update_sched_domains(struct notifier_block *nfb, 7491static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7492 void *hcpu)
7298{ 7493{
7299 switch (action) { 7494 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7495 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7496 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7497 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7498 return NOTIFY_OK;
7499 default:
7500 return NOTIFY_DONE;
7501 }
7502}
7308 7503
7504static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7505 void *hcpu)
7506{
7507 switch (action & ~CPU_TASKS_FROZEN) {
7508 case CPU_DOWN_PREPARE:
7509 cpuset_update_active_cpus();
7510 return NOTIFY_OK;
7309 default: 7511 default:
7310 return NOTIFY_DONE; 7512 return NOTIFY_DONE;
7311 } 7513 }
7312} 7514}
7313#endif
7314 7515
7315static int update_runtime(struct notifier_block *nfb, 7516static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7517 unsigned long action, void *hcpu)
@@ -7356,10 +7557,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7557 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7558 put_online_cpus();
7358 7559
7359#ifndef CONFIG_CPUSETS 7560 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7561 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7562
7364 /* RT runtime code needs to handle some hotplug events */ 7563 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7564 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7803,9 @@ void __init sched_init(void)
7604 7803
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7804 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 7805 rq->cpu_load[j] = 0;
7806
7807 rq->last_load_update_tick = jiffies;
7808
7607#ifdef CONFIG_SMP 7809#ifdef CONFIG_SMP
7608 rq->sd = NULL; 7810 rq->sd = NULL;
7609 rq->rd = NULL; 7811 rq->rd = NULL;
@@ -7617,6 +7819,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 7819 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 7820 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 7821 rq_attach_root(rq, &def_root_domain);
7822#ifdef CONFIG_NO_HZ
7823 rq->nohz_balance_kick = 0;
7824 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7825#endif
7620#endif 7826#endif
7621 init_rq_hrtick(rq); 7827 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 7828 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7867,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7867 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 7868#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 7869#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7870 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7871 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7872 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7873 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7874 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 7875#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 7876 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 7877 if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
 42 * - GTOD (clock monotonic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
 19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and to keep it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
 56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
 57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
 58 * and cause funny jumps in time -- although the filtering provided by
 59 * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
 60 * in general, since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
 61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
170 return val; 206 return val;
171} 207}
172 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
173u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
174{ 215{
175 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
237} 278}
238EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
239 280
240unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
241{ 292{
242 unsigned long long clock; 293 u64 clock;
243 unsigned long flags; 294 unsigned long flags;
244 295
245 local_irq_save(flags); 296 local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
249 return clock; 300 return clock;
250} 301}
251 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
 305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
252#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
253 323
254void sched_clock_init(void) 324void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
264 return sched_clock(); 334 return sched_clock();
265} 335}
266 336
267 337u64 cpu_clock(int cpu)
268unsigned long long cpu_clock(int cpu)
269{ 338{
270 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
271} 340}
272 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
273#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
274 348
275EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
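[Editor's note: a minimal, hedged usage sketch for the newly exported local_clock(). Kernel-side code; the op callback is hypothetical and must not sleep while preemption is disabled.]

#include <linux/sched.h>
#include <linux/preempt.h>
#include <linux/kernel.h>

/* Time a short, non-sleeping operation. Keeping preemption disabled pins us
 * to one CPU, so both timestamps come from the same cpu and the delta is
 * meaningful despite the cross-CPU drift warned about above. */
static u64 time_short_op(void (*op)(void))
{
	u64 t0, t1;

	preempt_disable();
	t0 = local_clock();
	op();				/* must not sleep here */
	t1 = local_clock();
	preempt_enable();

	return t1 - t0;			/* nanoseconds */
}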
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
332 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
335 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
336 P(sysctl_sched_features); 336 P(sysctl_sched_features);
337#undef PN 337#undef PN
338#undef P 338#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..db3f674ca49d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -54,13 +54,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 2000000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 3; 63static unsigned int sched_nr_latency = 8;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -1313,7 +1313,7 @@ static struct sched_group *
1313find_idlest_group(struct sched_domain *sd, struct task_struct *p, 1313find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1314 int this_cpu, int load_idx) 1314 int this_cpu, int load_idx)
1315{ 1315{
1316 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1316 struct sched_group *idlest = NULL, *group = sd->groups;
1317 unsigned long min_load = ULONG_MAX, this_load = 0; 1317 unsigned long min_load = ULONG_MAX, this_load = 0;
1318 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1318 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1319 1319
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1348 1348
1349 if (local_group) { 1349 if (local_group) {
1350 this_load = avg_load; 1350 this_load = avg_load;
1351 this = group;
1352 } else if (avg_load < min_load) { 1351 } else if (avg_load < min_load) {
1353 min_load = avg_load; 1352 min_load = avg_load;
1354 idlest = group; 1353 idlest = group;
@@ -2268,8 +2267,6 @@ unsigned long scale_rt_power(int cpu)
2268 struct rq *rq = cpu_rq(cpu); 2267 struct rq *rq = cpu_rq(cpu);
2269 u64 total, available; 2268 u64 total, available;
2270 2269
2271 sched_avg_update(rq);
2272
2273 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2270 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2274 available = total - rq->rt_avg; 2271 available = total - rq->rt_avg;
2275 2272
@@ -2287,13 +2284,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2287 unsigned long power = SCHED_LOAD_SCALE; 2284 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups; 2285 struct sched_group *sdg = sd->groups;
2289 2286
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2287 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER)) 2288 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu); 2289 power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2293,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2303 power >>= SCHED_LOAD_SHIFT; 2293 power >>= SCHED_LOAD_SHIFT;
2304 } 2294 }
2305 2295
2296 sdg->cpu_power_orig = power;
2297
2298 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_freq_power(sd, cpu);
2300 else
2301 power *= default_scale_freq_power(sd, cpu);
2302
2303 power >>= SCHED_LOAD_SHIFT;
2304
2306 power *= scale_rt_power(cpu); 2305 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT; 2306 power >>= SCHED_LOAD_SHIFT;
2308 2307
@@ -2335,6 +2334,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2335 sdg->cpu_power = power; 2334 sdg->cpu_power = power;
2336} 2335}
2337 2336
2337/*
2338 * Try and fix up capacity for tiny siblings, this is needed when
2339 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2340 * which on its own isn't powerful enough.
2341 *
2342 * See update_sd_pick_busiest() and check_asym_packing().
2343 */
2344static inline int
2345fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2346{
2347 /*
2348 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2349 */
2350 if (sd->level != SD_LV_SIBLING)
2351 return 0;
2352
2353 /*
2354 * If ~90% of the cpu_power is still there, we're good.
2355 */
2356 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2357 return 1;
2358
2359 return 0;
2360}
2361
2338/** 2362/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2363 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated. 2364 * @sd: The sched_domain whose statistics are to be updated.
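[Editor's note: the 'power * 32 > power_orig * 29' test in fix_small_capacity() above is an integer-only way of asking whether at least ~90% of the original cpu_power survived the RT/frequency scaling (29/32 ≈ 90.6%). A throwaway user-space check with made-up values:]

#include <stdio.h>

int main(void)
{
	unsigned long power_orig = 1024;	/* cpu_power_orig, before scaling */
	unsigned long power = 940;		/* cpu_power after RT/freq scaling */

	/* True iff power / power_orig > 29/32 (~90.6%), without any division. */
	printf("%.1f%% of original power -> %s\n",
	       100.0 * power / power_orig,
	       power * 32 > power_orig * 29 ? "keep capacity 1" : "capacity 0");
	return 0;
}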
@@ -2400,14 +2424,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2400 * domains. In the newly idle case, we will allow all the cpu's 2424 * domains. In the newly idle case, we will allow all the cpu's
2401 * to do the newly idle load balance. 2425 * to do the newly idle load balance.
2402 */ 2426 */
2403 if (idle != CPU_NEWLY_IDLE && local_group && 2427 if (idle != CPU_NEWLY_IDLE && local_group) {
2404 balance_cpu != this_cpu) { 2428 if (balance_cpu != this_cpu) {
2405 *balance = 0; 2429 *balance = 0;
2406 return; 2430 return;
2431 }
2432 update_group_power(sd, this_cpu);
2407 } 2433 }
2408 2434
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */ 2435 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2436 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413 2437
@@ -2428,6 +2452,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2428 2452
2429 sgs->group_capacity = 2453 sgs->group_capacity =
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2455 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group);
2457}
2458
2459/**
2460 * update_sd_pick_busiest - return 1 on busiest group
2461 * @sd: sched_domain whose statistics are to be checked
2462 * @sds: sched_domain statistics
2463 * @sg: sched_group candidate to be checked for being the busiest
2464 * @sgs: sched_group statistics
2465 * @this_cpu: the current cpu
2466 *
2467 * Determine if @sg is a busier group than the previously selected
2468 * busiest group.
2469 */
2470static bool update_sd_pick_busiest(struct sched_domain *sd,
2471 struct sd_lb_stats *sds,
2472 struct sched_group *sg,
2473 struct sg_lb_stats *sgs,
2474 int this_cpu)
2475{
2476 if (sgs->avg_load <= sds->max_load)
2477 return false;
2478
2479 if (sgs->sum_nr_running > sgs->group_capacity)
2480 return true;
2481
2482 if (sgs->group_imb)
2483 return true;
2484
2485 /*
2486 * ASYM_PACKING needs to move all the work to the lowest
2487 * numbered CPUs in the group, therefore mark all groups
2488 * higher than ourself as busy.
2489 */
2490 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2491 this_cpu < group_first_cpu(sg)) {
2492 if (!sds->busiest)
2493 return true;
2494
2495 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2496 return true;
2497 }
2498
2499 return false;
2431} 2500}
2432 2501
2433/** 2502/**
@@ -2435,7 +2504,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2435 * @sd: sched_domain whose statistics are to be updated. 2504 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed. 2505 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu 2506 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group. 2507 * @sd_idle: Idle status of the sched_domain containing sg.
2439 * @cpus: Set of cpus considered for load balancing. 2508 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance. 2509 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain. 2510 * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2446 struct sd_lb_stats *sds) 2515 struct sd_lb_stats *sds)
2447{ 2516{
2448 struct sched_domain *child = sd->child; 2517 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups; 2518 struct sched_group *sg = sd->groups;
2450 struct sg_lb_stats sgs; 2519 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0; 2520 int load_idx, prefer_sibling = 0;
2452 2521
@@ -2459,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2459 do { 2528 do {
2460 int local_group; 2529 int local_group;
2461 2530
2462 local_group = cpumask_test_cpu(this_cpu, 2531 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs)); 2532 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2533 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs); 2534 local_group, cpus, balance, &sgs);
2467 2535
2468 if (local_group && !(*balance)) 2536 if (local_group && !(*balance))
2469 return; 2537 return;
2470 2538
2471 sds->total_load += sgs.group_load; 2539 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power; 2540 sds->total_pwr += sg->cpu_power;
2473 2541
2474 /* 2542 /*
2475 * In case the child domain prefers tasks go to siblings 2543 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try 2544 * first, lower the sg capacity to one so that we'll try
2477 * and move all the excess tasks away. 2545 * and move all the excess tasks away.
2478 */ 2546 */
2479 if (prefer_sibling) 2547 if (prefer_sibling)
@@ -2481,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2481 2549
2482 if (local_group) { 2550 if (local_group) {
2483 sds->this_load = sgs.avg_load; 2551 sds->this_load = sgs.avg_load;
2484 sds->this = group; 2552 sds->this = sg;
2485 sds->this_nr_running = sgs.sum_nr_running; 2553 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load; 2554 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load && 2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2488 (sgs.sum_nr_running > sgs.group_capacity ||
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load; 2556 sds->max_load = sgs.avg_load;
2491 sds->busiest = group; 2557 sds->busiest = sg;
2492 sds->busiest_nr_running = sgs.sum_nr_running; 2558 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity; 2559 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load; 2560 sds->busiest_load_per_task = sgs.sum_weighted_load;
2495 sds->group_imb = sgs.group_imb; 2561 sds->group_imb = sgs.group_imb;
2496 } 2562 }
2497 2563
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2564 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2499 group = group->next; 2565 sg = sg->next;
2500 } while (group != sd->groups); 2566 } while (sg != sd->groups);
2567}
2568
2569int __weak arch_sd_sibling_asym_packing(void)
2570{
2571 return 0*SD_ASYM_PACKING;
2572}
2573
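The 0*SD_ASYM_PACKING return above is just a type-checked zero, so packing stays off unless an architecture overrides the weak hook. A minimal sketch of such an override, purely hypothetical and not part of this diff (arch_prefers_low_smt_threads() is an invented predicate):

	int arch_sd_sibling_asym_packing(void)
	{
		/* Hypothetical: only request packing when the hardware prefers
		 * its low-numbered SMT threads to be kept busy. */
		if (arch_prefers_low_smt_threads())
			return SD_ASYM_PACKING;
		return 0;
	}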
2574/**
2575 * check_asym_packing - Check to see if the group is packed into the
 2576 * sched domain.
2577 *
 2578 * This is primarily intended to be used at the sibling level. Some
2579 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2580 * case of POWER7, it can move to lower SMT modes only when higher
2581 * threads are idle. When in lower SMT modes, the threads will
 2582 * perform better since they share fewer core resources. Hence when we
2583 * have idle threads, we want them to be the higher ones.
2584 *
2585 * This packing function is run on idle threads. It checks to see if
2586 * the busiest CPU in this domain (core in the P7 case) has a higher
2587 * CPU number than the packing function is being run on. Here we are
2588 * assuming lower CPU number will be equivalent to lower a SMT thread
 2589 * assuming a lower CPU number will be equivalent to a lower SMT thread
2590 *
2591 * Returns 1 when packing is required and a task should be moved to
2592 * this CPU. The amount of the imbalance is returned in *imbalance.
2593 *
2594 * @sd: The sched_domain whose packing is to be checked.
2595 * @sds: Statistics of the sched_domain which is to be packed
2596 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 2597 * @imbalance: returns the amount of imbalance due to packing.
2598 */
2599static int check_asym_packing(struct sched_domain *sd,
2600 struct sd_lb_stats *sds,
2601 int this_cpu, unsigned long *imbalance)
2602{
2603 int busiest_cpu;
2604
2605 if (!(sd->flags & SD_ASYM_PACKING))
2606 return 0;
2607
2608 if (!sds->busiest)
2609 return 0;
2610
2611 busiest_cpu = group_first_cpu(sds->busiest);
2612 if (this_cpu > busiest_cpu)
2613 return 0;
2614
2615 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2616 SCHED_LOAD_SCALE);
2617 return 1;
2501} 2618}
2502 2619
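A rough worked example of the imbalance set above (numbers assumed, not from the patch): with sds->max_load of 1536 and a busiest-group cpu_power of 512, DIV_ROUND_CLOSEST(1536 * 512, 1024) gives 768. Since avg_load was normalised by cpu_power earlier, this multiplication simply converts the power-scaled load back into raw weighted load before it is handed to the task-moving code.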
2503/** 2620/**
@@ -2692,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2692 if (!(*balance)) 2809 if (!(*balance))
2693 goto ret; 2810 goto ret;
2694 2811
2812 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2813 check_asym_packing(sd, &sds, this_cpu, imbalance))
2814 return sds.busiest;
2815
2695 if (!sds.busiest || sds.busiest_nr_running == 0) 2816 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced; 2817 goto out_balanced;
2697 2818
@@ -2726,8 +2847,9 @@ ret:
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2847 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */ 2848 */
2728static struct rq * 2849static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2850find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2730 unsigned long imbalance, const struct cpumask *cpus) 2851 enum cpu_idle_type idle, unsigned long imbalance,
2852 const struct cpumask *cpus)
2731{ 2853{
2732 struct rq *busiest = NULL, *rq; 2854 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0; 2855 unsigned long max_load = 0;
@@ -2738,6 +2860,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2860 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl; 2861 unsigned long wl;
2740 2862
2863 if (!capacity)
2864 capacity = fix_small_capacity(sd, group);
2865
2741 if (!cpumask_test_cpu(i, cpus)) 2866 if (!cpumask_test_cpu(i, cpus))
2742 continue; 2867 continue;
2743 2868
@@ -2777,9 +2902,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2777/* Working cpumask for load_balance and load_balance_newidle. */ 2902/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2903static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779 2904
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2905static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2906 int busiest_cpu, int this_cpu)
2781{ 2907{
2782 if (idle == CPU_NEWLY_IDLE) { 2908 if (idle == CPU_NEWLY_IDLE) {
2909
2910 /*
2911 * ASYM_PACKING needs to force migrate tasks from busy but
2912 * higher numbered CPUs in order to pack all tasks in the
2913 * lowest numbered CPUs.
2914 */
2915 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2916 return 1;
2917
2783 /* 2918 /*
2784 * The only task running in a non-idle cpu can be moved to this 2919 * The only task running in a non-idle cpu can be moved to this
2785 * cpu in an attempt to completely freeup the other CPU 2920 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2989,7 @@ redo:
2854 goto out_balanced; 2989 goto out_balanced;
2855 } 2990 }
2856 2991
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus); 2992 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2858 if (!busiest) { 2993 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]); 2994 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced; 2995 goto out_balanced;
@@ -2898,7 +3033,8 @@ redo:
2898 schedstat_inc(sd, lb_failed[idle]); 3033 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++; 3034 sd->nr_balance_failed++;
2900 3035
2901 if (need_active_balance(sd, sd_idle, idle)) { 3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3037 this_cpu)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags); 3038 raw_spin_lock_irqsave(&busiest->lock, flags);
2903 3039
2904 /* don't kick the active_load_balance_cpu_stop, 3040 /* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3229,40 @@ out_unlock:
3093} 3229}
3094 3230
3095#ifdef CONFIG_NO_HZ 3231#ifdef CONFIG_NO_HZ
3232
3233static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3234
3235static void trigger_sched_softirq(void *data)
3236{
3237 raise_softirq_irqoff(SCHED_SOFTIRQ);
3238}
3239
3240static inline void init_sched_softirq_csd(struct call_single_data *csd)
3241{
3242 csd->func = trigger_sched_softirq;
3243 csd->info = NULL;
3244 csd->flags = 0;
3245 csd->priv = 0;
3246}
3247
3248/*
3249 * idle load balancing details
3250 * - One of the idle CPUs nominates itself as idle load_balancer, while
3251 * entering idle.
3252 * - This idle load balancer CPU will also go into tickless mode when
3253 * it is idle, just like all other idle CPUs
 3254 * - When one of the busy CPUs notices that idle rebalancing may be
 3255 * needed, it will kick the idle load balancer, which then does idle
3256 * load balancing for all the idle CPUs.
3257 */
3096static struct { 3258static struct {
3097 atomic_t load_balancer; 3259 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask; 3260 atomic_t first_pick_cpu;
3099 cpumask_var_t ilb_grp_nohz_mask; 3261 atomic_t second_pick_cpu;
3100} nohz ____cacheline_aligned = { 3262 cpumask_var_t idle_cpus_mask;
3101 .load_balancer = ATOMIC_INIT(-1), 3263 cpumask_var_t grp_idle_mask;
3102}; 3264 unsigned long next_balance; /* in jiffy units */
3265} nohz ____cacheline_aligned;
3103 3266
3104int get_nohz_load_balancer(void) 3267int get_nohz_load_balancer(void)
3105{ 3268{
@@ -3153,17 +3316,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3153 */ 3316 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group) 3317static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{ 3318{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3319 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3157 sched_group_cpus(ilb_group)); 3320 sched_group_cpus(ilb_group));
3158 3321
3159 /* 3322 /*
 3160 * A sched_group is semi-idle when it has at least one busy cpu 3323 * A sched_group is semi-idle when it has at least one busy cpu
 3161 * and at least one idle cpu. 3324 * and at least one idle cpu.
3162 */ 3325 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3326 if (cpumask_empty(nohz.grp_idle_mask))
3164 return 0; 3327 return 0;
3165 3328
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3329 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3167 return 0; 3330 return 0;
3168 3331
3169 return 1; 3332 return 1;
@@ -3196,7 +3359,7 @@ static int find_new_ilb(int cpu)
3196 * Optimize for the case when we have no idle CPUs or only one 3359 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3360 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */ 3361 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2) 3362 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3200 goto out_done; 3363 goto out_done;
3201 3364
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3365 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3367,7 @@ static int find_new_ilb(int cpu)
3204 3367
3205 do { 3368 do {
3206 if (is_semi_idle_group(ilb_group)) 3369 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask); 3370 return cpumask_first(nohz.grp_idle_mask);
3208 3371
3209 ilb_group = ilb_group->next; 3372 ilb_group = ilb_group->next;
3210 3373
@@ -3212,98 +3375,116 @@ static int find_new_ilb(int cpu)
3212 } 3375 }
3213 3376
3214out_done: 3377out_done:
3215 return cpumask_first(nohz.cpu_mask); 3378 return nr_cpu_ids;
3216} 3379}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3380#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu) 3381static inline int find_new_ilb(int call_cpu)
3219{ 3382{
3220 return cpumask_first(nohz.cpu_mask); 3383 return nr_cpu_ids;
3221} 3384}
3222#endif 3385#endif
3223 3386
3224/* 3387/*
3388 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 3389 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
3390 * CPU (if there is one).
3391 */
3392static void nohz_balancer_kick(int cpu)
3393{
3394 int ilb_cpu;
3395
3396 nohz.next_balance++;
3397
3398 ilb_cpu = get_nohz_load_balancer();
3399
3400 if (ilb_cpu >= nr_cpu_ids) {
3401 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3402 if (ilb_cpu >= nr_cpu_ids)
3403 return;
3404 }
3405
3406 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3407 struct call_single_data *cp;
3408
3409 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3410 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3411 __smp_call_function_single(ilb_cpu, cp, 0);
3412 }
3413 return;
3414}
3415
3416/*
3225 * This routine will try to nominate the ilb (idle load balancing) 3417 * This routine will try to nominate the ilb (idle load balancing)
3226 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3418 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system 3419 * load balancing on behalf of all those cpus.
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep till the next wakeup event
3230 * arrives...
3231 *
3232 * For the ilb owner, tick is not stopped. And this tick will be used
3233 * for idle load balancing. ilb owner will still be part of
3234 * nohz.cpu_mask..
3235 * 3420 *
3236 * While stopping the tick, this cpu will become the ilb owner if there 3421 * When the ilb owner becomes busy, we will not have new ilb owner until some
3237 * is no other owner. And will be the owner till that cpu becomes busy 3422 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3238 * or if all cpus in the system stop their ticks at which point 3423 * idle load balancing by kicking one of the idle CPUs.
3239 * there is no need for ilb owner.
3240 * 3424 *
3241 * When the ilb owner becomes busy, it nominates another owner, during the 3425 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
3242 * next busy scheduler_tick() 3426 * ilb owner CPU in future (when there is a need for idle load balancing on
3427 * behalf of all idle CPUs).
3243 */ 3428 */
3244int select_nohz_load_balancer(int stop_tick) 3429void select_nohz_load_balancer(int stop_tick)
3245{ 3430{
3246 int cpu = smp_processor_id(); 3431 int cpu = smp_processor_id();
3247 3432
3248 if (stop_tick) { 3433 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) { 3434 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu) 3435 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0; 3436 return;
3254 3437
3255 /* 3438 /*
3256 * If we are going offline and still the leader, 3439 * If we are going offline and still the leader,
3257 * give up! 3440 * give up!
3258 */ 3441 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3442 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3443 nr_cpu_ids) != cpu)
3260 BUG(); 3444 BUG();
3261 3445
3262 return 0; 3446 return;
3263 } 3447 }
3264 3448
3265 cpumask_set_cpu(cpu, nohz.cpu_mask); 3449 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3266 3450
3267 /* time for ilb owner also to sleep */ 3451 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3452 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3269 if (atomic_read(&nohz.load_balancer) == cpu) 3453 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3270 atomic_set(&nohz.load_balancer, -1); 3454 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3271 return 0;
3272 }
3273 3455
3274 if (atomic_read(&nohz.load_balancer) == -1) { 3456 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb; 3457 int new_ilb;
3280 3458
3281 if (!(sched_smt_power_savings || 3459 /* make me the ilb owner */
3282 sched_mc_power_savings)) 3460 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3283 return 1; 3461 cpu) != nr_cpu_ids)
3462 return;
3463
3284 /* 3464 /*
3285 * Check to see if there is a more power-efficient 3465 * Check to see if there is a more power-efficient
3286 * ilb. 3466 * ilb.
3287 */ 3467 */
3288 new_ilb = find_new_ilb(cpu); 3468 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3469 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1); 3470 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3291 resched_cpu(new_ilb); 3471 resched_cpu(new_ilb);
3292 return 0; 3472 return;
3293 } 3473 }
3294 return 1; 3474 return;
3295 } 3475 }
3296 } else { 3476 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3477 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3298 return 0; 3478 return;
3299 3479
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3480 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3301 3481
3302 if (atomic_read(&nohz.load_balancer) == cpu) 3482 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3483 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3484 nr_cpu_ids) != cpu)
3304 BUG(); 3485 BUG();
3305 } 3486 }
3306 return 0; 3487 return;
3307} 3488}
3308#endif 3489#endif
3309 3490
@@ -3385,11 +3566,102 @@ out:
3385 rq->next_balance = next_balance; 3566 rq->next_balance = next_balance;
3386} 3567}
3387 3568
3569#ifdef CONFIG_NO_HZ
3388/* 3570/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick. 3571 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3390 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3572 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */ 3573 */
3574static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3575{
3576 struct rq *this_rq = cpu_rq(this_cpu);
3577 struct rq *rq;
3578 int balance_cpu;
3579
3580 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3581 return;
3582
3583 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3584 if (balance_cpu == this_cpu)
3585 continue;
3586
3587 /*
3588 * If this cpu gets work to do, stop the load balancing
3589 * work being done for other cpus. Next load
3590 * balancing owner will pick it up.
3591 */
3592 if (need_resched()) {
3593 this_rq->nohz_balance_kick = 0;
3594 break;
3595 }
3596
3597 raw_spin_lock_irq(&this_rq->lock);
3598 update_rq_clock(this_rq);
3599 update_cpu_load(this_rq);
3600 raw_spin_unlock_irq(&this_rq->lock);
3601
3602 rebalance_domains(balance_cpu, CPU_IDLE);
3603
3604 rq = cpu_rq(balance_cpu);
3605 if (time_after(this_rq->next_balance, rq->next_balance))
3606 this_rq->next_balance = rq->next_balance;
3607 }
3608 nohz.next_balance = this_rq->next_balance;
3609 this_rq->nohz_balance_kick = 0;
3610}
3611
3612/*
3613 * Current heuristic for kicking the idle load balancer
 3614 * - first_pick_cpu is one of the busy CPUs. It will kick
 3615 * the idle load balancer when it has more than one process active. This
3616 * eliminates the need for idle load balancing altogether when we have
3617 * only one running process in the system (common case).
 3618 * - If there is more than one busy CPU, the idle load balancer may have
3619 * to run for active_load_balance to happen (i.e., two busy CPUs are
3620 * SMT or core siblings and can run better if they move to different
3621 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
3622 * which will kick idle load balancer as soon as it has any load.
3623 */
3624static inline int nohz_kick_needed(struct rq *rq, int cpu)
3625{
3626 unsigned long now = jiffies;
3627 int ret;
3628 int first_pick_cpu, second_pick_cpu;
3629
3630 if (time_before(now, nohz.next_balance))
3631 return 0;
3632
3633 if (rq->idle_at_tick)
3634 return 0;
3635
3636 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3637 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3638
3639 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3640 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3641 return 0;
3642
3643 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3644 if (ret == nr_cpu_ids || ret == cpu) {
3645 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3646 if (rq->nr_running > 1)
3647 return 1;
3648 } else {
3649 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3650 if (ret == nr_cpu_ids || ret == cpu) {
3651 if (rq->nr_running)
3652 return 1;
3653 }
3654 }
3655 return 0;
3656}
3657#else
3658static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3659#endif
3660
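A brief walk-through of the first_pick/second_pick protocol above, on an assumed four-CPU box: CPU0 gets work, wins atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, 0) at its tick and, once it has more than one runnable task, returns 1 so the ilb gets kicked. If CPU2 then becomes busy as well, its cmpxchg on first_pick_cpu fails (the slot already holds 0), so it claims second_pick_cpu instead and asks for a kick as soon as it has any load at all, which covers the SMT/core-sibling case described in the comment.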
3661/*
3662 * run_rebalance_domains is triggered when needed from the scheduler tick.
 3663 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
3664 */
3393static void run_rebalance_domains(struct softirq_action *h) 3665static void run_rebalance_domains(struct softirq_action *h)
3394{ 3666{
3395 int this_cpu = smp_processor_id(); 3667 int this_cpu = smp_processor_id();
@@ -3399,37 +3671,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3399 3671
3400 rebalance_domains(this_cpu, idle); 3672 rebalance_domains(this_cpu, idle);
3401 3673
3402#ifdef CONFIG_NO_HZ
3403 /* 3674 /*
3404 * If this cpu is the owner for idle load balancing, then do the 3675 * If this cpu has a pending nohz_balance_kick, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are 3676 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped. 3677 * stopped.
3407 */ 3678 */
3408 if (this_rq->idle_at_tick && 3679 nohz_idle_balance(this_cpu, idle);
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433} 3680}
3434 3681
3435static inline int on_null_domain(int cpu) 3682static inline int on_null_domain(int cpu)
@@ -3439,57 +3686,17 @@ static inline int on_null_domain(int cpu)
3439 3686
3440/* 3687/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3688 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */ 3689 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu) 3690static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{ 3691{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452 * scheduler tick, then check if we need to nominate new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483 * someone else, then no need raise the SCHED_SOFTIRQ
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */ 3692 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) && 3693 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu))) 3694 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ); 3695 raise_softirq(SCHED_SOFTIRQ);
3696#ifdef CONFIG_NO_HZ
3697 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3698 nohz_balancer_kick(cpu);
3699#endif
3493} 3700}
3494 3701
3495static void rq_online_fair(struct rq *rq) 3702static void rq_online_fair(struct rq *rq)
@@ -3542,6 +3749,8 @@ static void task_fork_fair(struct task_struct *p)
3542 3749
3543 raw_spin_lock_irqsave(&rq->lock, flags); 3750 raw_spin_lock_irqsave(&rq->lock, flags);
3544 3751
3752 update_rq_clock(rq);
3753
3545 if (unlikely(task_cpu(p) != this_cpu)) 3754 if (unlikely(task_cpu(p) != this_cpu))
3546 __set_task_cpu(p, this_cpu); 3755 __set_task_cpu(p, this_cpu);
3547 3756
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..d10c80ebb67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1663{ 1663{
1664 unsigned long soft, hard; 1664 unsigned long soft, hard;
1665 1665
1666 if (!p->signal)
1667 return;
1668
1669 /* max may change after cur was read, this will be fixed next tick */ 1666 /* max may change after cur was read, this will be fixed next tick */
1670 soft = task_rlimit(p, RLIMIT_RTTIME); 1667 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1668 hard = task_rlimit_max(p, RLIMIT_RTTIME);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
1/* Slow work debugging
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/slow-work.h>
14#include <linux/fs.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include "slow-work.h"
18
19#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
20#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
21#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
22
23void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
24{
25 seq_puts(m, "Slow-work: New thread");
26}
27
28/*
29 * Render the time mark field on a work item into a 5-char time with units plus
30 * a space
31 */
32static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
33{
34 struct timespec now, diff;
35
36 now = CURRENT_TIME;
37 diff = timespec_sub(now, work->mark);
38
39 if (diff.tv_sec < 0)
40 seq_puts(m, " -ve ");
41 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
42 seq_printf(m, "%3luns ", diff.tv_nsec);
43 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
44 seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
45 else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
46 seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
47 else if (diff.tv_sec <= 1)
48 seq_puts(m, " 1s ");
49 else if (diff.tv_sec < 60)
50 seq_printf(m, "%4lus ", diff.tv_sec);
51 else if (diff.tv_sec < 60 * 60)
52 seq_printf(m, "%4lum ", diff.tv_sec / 60);
53 else if (diff.tv_sec < 60 * 60 * 24)
54 seq_printf(m, "%4luh ", diff.tv_sec / 3600);
55 else
56 seq_puts(m, "exces ");
57}
58
59/*
60 * Describe a slow work item for debugfs
61 */
62static int slow_work_runqueue_show(struct seq_file *m, void *v)
63{
64 struct slow_work *work;
65 struct list_head *p = v;
66 unsigned long id;
67
68 switch ((unsigned long) v) {
69 case 1:
70 seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
71 return 0;
72 case 2:
73 seq_puts(m, "=== ===== ================ == ===== ==========\n");
74 return 0;
75
76 case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
77 id = (unsigned long) v - 3;
78
79 read_lock(&slow_work_execs_lock);
80 work = slow_work_execs[id];
81 if (work) {
82 smp_read_barrier_depends();
83
84 seq_printf(m, "%3lu %5d %16p %2lx ",
85 id, slow_work_pids[id], work, work->flags);
86 slow_work_print_mark(m, work);
87
88 if (work->ops->desc)
89 work->ops->desc(work, m);
90 seq_putc(m, '\n');
91 }
92 read_unlock(&slow_work_execs_lock);
93 return 0;
94
95 default:
96 work = list_entry(p, struct slow_work, link);
97 seq_printf(m, "%3s - %16p %2lx ",
98 work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
99 work, work->flags);
100 slow_work_print_mark(m, work);
101
102 if (work->ops->desc)
103 work->ops->desc(work, m);
104 seq_putc(m, '\n');
105 return 0;
106 }
107}
108
109/*
110 * map the iterator to a work item
111 */
112static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
113{
114 struct list_head *p;
115 unsigned long count, id;
116
117 switch (*_pos >> ITERATOR_SHIFT) {
118 case 0x0:
119 if (*_pos == 0)
120 *_pos = 1;
121 if (*_pos < 3)
122 return (void *)(unsigned long) *_pos;
123 if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
124 for (id = *_pos - 3;
125 id < SLOW_WORK_THREAD_LIMIT;
126 id++, (*_pos)++)
127 if (slow_work_execs[id])
128 return (void *)(unsigned long) *_pos;
129 *_pos = 0x1UL << ITERATOR_SHIFT;
130
131 case 0x1:
132 count = *_pos & ITERATOR_COUNTER;
133 list_for_each(p, &slow_work_queue) {
134 if (count == 0)
135 return p;
136 count--;
137 }
138 *_pos = 0x2UL << ITERATOR_SHIFT;
139
140 case 0x2:
141 count = *_pos & ITERATOR_COUNTER;
142 list_for_each(p, &vslow_work_queue) {
143 if (count == 0)
144 return p;
145 count--;
146 }
147 *_pos = 0x3UL << ITERATOR_SHIFT;
148
149 default:
150 return NULL;
151 }
152}
153
154/*
155 * set up the iterator to start reading from the first line
156 */
157static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
158{
159 spin_lock_irq(&slow_work_queue_lock);
160 return slow_work_runqueue_index(m, _pos);
161}
162
163/*
164 * move to the next line
165 */
166static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
167{
168 struct list_head *p = v;
169 unsigned long selector = *_pos >> ITERATOR_SHIFT;
170
171 (*_pos)++;
172 switch (selector) {
173 case 0x0:
174 return slow_work_runqueue_index(m, _pos);
175
176 case 0x1:
177 if (*_pos >> ITERATOR_SHIFT == 0x1) {
178 p = p->next;
179 if (p != &slow_work_queue)
180 return p;
181 }
182 *_pos = 0x2UL << ITERATOR_SHIFT;
183 p = &vslow_work_queue;
184
185 case 0x2:
186 if (*_pos >> ITERATOR_SHIFT == 0x2) {
187 p = p->next;
188 if (p != &vslow_work_queue)
189 return p;
190 }
191 *_pos = 0x3UL << ITERATOR_SHIFT;
192
193 default:
194 return NULL;
195 }
196}
197
198/*
199 * clean up after reading
200 */
201static void slow_work_runqueue_stop(struct seq_file *m, void *v)
202{
203 spin_unlock_irq(&slow_work_queue_lock);
204}
205
206static const struct seq_operations slow_work_runqueue_ops = {
207 .start = slow_work_runqueue_start,
208 .stop = slow_work_runqueue_stop,
209 .next = slow_work_runqueue_next,
210 .show = slow_work_runqueue_show,
211};
212
213/*
214 * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
215 */
216static int slow_work_runqueue_open(struct inode *inode, struct file *file)
217{
218 return seq_open(file, &slow_work_runqueue_ops);
219}
220
221const struct file_operations slow_work_runqueue_fops = {
222 .owner = THIS_MODULE,
223 .open = slow_work_runqueue_open,
224 .read = seq_read,
225 .llseek = seq_lseek,
226 .release = seq_release,
227};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19#include <linux/debugfs.h>
20#include "slow-work.h"
21
22static void slow_work_cull_timeout(unsigned long);
23static void slow_work_oom_timeout(unsigned long);
24
25#ifdef CONFIG_SYSCTL
26static int slow_work_min_threads_sysctl(struct ctl_table *, int,
27 void __user *, size_t *, loff_t *);
28
29static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
30 void __user *, size_t *, loff_t *);
31#endif
32
33/*
34 * The pool of threads has at least min threads in it as long as someone is
35 * using the facility, and may have as many as max.
36 *
37 * A portion of the pool may be processing very slow operations.
38 */
39static unsigned slow_work_min_threads = 2;
40static unsigned slow_work_max_threads = 4;
41static unsigned vslow_work_proportion = 50; /* % of threads that may process
42 * very slow work */
43
44#ifdef CONFIG_SYSCTL
45static const int slow_work_min_min_threads = 2;
46static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
47static const int slow_work_min_vslow = 1;
48static const int slow_work_max_vslow = 99;
49
50ctl_table slow_work_sysctls[] = {
51 {
52 .procname = "min-threads",
53 .data = &slow_work_min_threads,
54 .maxlen = sizeof(unsigned),
55 .mode = 0644,
56 .proc_handler = slow_work_min_threads_sysctl,
57 .extra1 = (void *) &slow_work_min_min_threads,
58 .extra2 = &slow_work_max_threads,
59 },
60 {
61 .procname = "max-threads",
62 .data = &slow_work_max_threads,
63 .maxlen = sizeof(unsigned),
64 .mode = 0644,
65 .proc_handler = slow_work_max_threads_sysctl,
66 .extra1 = &slow_work_min_threads,
67 .extra2 = (void *) &slow_work_max_max_threads,
68 },
69 {
70 .procname = "vslow-percentage",
71 .data = &vslow_work_proportion,
72 .maxlen = sizeof(unsigned),
73 .mode = 0644,
74 .proc_handler = proc_dointvec_minmax,
75 .extra1 = (void *) &slow_work_min_vslow,
76 .extra2 = (void *) &slow_work_max_vslow,
77 },
78 {}
79};
80#endif
81
82/*
83 * The active state of the thread pool
84 */
85static atomic_t slow_work_thread_count;
86static atomic_t vslow_work_executing_count;
87
88static bool slow_work_may_not_start_new_thread;
89static bool slow_work_cull; /* cull a thread due to lack of activity */
90static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
91static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
92static struct slow_work slow_work_new_thread; /* new thread starter */
93
94/*
95 * slow work ID allocation (use slow_work_queue_lock)
96 */
97static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
98
99/*
100 * Unregistration tracking to prevent put_ref() from disappearing during module
101 * unload
102 */
103#ifdef CONFIG_MODULES
104static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
105static struct module *slow_work_unreg_module;
106static struct slow_work *slow_work_unreg_work_item;
107static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
108static DEFINE_MUTEX(slow_work_unreg_sync_lock);
109
110static void slow_work_set_thread_processing(int id, struct slow_work *work)
111{
112 if (work)
113 slow_work_thread_processing[id] = work->owner;
114}
115static void slow_work_done_thread_processing(int id, struct slow_work *work)
116{
117 struct module *module = slow_work_thread_processing[id];
118
119 slow_work_thread_processing[id] = NULL;
120 smp_mb();
121 if (slow_work_unreg_work_item == work ||
122 slow_work_unreg_module == module)
123 wake_up_all(&slow_work_unreg_wq);
124}
125static void slow_work_clear_thread_processing(int id)
126{
127 slow_work_thread_processing[id] = NULL;
128}
129#else
130static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
131static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
132static void slow_work_clear_thread_processing(int id) {}
133#endif
134
135/*
136 * Data for tracking currently executing items for indication through /proc
137 */
138#ifdef CONFIG_SLOW_WORK_DEBUG
139struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
140pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
141DEFINE_RWLOCK(slow_work_execs_lock);
142#endif
143
144/*
145 * The queues of work items and the lock governing access to them. These are
146 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
147 * as the number of threads bears no relation to the number of CPUs.
148 *
149 * There are two queues of work items: one for slow work items, and one for
150 * very slow work items.
151 */
152LIST_HEAD(slow_work_queue);
153LIST_HEAD(vslow_work_queue);
154DEFINE_SPINLOCK(slow_work_queue_lock);
155
156/*
157 * The following are two wait queues that get pinged when a work item is placed
158 * on an empty queue. These allow work items that are hogging a thread by
159 * sleeping in a way that could be deferred to yield their thread and enqueue
160 * themselves.
161 */
162static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
163static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
164
165/*
166 * The thread controls. A variable used to signal to the threads that they
167 * should exit when the queue is empty, a waitqueue used by the threads to wait
168 * for signals, and a completion set by the last thread to exit.
169 */
170static bool slow_work_threads_should_exit;
171static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
172static DECLARE_COMPLETION(slow_work_last_thread_exited);
173
174/*
175 * The number of users of the thread pool and its lock. Whilst this is zero we
176 * have no threads hanging around, and when this reaches zero, we wait for all
177 * active or queued work items to complete and kill all the threads we do have.
178 */
179static int slow_work_user_count;
180static DEFINE_MUTEX(slow_work_user_lock);
181
182static inline int slow_work_get_ref(struct slow_work *work)
183{
184 if (work->ops->get_ref)
185 return work->ops->get_ref(work);
186
187 return 0;
188}
189
190static inline void slow_work_put_ref(struct slow_work *work)
191{
192 if (work->ops->put_ref)
193 work->ops->put_ref(work);
194}
195
196/*
197 * Calculate the maximum number of active threads in the pool that are
198 * permitted to process very slow work items.
199 *
200 * The answer is rounded up to at least 1, but may not equal or exceed the
201 * maximum number of the threads in the pool. This means we always have at
202 * least one thread that can process slow work items, and we always have at
203 * least one thread that won't get tied up doing so.
204 */
205static unsigned slow_work_calc_vsmax(void)
206{
207 unsigned vsmax;
208
209 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
210 vsmax /= 100;
211 vsmax = max(vsmax, 1U);
212 return min(vsmax, slow_work_max_threads - 1);
213}
214
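For example, with the defaults above (max of 4 threads, vslow-percentage of 50) and a fully populated pool: vsmax = min(max(4 * 50 / 100, 1U), 4 - 1) = 2, so at most two of the four threads may be tied up in very slow items and at least two remain free of them for ordinary slow work.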
215/*
216 * Attempt to execute stuff queued on a slow thread. Return true if we managed
217 * it, false if there was nothing to do.
218 */
219static noinline bool slow_work_execute(int id)
220{
221 struct slow_work *work = NULL;
222 unsigned vsmax;
223 bool very_slow;
224
225 vsmax = slow_work_calc_vsmax();
226
227 /* see if we can schedule a new thread to be started if we're not
228 * keeping up with the work */
229 if (!waitqueue_active(&slow_work_thread_wq) &&
230 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
231 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
232 !slow_work_may_not_start_new_thread)
233 slow_work_enqueue(&slow_work_new_thread);
234
235 /* find something to execute */
236 spin_lock_irq(&slow_work_queue_lock);
237 if (!list_empty(&vslow_work_queue) &&
238 atomic_read(&vslow_work_executing_count) < vsmax) {
239 work = list_entry(vslow_work_queue.next,
240 struct slow_work, link);
241 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
242 BUG();
243 list_del_init(&work->link);
244 atomic_inc(&vslow_work_executing_count);
245 very_slow = true;
246 } else if (!list_empty(&slow_work_queue)) {
247 work = list_entry(slow_work_queue.next,
248 struct slow_work, link);
249 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
250 BUG();
251 list_del_init(&work->link);
252 very_slow = false;
253 } else {
254 very_slow = false; /* avoid the compiler warning */
255 }
256
257 slow_work_set_thread_processing(id, work);
258 if (work) {
259 slow_work_mark_time(work);
260 slow_work_begin_exec(id, work);
261 }
262
263 spin_unlock_irq(&slow_work_queue_lock);
264
265 if (!work)
266 return false;
267
268 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
269 BUG();
270
271 /* don't execute if the work is in the process of being cancelled */
272 if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
273 work->ops->execute(work);
274
275 if (very_slow)
276 atomic_dec(&vslow_work_executing_count);
277 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
278
279 /* wake up anyone waiting for this work to be complete */
280 wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
281
282 slow_work_end_exec(id, work);
283
284 /* if someone tried to enqueue the item whilst we were executing it,
285 * then it'll be left unenqueued to avoid multiple threads trying to
286 * execute it simultaneously
287 *
288 * there is, however, a race between us testing the pending flag and
289 * getting the spinlock, and between the enqueuer setting the pending
290 * flag and getting the spinlock, so we use a deferral bit to tell us
291 * if the enqueuer got there first
292 */
293 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
294 spin_lock_irq(&slow_work_queue_lock);
295
296 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
297 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
298 goto auto_requeue;
299
300 spin_unlock_irq(&slow_work_queue_lock);
301 }
302
303 /* sort out the race between module unloading and put_ref() */
304 slow_work_put_ref(work);
305 slow_work_done_thread_processing(id, work);
306
307 return true;
308
309auto_requeue:
310 /* we must complete the enqueue operation
311 * - we transfer our ref on the item back to the appropriate queue
312 * - don't wake another thread up as we're awake already
313 */
314 slow_work_mark_time(work);
315 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
316 list_add_tail(&work->link, &vslow_work_queue);
317 else
318 list_add_tail(&work->link, &slow_work_queue);
319 spin_unlock_irq(&slow_work_queue_lock);
320 slow_work_clear_thread_processing(id);
321 return true;
322}
323
324/**
325 * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
 326 * @work: The work item under execution that wants to sleep
 327 * @_timeout: Scheduler sleep timeout
328 *
329 * Allow a requeueable work item to sleep on a slow-work processor thread until
330 * that thread is needed to do some other work or the sleep is interrupted by
331 * some other event.
332 *
333 * The caller must set up a wake up event before calling this and must have set
334 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
335 * condition before calling this function as no test is made here.
336 *
337 * False is returned if there is nothing on the queue; true is returned if the
338 * work item should be requeued
339 */
340bool slow_work_sleep_till_thread_needed(struct slow_work *work,
341 signed long *_timeout)
342{
343 wait_queue_head_t *wfo_wq;
344 struct list_head *queue;
345
346 DEFINE_WAIT(wait);
347
348 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
349 wfo_wq = &vslow_work_queue_waits_for_occupation;
350 queue = &vslow_work_queue;
351 } else {
352 wfo_wq = &slow_work_queue_waits_for_occupation;
353 queue = &slow_work_queue;
354 }
355
356 if (!list_empty(queue))
357 return true;
358
359 add_wait_queue_exclusive(wfo_wq, &wait);
360 if (list_empty(queue))
361 *_timeout = schedule_timeout(*_timeout);
362 finish_wait(wfo_wq, &wait);
363
364 return !list_empty(queue);
365}
366EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
367
368/**
369 * slow_work_enqueue - Schedule a slow work item for processing
370 * @work: The work item to queue
371 *
372 * Schedule a slow work item for processing. If the item is already undergoing
373 * execution, this guarantees not to re-enter the execution routine until the
374 * first execution finishes.
375 *
376 * The item is pinned by this function as it retains a reference to it, managed
377 * through the item operations. The item is unpinned once it has been
378 * executed.
379 *
380 * An item may hog the thread that is running it for a relatively large amount
381 * of time, sufficient, for example, to perform several lookup, mkdir, create
382 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
383 *
384 * Conversely, if a number of items are awaiting processing, it may take some
385 * time before any given item is given attention. The number of threads in the
386 * pool may be increased to deal with demand, but only up to a limit.
387 *
388 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
389 * the very slow queue, from which only a portion of the threads will be
390 * allowed to pick items to execute. This ensures that very slow items won't
391 * overly block ones that are just ordinarily slow.
392 *
393 * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
 394 * attempted to be queued)
395 */
396int slow_work_enqueue(struct slow_work *work)
397{
398 wait_queue_head_t *wfo_wq;
399 struct list_head *queue;
400 unsigned long flags;
401 int ret;
402
403 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
404 return -ECANCELED;
405
406 BUG_ON(slow_work_user_count <= 0);
407 BUG_ON(!work);
408 BUG_ON(!work->ops);
409
410 /* when honouring an enqueue request, we only promise that we will run
411 * the work function in the future; we do not promise to run it once
412 * per enqueue request
413 *
414 * we use the PENDING bit to merge together repeat requests without
415 * having to disable IRQs and take the spinlock, whilst still
416 * maintaining our promise
417 */
418 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
419 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
420 wfo_wq = &vslow_work_queue_waits_for_occupation;
421 queue = &vslow_work_queue;
422 } else {
423 wfo_wq = &slow_work_queue_waits_for_occupation;
424 queue = &slow_work_queue;
425 }
426
427 spin_lock_irqsave(&slow_work_queue_lock, flags);
428
429 if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
430 goto cancelled;
431
432 /* we promise that we will not attempt to execute the work
433 * function in more than one thread simultaneously
434 *
435 * this, however, leaves us with a problem if we're asked to
436 * enqueue the work whilst someone is executing the work
437 * function as simply queueing the work immediately means that
438 * another thread may try executing it whilst it is already
439 * under execution
440 *
441 * to deal with this, we set the ENQ_DEFERRED bit instead of
442 * enqueueing, and the thread currently executing the work
443 * function will enqueue the work item when the work function
444 * returns and it has cleared the EXECUTING bit
445 */
446 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
447 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
448 } else {
449 ret = slow_work_get_ref(work);
450 if (ret < 0)
451 goto failed;
452 slow_work_mark_time(work);
453 list_add_tail(&work->link, queue);
454 wake_up(&slow_work_thread_wq);
455
456 /* if someone who could be requeued is sleeping on a
457 * thread, then ask them to yield their thread */
458 if (work->link.prev == queue)
459 wake_up(wfo_wq);
460 }
461
462 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
463 }
464 return 0;
465
466cancelled:
467 ret = -ECANCELED;
468failed:
469 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
470 return ret;
471}
472EXPORT_SYMBOL(slow_work_enqueue);
473
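A hypothetical caller of this (since-removed) API, assuming the pool has already been brought up with slow_work_register_user() and that my_ops supplies the get_ref/put_ref/execute callbacks, might look roughly like:

	static struct slow_work my_work;

	static void my_kick_off(void)
	{
		slow_work_init(&my_work, &my_ops);
		if (slow_work_enqueue(&my_work) < 0)
			printk(KERN_WARNING "my_work could not be queued\n");
	}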
474static int slow_work_wait(void *word)
475{
476 schedule();
477 return 0;
478}
479
480/**
481 * slow_work_cancel - Cancel a slow work item
482 * @work: The work item to cancel
483 *
484 * This function will cancel a previously enqueued work item. If we cannot
 485 * cancel the work item, it is guaranteed to have run when this function
486 * returns.
487 */
488void slow_work_cancel(struct slow_work *work)
489{
490 bool wait = true, put = false;
491
492 set_bit(SLOW_WORK_CANCELLING, &work->flags);
493 smp_mb();
494
495 /* if the work item is a delayed work item with an active timer, we
496 * need to wait for the timer to finish _before_ getting the spinlock,
497 * lest we deadlock against the timer routine
498 *
499 * the timer routine will leave DELAYED set if it notices the
500 * CANCELLING flag in time
501 */
502 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
503 struct delayed_slow_work *dwork =
504 container_of(work, struct delayed_slow_work, work);
505 del_timer_sync(&dwork->timer);
506 }
507
508 spin_lock_irq(&slow_work_queue_lock);
509
510 if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
511 /* the timer routine aborted or never happened, so we are left
512 * holding the timer's reference on the item and should just
513 * drop the pending flag and wait for any ongoing execution to
514 * finish */
515 struct delayed_slow_work *dwork =
516 container_of(work, struct delayed_slow_work, work);
517
518 BUG_ON(timer_pending(&dwork->timer));
519 BUG_ON(!list_empty(&work->link));
520
521 clear_bit(SLOW_WORK_DELAYED, &work->flags);
522 put = true;
523 clear_bit(SLOW_WORK_PENDING, &work->flags);
524
525 } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
526 !list_empty(&work->link)) {
527 /* the link in the pending queue holds a reference on the item
528 * that we will need to release */
529 list_del_init(&work->link);
530 wait = false;
531 put = true;
532 clear_bit(SLOW_WORK_PENDING, &work->flags);
533
534 } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
535 /* the executor is holding our only reference on the item, so
536 * we merely need to wait for it to finish executing */
537 clear_bit(SLOW_WORK_PENDING, &work->flags);
538 }
539
540 spin_unlock_irq(&slow_work_queue_lock);
541
542 /* the EXECUTING flag is set by the executor whilst the spinlock is set
543 * and before the item is dequeued - so assuming the above doesn't
544 * actually dequeue it, simply waiting for the EXECUTING flag to be
545 * released here should be sufficient */
546 if (wait)
547 wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
548 TASK_UNINTERRUPTIBLE);
549
550 clear_bit(SLOW_WORK_CANCELLING, &work->flags);
551 if (put)
552 slow_work_put_ref(work);
553}
554EXPORT_SYMBOL(slow_work_cancel);
555
556/*
557 * Handle expiry of the delay timer, indicating that a delayed slow work item
558 * should now be queued if not cancelled
559 */
560static void delayed_slow_work_timer(unsigned long data)
561{
562 wait_queue_head_t *wfo_wq;
563 struct list_head *queue;
564 struct slow_work *work = (struct slow_work *) data;
565 unsigned long flags;
566 bool queued = false, put = false, first = false;
567
568 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
569 wfo_wq = &vslow_work_queue_waits_for_occupation;
570 queue = &vslow_work_queue;
571 } else {
572 wfo_wq = &slow_work_queue_waits_for_occupation;
573 queue = &slow_work_queue;
574 }
575
576 spin_lock_irqsave(&slow_work_queue_lock, flags);
577 if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
578 clear_bit(SLOW_WORK_DELAYED, &work->flags);
579
580 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
581 /* we discard the reference the timer was holding in
582 * favour of the one the executor holds */
583 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
584 put = true;
585 } else {
586 slow_work_mark_time(work);
587 list_add_tail(&work->link, queue);
588 queued = true;
589 if (work->link.prev == queue)
590 first = true;
591 }
592 }
593
594 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
595 if (put)
596 slow_work_put_ref(work);
597 if (first)
598 wake_up(wfo_wq);
599 if (queued)
600 wake_up(&slow_work_thread_wq);
601}
602
603/**
604 * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
605 * @dwork: The delayed work item to queue
606 * @delay: When to start executing the work, in jiffies from now
607 *
608 * This is similar to slow_work_enqueue(), but it adds a delay before the work
609 * is actually queued for processing.
610 *
611 * The item can have delayed processing requested on it whilst it is being
612 * executed. The delay will begin immediately, and if it expires before the
613 * item finishes executing, the item will be placed back on the queue when it
614 * has finished executing.
615 */
616int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
617 unsigned long delay)
618{
619 struct slow_work *work = &dwork->work;
620 unsigned long flags;
621 int ret;
622
623 if (delay == 0)
624 return slow_work_enqueue(&dwork->work);
625
626 BUG_ON(slow_work_user_count <= 0);
627 BUG_ON(!work);
628 BUG_ON(!work->ops);
629
630 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
631 return -ECANCELED;
632
633 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
634 spin_lock_irqsave(&slow_work_queue_lock, flags);
635
636 if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
637 goto cancelled;
638
639 /* the timer holds a reference whilst it is pending */
640 ret = slow_work_get_ref(work);
641 if (ret < 0)
642 goto cant_get_ref;
643
644 if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
645 BUG();
646 dwork->timer.expires = jiffies + delay;
647 dwork->timer.data = (unsigned long) work;
648 dwork->timer.function = delayed_slow_work_timer;
649 add_timer(&dwork->timer);
650
651 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
652 }
653
654 return 0;
655
656cancelled:
657 ret = -ECANCELED;
658cant_get_ref:
659 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
660 return ret;
661}
662EXPORT_SYMBOL(delayed_slow_work_enqueue);
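A sketch of the delayed variant, again with illustrative names; delayed_slow_work_init() is the matching initialiser in the public slow-work header, and cancellation goes through slow_work_cancel() on the embedded work item as shown above.

static struct delayed_slow_work my_dwork;	/* delayed_slow_work_init(&my_dwork, &my_ops) at setup */

static int my_kick_later(void)
{
	/* runs my_execute() roughly five seconds from now; a delay of 0 falls
	 * straight through to slow_work_enqueue() */
	return delayed_slow_work_enqueue(&my_dwork, 5 * HZ);
}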
663
664/*
665 * Schedule a cull of the thread pool at some time in the near future
666 */
667static void slow_work_schedule_cull(void)
668{
669 mod_timer(&slow_work_cull_timer,
670 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
671}
672
673/*
674 * Worker thread culling algorithm
675 */
676static bool slow_work_cull_thread(void)
677{
678 unsigned long flags;
679 bool do_cull = false;
680
681 spin_lock_irqsave(&slow_work_queue_lock, flags);
682
683 if (slow_work_cull) {
684 slow_work_cull = false;
685
686 if (list_empty(&slow_work_queue) &&
687 list_empty(&vslow_work_queue) &&
688 atomic_read(&slow_work_thread_count) >
689 slow_work_min_threads) {
690 slow_work_schedule_cull();
691 do_cull = true;
692 }
693 }
694
695 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
696 return do_cull;
697}
698
699/*
700 * Determine if there is slow work available for dispatch
701 */
702static inline bool slow_work_available(int vsmax)
703{
704 return !list_empty(&slow_work_queue) ||
705 (!list_empty(&vslow_work_queue) &&
706 atomic_read(&vslow_work_executing_count) < vsmax);
707}
708
709/*
710 * Worker thread dispatcher
711 */
712static int slow_work_thread(void *_data)
713{
714 int vsmax, id;
715
716 DEFINE_WAIT(wait);
717
718 set_freezable();
719 set_user_nice(current, -5);
720
721 /* allocate ourselves an ID */
722 spin_lock_irq(&slow_work_queue_lock);
723 id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
724 BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
725 __set_bit(id, slow_work_ids);
726 slow_work_set_thread_pid(id, current->pid);
727 spin_unlock_irq(&slow_work_queue_lock);
728
729 sprintf(current->comm, "kslowd%03u", id);
730
731 for (;;) {
732 vsmax = vslow_work_proportion;
733 vsmax *= atomic_read(&slow_work_thread_count);
734 vsmax /= 100;
735
736 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
737 TASK_INTERRUPTIBLE);
738 if (!freezing(current) &&
739 !slow_work_threads_should_exit &&
740 !slow_work_available(vsmax) &&
741 !slow_work_cull)
742 schedule();
743 finish_wait(&slow_work_thread_wq, &wait);
744
745 try_to_freeze();
746
747 vsmax = vslow_work_proportion;
748 vsmax *= atomic_read(&slow_work_thread_count);
749 vsmax /= 100;
750
751 if (slow_work_available(vsmax) && slow_work_execute(id)) {
752 cond_resched();
753 if (list_empty(&slow_work_queue) &&
754 list_empty(&vslow_work_queue) &&
755 atomic_read(&slow_work_thread_count) >
756 slow_work_min_threads)
757 slow_work_schedule_cull();
758 continue;
759 }
760
761 if (slow_work_threads_should_exit)
762 break;
763
764 if (slow_work_cull && slow_work_cull_thread())
765 break;
766 }
767
768 spin_lock_irq(&slow_work_queue_lock);
769 slow_work_set_thread_pid(id, 0);
770 __clear_bit(id, slow_work_ids);
771 spin_unlock_irq(&slow_work_queue_lock);
772
773 if (atomic_dec_and_test(&slow_work_thread_count))
774 complete_and_exit(&slow_work_last_thread_exited, 0);
775 return 0;
776}
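A worked example of the vsmax computation in the loop above, using the facility's default proportion of 50%:

	vsmax = vslow_work_proportion * thread_count / 100
	      = 50 * 8 / 100
	      = 4

so with eight pool threads at most four may be occupied by very-slow items at any one time, keeping the rest available for the ordinary slow_work_queue.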
777
778/*
779 * Handle thread cull timer expiration
780 */
781static void slow_work_cull_timeout(unsigned long data)
782{
783 slow_work_cull = true;
784 wake_up(&slow_work_thread_wq);
785}
786
787/*
788 * Start a new slow work thread
789 */
790static void slow_work_new_thread_execute(struct slow_work *work)
791{
792 struct task_struct *p;
793
794 if (slow_work_threads_should_exit)
795 return;
796
797 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
798 return;
799
800 if (!mutex_trylock(&slow_work_user_lock))
801 return;
802
803 slow_work_may_not_start_new_thread = true;
804 atomic_inc(&slow_work_thread_count);
805 p = kthread_run(slow_work_thread, NULL, "kslowd");
806 if (IS_ERR(p)) {
807 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
808 if (atomic_dec_and_test(&slow_work_thread_count))
809 BUG(); /* we're running on a slow work thread... */
810 mod_timer(&slow_work_oom_timer,
811 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
812 } else {
813 /* ratelimit the starting of new threads */
814 mod_timer(&slow_work_oom_timer, jiffies + 1);
815 }
816
817 mutex_unlock(&slow_work_user_lock);
818}
819
820static const struct slow_work_ops slow_work_new_thread_ops = {
821 .owner = THIS_MODULE,
822 .execute = slow_work_new_thread_execute,
823#ifdef CONFIG_SLOW_WORK_DEBUG
824 .desc = slow_work_new_thread_desc,
825#endif
826};
827
828/*
829 * post-OOM new thread start suppression expiration
830 */
831static void slow_work_oom_timeout(unsigned long data)
832{
833 slow_work_may_not_start_new_thread = false;
834}
835
836#ifdef CONFIG_SYSCTL
837/*
838 * Handle adjustment of the minimum number of threads
839 */
840static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
841 void __user *buffer,
842 size_t *lenp, loff_t *ppos)
843{
844 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
845 int n;
846
847 if (ret == 0) {
848 mutex_lock(&slow_work_user_lock);
849 if (slow_work_user_count > 0) {
850 /* see if we need to start or stop threads */
851 n = atomic_read(&slow_work_thread_count) -
852 slow_work_min_threads;
853
854 if (n < 0 && !slow_work_may_not_start_new_thread)
855 slow_work_enqueue(&slow_work_new_thread);
856 else if (n > 0)
857 slow_work_schedule_cull();
858 }
859 mutex_unlock(&slow_work_user_lock);
860 }
861
862 return ret;
863}
864
865/*
866 * Handle adjustment of the maximum number of threads
867 */
868static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
869 void __user *buffer,
870 size_t *lenp, loff_t *ppos)
871{
872 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
873 int n;
874
875 if (ret == 0) {
876 mutex_lock(&slow_work_user_lock);
877 if (slow_work_user_count > 0) {
878 /* see if we need to stop threads */
879 n = slow_work_max_threads -
880 atomic_read(&slow_work_thread_count);
881
882 if (n < 0)
883 slow_work_schedule_cull();
884 }
885 mutex_unlock(&slow_work_user_lock);
886 }
887
888 return ret;
889}
890#endif /* CONFIG_SYSCTL */
891
892/**
893 * slow_work_register_user - Register a user of the facility
894 * @module: The module about to make use of the facility
895 *
896 * Register a user of the facility, starting up the initial threads if there
897 * aren't any other users at this point. This will return 0 if successful, or
898 * an error if not.
899 */
900int slow_work_register_user(struct module *module)
901{
902 struct task_struct *p;
903 int loop;
904
905 mutex_lock(&slow_work_user_lock);
906
907 if (slow_work_user_count == 0) {
908 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
909 init_completion(&slow_work_last_thread_exited);
910
911 slow_work_threads_should_exit = false;
912 slow_work_init(&slow_work_new_thread,
913 &slow_work_new_thread_ops);
914 slow_work_may_not_start_new_thread = false;
915 slow_work_cull = false;
916
917 /* start the minimum number of threads */
918 for (loop = 0; loop < slow_work_min_threads; loop++) {
919 atomic_inc(&slow_work_thread_count);
920 p = kthread_run(slow_work_thread, NULL, "kslowd");
921 if (IS_ERR(p))
922 goto error;
923 }
924 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
925 }
926
927 slow_work_user_count++;
928 mutex_unlock(&slow_work_user_lock);
929 return 0;
930
931error:
932 if (atomic_dec_and_test(&slow_work_thread_count))
933 complete(&slow_work_last_thread_exited);
934 if (loop > 0) {
935 printk(KERN_ERR "Slow work thread pool:"
936 " Aborting startup on ENOMEM\n");
937 slow_work_threads_should_exit = true;
938 wake_up_all(&slow_work_thread_wq);
939 wait_for_completion(&slow_work_last_thread_exited);
940 printk(KERN_ERR "Slow work thread pool: Aborted\n");
941 }
942 mutex_unlock(&slow_work_user_lock);
943 return PTR_ERR(p);
944}
945EXPORT_SYMBOL(slow_work_register_user);
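A module that wants the pool available would pair the call above with its init routine; this is a sketch, with my_module_init() standing in for the caller's own init function.

static int __init my_module_init(void)
{
	int ret;

	ret = slow_work_register_user(THIS_MODULE);
	if (ret < 0)
		return ret;

	/* ... initialise and enqueue this module's slow work items ... */
	return 0;
}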
946
947/*
948 * wait for all outstanding items from the calling module to complete
949 * - note that more items may be queued whilst we're waiting
950 */
951static void slow_work_wait_for_items(struct module *module)
952{
953#ifdef CONFIG_MODULES
954 DECLARE_WAITQUEUE(myself, current);
955 struct slow_work *work;
956 int loop;
957
958 mutex_lock(&slow_work_unreg_sync_lock);
959 add_wait_queue(&slow_work_unreg_wq, &myself);
960
961 for (;;) {
962 spin_lock_irq(&slow_work_queue_lock);
963
964 /* first of all, we wait for the last queued item in each list
965 * to be processed */
966 list_for_each_entry_reverse(work, &vslow_work_queue, link) {
967 if (work->owner == module) {
968 set_current_state(TASK_UNINTERRUPTIBLE);
969 slow_work_unreg_work_item = work;
970 goto do_wait;
971 }
972 }
973 list_for_each_entry_reverse(work, &slow_work_queue, link) {
974 if (work->owner == module) {
975 set_current_state(TASK_UNINTERRUPTIBLE);
976 slow_work_unreg_work_item = work;
977 goto do_wait;
978 }
979 }
980
981 /* then we wait for the items being processed to finish */
982 slow_work_unreg_module = module;
983 smp_mb();
984 for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
985 if (slow_work_thread_processing[loop] == module)
986 goto do_wait;
987 }
988 spin_unlock_irq(&slow_work_queue_lock);
989 break; /* okay, we're done */
990
991 do_wait:
992 spin_unlock_irq(&slow_work_queue_lock);
993 schedule();
994 slow_work_unreg_work_item = NULL;
995 slow_work_unreg_module = NULL;
996 }
997
998 remove_wait_queue(&slow_work_unreg_wq, &myself);
999 mutex_unlock(&slow_work_unreg_sync_lock);
1000#endif /* CONFIG_MODULES */
1001}
1002
1003/**
1004 * slow_work_unregister_user - Unregister a user of the facility
1005 * @module: The module whose items should be cleared
1006 *
1007 * Unregister a user of the facility, killing all the threads if this was the
1008 * last one.
1009 *
1010 * This waits for all the work items belonging to the nominated module to go
1011 * away before proceeding.
1012 */
1013void slow_work_unregister_user(struct module *module)
1014{
1015 /* first of all, wait for all outstanding items from the calling module
1016 * to complete */
1017 if (module)
1018 slow_work_wait_for_items(module);
1019
1020 /* then we can actually go about shutting down the facility if need
1021 * be */
1022 mutex_lock(&slow_work_user_lock);
1023
1024 BUG_ON(slow_work_user_count <= 0);
1025
1026 slow_work_user_count--;
1027 if (slow_work_user_count == 0) {
1028 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
1029 slow_work_threads_should_exit = true;
1030 del_timer_sync(&slow_work_cull_timer);
1031 del_timer_sync(&slow_work_oom_timer);
1032 wake_up_all(&slow_work_thread_wq);
1033 wait_for_completion(&slow_work_last_thread_exited);
1034 printk(KERN_NOTICE "Slow work thread pool:"
1035 " Shut down complete\n");
1036 }
1037
1038 mutex_unlock(&slow_work_user_lock);
1039}
1040EXPORT_SYMBOL(slow_work_unregister_user);
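And the matching exit path, again a sketch: unregistering waits for this module's outstanding items before dropping the pool reference.

static void __exit my_module_exit(void)
{
	/* cancels nothing itself; it waits for queued and executing items
	 * owned by THIS_MODULE, then may shut the pool down if we were the
	 * last user */
	slow_work_unregister_user(THIS_MODULE);
}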
1041
1042/*
1043 * Initialise the slow work facility
1044 */
1045static int __init init_slow_work(void)
1046{
1047 unsigned nr_cpus = num_possible_cpus();
1048
1049 if (slow_work_max_threads < nr_cpus)
1050 slow_work_max_threads = nr_cpus;
1051#ifdef CONFIG_SYSCTL
1052 if (slow_work_max_max_threads < nr_cpus * 2)
1053 slow_work_max_max_threads = nr_cpus * 2;
1054#endif
1055#ifdef CONFIG_SLOW_WORK_DEBUG
1056 {
1057 struct dentry *dbdir;
1058
1059 dbdir = debugfs_create_dir("slow_work", NULL);
1060 if (dbdir && !IS_ERR(dbdir))
1061 debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
1062 NULL, &slow_work_runqueue_fops);
1063 }
1064#endif
1065 return 0;
1066}
1067
1068subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/* Slow work private definitions
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
13 * things to do */
14#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
15 * OOM */
16
17#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
18
19/*
20 * slow-work.c
21 */
22#ifdef CONFIG_SLOW_WORK_DEBUG
23extern struct slow_work *slow_work_execs[];
24extern pid_t slow_work_pids[];
25extern rwlock_t slow_work_execs_lock;
26#endif
27
28extern struct list_head slow_work_queue;
29extern struct list_head vslow_work_queue;
30extern spinlock_t slow_work_queue_lock;
31
32/*
33 * slow-work-debugfs.c
34 */
35#ifdef CONFIG_SLOW_WORK_DEBUG
36extern const struct file_operations slow_work_runqueue_fops;
37
38extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
39#endif
40
41/*
42 * Helper functions
43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{
46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid;
48#endif
49}
50
51static inline void slow_work_mark_time(struct slow_work *work)
52{
53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME;
55#endif
56}
57
58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{
60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work;
62#endif
63}
64
65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{
67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock);
71#endif
72}
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c970c715d3..ed6aacfcb7ef 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -365,9 +365,10 @@ call:
365EXPORT_SYMBOL_GPL(smp_call_function_any); 365EXPORT_SYMBOL_GPL(smp_call_function_any);
366 366
367/** 367/**
368 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on a specific CPU
369 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
370 * @data: Pre-allocated and setup data structure 370 * @data: Pre-allocated and setup data structure
371 * @wait: If true, wait until function has completed on specified CPU.
371 * 372 *
372 * Like smp_call_function_single(), but allow caller to pass in a 373 * Like smp_call_function_single(), but allow caller to pass in a
373 * pre-allocated data structure. Useful for embedding @data inside 374 * pre-allocated data structure. Useful for embedding @data inside
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
376void __smp_call_function_single(int cpu, struct call_single_data *data, 377void __smp_call_function_single(int cpu, struct call_single_data *data,
377 int wait) 378 int wait)
378{ 379{
379 csd_lock(data); 380 unsigned int this_cpu;
381 unsigned long flags;
380 382
383 this_cpu = get_cpu();
381 /* 384 /*
382 * Can deadlock when called with interrupts disabled. 385 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can 386 * We allow cpu's that are not yet online though, as no one else can
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 390 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress); 391 && !oops_in_progress);
389 392
390 generic_exec_single(cpu, data, wait); 393 if (cpu == this_cpu) {
394 local_irq_save(flags);
395 data->func(data->info);
396 local_irq_restore(flags);
397 } else {
398 csd_lock(data);
399 generic_exec_single(cpu, data, wait);
400 }
401 put_cpu();
391} 402}
392 403
393/** 404/**
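A caller-side sketch of the interface patched above, with illustrative names: the call_single_data is embedded in the caller's own structure, which is exactly the case the kerneldoc describes.

struct my_ipi {
	struct call_single_data csd;
	int payload;
};

static void my_ipi_func(void *info)
{
	struct my_ipi *ipi = info;

	/* runs on the target CPU; with the hunk above it now also runs
	 * correctly (with IRQs off) when the target is the local CPU */
	(void)ipi->payload;
}

static void my_send(int cpu, struct my_ipi *ipi)
{
	ipi->csd.func  = my_ipi_func;
	ipi->csd.info  = ipi;
	ipi->csd.flags = 0;
	__smp_call_function_single(cpu, &ipi->csd, 0);	/* wait == 0 */
}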
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 *
 6 * this code detects soft lockups: incidents where, on a CPU,
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9#include <linux/mm.h>
10#include <linux/cpu.h>
11#include <linux/nmi.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/freezer.h>
15#include <linux/kthread.h>
16#include <linux/lockdep.h>
17#include <linux/notifier.h>
18#include <linux/module.h>
19#include <linux/sysctl.h>
20
21#include <asm/irq_regs.h>
22
23static DEFINE_SPINLOCK(print_lock);
24
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
29
30static int __read_mostly did_panic;
31int __read_mostly softlockup_thresh = 60;
32
33/*
34 * Should we panic (and reboot, if panic_timeout= is set) when a
35 * soft-lockup occurs:
36 */
37unsigned int __read_mostly softlockup_panic =
38 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
39
40static int __init softlockup_panic_setup(char *str)
41{
42 softlockup_panic = simple_strtoul(str, NULL, 0);
43
44 return 1;
45}
46__setup("softlockup_panic=", softlockup_panic_setup);
47
48static int
49softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
50{
51 did_panic = 1;
52
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block panic_block = {
57 .notifier_call = softlock_panic,
58};
59
60/*
61 * Returns seconds, approximately. We don't need nanosecond
62 * resolution, and we don't need to waste time with a big divide when
63 * 2^30ns == 1.074s.
64 */
65static unsigned long get_timestamp(int this_cpu)
66{
67 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
68}
69
70static void __touch_softlockup_watchdog(void)
71{
72 int this_cpu = raw_smp_processor_id();
73
74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
75}
76
77void touch_softlockup_watchdog(void)
78{
79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
80}
81EXPORT_SYMBOL(touch_softlockup_watchdog);
82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
89void touch_all_softlockup_watchdogs(void)
90{
91 int cpu;
92
93 /* Cause each CPU to re-update its timestamp rather than complain */
94 for_each_online_cpu(cpu)
95 per_cpu(softlockup_touch_ts, cpu) = 0;
96}
97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
98
99int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 touch_all_softlockup_watchdogs();
104 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
105}
106
107/*
108 * This callback runs from the timer interrupt, and checks
109 * whether the watchdog thread has hung or not:
110 */
111void softlockup_tick(void)
112{
113 int this_cpu = smp_processor_id();
114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
115 unsigned long print_ts;
116 struct pt_regs *regs = get_irq_regs();
117 unsigned long now;
118
119 /* Is detection switched off? */
120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
121 /* Be sure we don't false trigger if switched back on */
122 if (touch_ts)
123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
124 return;
125 }
126
127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
136 __touch_softlockup_watchdog();
137 return;
138 }
139
140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
141
142 /* report at most once a second */
143 if (print_ts == touch_ts || did_panic)
144 return;
145
146 /* do not print during early bootup: */
147 if (unlikely(system_state != SYSTEM_RUNNING)) {
148 __touch_softlockup_watchdog();
149 return;
150 }
151
152 now = get_timestamp(this_cpu);
153
154 /*
155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan.
157 */
158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160
161 /* Warn about unreasonable delays: */
162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return;
164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
166
167 spin_lock(&print_lock);
168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
169 this_cpu, now - touch_ts,
170 current->comm, task_pid_nr(current));
171 print_modules();
172 print_irqtrace_events(current);
173 if (regs)
174 show_regs(regs);
175 else
176 dump_stack();
177 spin_unlock(&print_lock);
178
179 if (softlockup_panic)
180 panic("softlockup: hung tasks");
181}
182
183/*
184 * The watchdog thread - runs every second and touches the timestamp.
185 */
186static int watchdog(void *__bind_cpu)
187{
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 sched_setscheduler(current, SCHED_FIFO, &param);
191
192 /* initialize timestamp */
193 __touch_softlockup_watchdog();
194
195 set_current_state(TASK_INTERRUPTIBLE);
196 /*
197 * Run briefly once per second to reset the softlockup timestamp.
198 * If this gets delayed for more than 60 seconds then the
199 * debug-printout triggers in softlockup_tick().
200 */
201 while (!kthread_should_stop()) {
202 __touch_softlockup_watchdog();
203 schedule();
204
205 if (kthread_should_stop())
206 break;
207
208 set_current_state(TASK_INTERRUPTIBLE);
209 }
210 __set_current_state(TASK_RUNNING);
211
212 return 0;
213}
214
215/*
216 * Create/destroy watchdog threads as CPUs come and go:
217 */
218static int __cpuinit
219cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
220{
221 int hotcpu = (unsigned long)hcpu;
222 struct task_struct *p;
223
224 switch (action) {
225 case CPU_UP_PREPARE:
226 case CPU_UP_PREPARE_FROZEN:
227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
229 if (IS_ERR(p)) {
230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
231 return NOTIFY_BAD;
232 }
233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
234 per_cpu(softlockup_watchdog, hotcpu) = p;
235 kthread_bind(p, hotcpu);
236 break;
237 case CPU_ONLINE:
238 case CPU_ONLINE_FROZEN:
239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
240 break;
241#ifdef CONFIG_HOTPLUG_CPU
242 case CPU_UP_CANCELED:
243 case CPU_UP_CANCELED_FROZEN:
244 if (!per_cpu(softlockup_watchdog, hotcpu))
245 break;
246 /* Unbind so it can run. Fall thru. */
247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
248 cpumask_any(cpu_online_mask));
249 case CPU_DEAD:
250 case CPU_DEAD_FROZEN:
251 p = per_cpu(softlockup_watchdog, hotcpu);
252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
253 kthread_stop(p);
254 break;
255#endif /* CONFIG_HOTPLUG_CPU */
256 }
257 return NOTIFY_OK;
258}
259
260static struct notifier_block __cpuinitdata cpu_nfb = {
261 .notifier_call = cpu_callback
262};
263
264static int __initdata nosoftlockup;
265
266static int __init nosoftlockup_setup(char *str)
267{
268 nosoftlockup = 1;
269 return 1;
270}
271__setup("nosoftlockup", nosoftlockup_setup);
272
273static int __init spawn_softlockup_task(void)
274{
275 void *cpu = (void *)(long)smp_processor_id();
276 int err;
277
278 if (nosoftlockup)
279 return 0;
280
281 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
282 if (err == NOTIFY_BAD) {
283 BUG();
284 return 1;
285 }
286 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
287 register_cpu_notifier(&cpu_nfb);
288
289 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
290
291 return 0;
292}
293early_initcall(spawn_softlockup_task);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 70f8d90331e9..4372ccb25127 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,9 +35,9 @@ struct cpu_stop_done {
35/* the actual stopper, one per every possible cpu, enabled on online cpus */ 35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper { 36struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */
38 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */ 40 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..7f5a0cd296a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
931 pgid = pid; 931 pgid = pid;
932 if (pgid < 0) 932 if (pgid < 0)
933 return -EINVAL; 933 return -EINVAL;
934 rcu_read_lock();
934 935
935 /* From this point forward we keep holding onto the tasklist lock 936 /* From this point forward we keep holding onto the tasklist lock
936 * so that our parent does not change from under us. -DaveM 937 * so that our parent does not change from under us. -DaveM
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
984out: 985out:
985 /* All paths lead to here, thus we are safe. -DaveM */ 986 /* All paths lead to here, thus we are safe. -DaveM */
986 write_unlock_irq(&tasklist_lock); 987 write_unlock_irq(&tasklist_lock);
988 rcu_read_unlock();
987 return err; 989 return err;
988} 990}
989 991
@@ -1236,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1236 1238
1237SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1239SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1238{ 1240{
1239 if (resource >= RLIM_NLIMITS) 1241 struct rlimit value;
1240 return -EINVAL; 1242 int ret;
1241 else { 1243
1242 struct rlimit value; 1244 ret = do_prlimit(current, resource, NULL, &value);
1243 task_lock(current->group_leader); 1245 if (!ret)
1244 value = current->signal->rlim[resource]; 1246 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1245 task_unlock(current->group_leader); 1247
1246 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1248 return ret;
1247 }
1248} 1249}
1249 1250
1250#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1251#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1272,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1272 1273
1273#endif 1274#endif
1274 1275
1275SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1276static inline bool rlim64_is_infinity(__u64 rlim64)
1276{ 1277{
1277 struct rlimit new_rlim, *old_rlim; 1278#if BITS_PER_LONG < 64
1278 int retval; 1279 return rlim64 >= ULONG_MAX;
1280#else
1281 return rlim64 == RLIM64_INFINITY;
1282#endif
1283}
1284
1285static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1286{
1287 if (rlim->rlim_cur == RLIM_INFINITY)
1288 rlim64->rlim_cur = RLIM64_INFINITY;
1289 else
1290 rlim64->rlim_cur = rlim->rlim_cur;
1291 if (rlim->rlim_max == RLIM_INFINITY)
1292 rlim64->rlim_max = RLIM64_INFINITY;
1293 else
1294 rlim64->rlim_max = rlim->rlim_max;
1295}
1296
1297static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1298{
1299 if (rlim64_is_infinity(rlim64->rlim_cur))
1300 rlim->rlim_cur = RLIM_INFINITY;
1301 else
1302 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1303 if (rlim64_is_infinity(rlim64->rlim_max))
1304 rlim->rlim_max = RLIM_INFINITY;
1305 else
1306 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1307}
1308
1309/* make sure you are allowed to change @tsk limits before calling this */
1310int do_prlimit(struct task_struct *tsk, unsigned int resource,
1311 struct rlimit *new_rlim, struct rlimit *old_rlim)
1312{
1313 struct rlimit *rlim;
1314 int retval = 0;
1279 1315
1280 if (resource >= RLIM_NLIMITS) 1316 if (resource >= RLIM_NLIMITS)
1281 return -EINVAL; 1317 return -EINVAL;
1282 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1318 if (new_rlim) {
1283 return -EFAULT; 1319 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1284 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1320 return -EINVAL;
1285 return -EINVAL; 1321 if (resource == RLIMIT_NOFILE &&
1286 old_rlim = current->signal->rlim + resource; 1322 new_rlim->rlim_max > sysctl_nr_open)
1287 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1323 return -EPERM;
1288 !capable(CAP_SYS_RESOURCE))
1289 return -EPERM;
1290 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1291 return -EPERM;
1292
1293 retval = security_task_setrlimit(resource, &new_rlim);
1294 if (retval)
1295 return retval;
1296
1297 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1298 /*
1299 * The caller is asking for an immediate RLIMIT_CPU
1300 * expiry. But we use the zero value to mean "it was
1301 * never set". So let's cheat and make it one second
1302 * instead
1303 */
1304 new_rlim.rlim_cur = 1;
1305 } 1324 }
1306 1325
1307 task_lock(current->group_leader); 1326 /* protect tsk->signal and tsk->sighand from disappearing */
1308 *old_rlim = new_rlim; 1327 read_lock(&tasklist_lock);
1309 task_unlock(current->group_leader); 1328 if (!tsk->sighand) {
1310 1329 retval = -ESRCH;
1311 if (resource != RLIMIT_CPU)
1312 goto out; 1330 goto out;
1331 }
1332
1333 rlim = tsk->signal->rlim + resource;
1334 task_lock(tsk->group_leader);
1335 if (new_rlim) {
1336 if (new_rlim->rlim_max > rlim->rlim_max &&
1337 !capable(CAP_SYS_RESOURCE))
1338 retval = -EPERM;
1339 if (!retval)
1340 retval = security_task_setrlimit(tsk->group_leader,
1341 resource, new_rlim);
1342 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1343 /*
1344 * The caller is asking for an immediate RLIMIT_CPU
1345 * expiry. But we use the zero value to mean "it was
1346 * never set". So let's cheat and make it one second
1347 * instead
1348 */
1349 new_rlim->rlim_cur = 1;
1350 }
1351 }
1352 if (!retval) {
1353 if (old_rlim)
1354 *old_rlim = *rlim;
1355 if (new_rlim)
1356 *rlim = *new_rlim;
1357 }
1358 task_unlock(tsk->group_leader);
1313 1359
1314 /* 1360 /*
1315 * RLIMIT_CPU handling. Note that the kernel fails to return an error 1361 * RLIMIT_CPU handling. Note that the kernel fails to return an error
@@ -1317,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1317 * very long-standing error, and fixing it now risks breakage of 1363 * very long-standing error, and fixing it now risks breakage of
1318 * applications, so we live with it 1364 * applications, so we live with it
1319 */ 1365 */
1320 if (new_rlim.rlim_cur == RLIM_INFINITY) 1366 if (!retval && new_rlim && resource == RLIMIT_CPU &&
1321 goto out; 1367 new_rlim->rlim_cur != RLIM_INFINITY)
1322 1368 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1323 update_rlimit_cpu(new_rlim.rlim_cur);
1324out: 1369out:
1370 read_unlock(&tasklist_lock);
1371 return retval;
1372}
1373
1374/* rcu lock must be held */
1375static int check_prlimit_permission(struct task_struct *task)
1376{
1377 const struct cred *cred = current_cred(), *tcred;
1378
1379 tcred = __task_cred(task);
1380 if ((cred->uid != tcred->euid ||
1381 cred->uid != tcred->suid ||
1382 cred->uid != tcred->uid ||
1383 cred->gid != tcred->egid ||
1384 cred->gid != tcred->sgid ||
1385 cred->gid != tcred->gid) &&
1386 !capable(CAP_SYS_RESOURCE)) {
1387 return -EPERM;
1388 }
1389
1325 return 0; 1390 return 0;
1326} 1391}
1327 1392
1393SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1394 const struct rlimit64 __user *, new_rlim,
1395 struct rlimit64 __user *, old_rlim)
1396{
1397 struct rlimit64 old64, new64;
1398 struct rlimit old, new;
1399 struct task_struct *tsk;
1400 int ret;
1401
1402 if (new_rlim) {
1403 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1404 return -EFAULT;
1405 rlim64_to_rlim(&new64, &new);
1406 }
1407
1408 rcu_read_lock();
1409 tsk = pid ? find_task_by_vpid(pid) : current;
1410 if (!tsk) {
1411 rcu_read_unlock();
1412 return -ESRCH;
1413 }
1414 ret = check_prlimit_permission(tsk);
1415 if (ret) {
1416 rcu_read_unlock();
1417 return ret;
1418 }
1419 get_task_struct(tsk);
1420 rcu_read_unlock();
1421
1422 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1423 old_rlim ? &old : NULL);
1424
1425 if (!ret && old_rlim) {
1426 rlim_to_rlim64(&old, &old64);
1427 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1428 ret = -EFAULT;
1429 }
1430
1431 put_task_struct(tsk);
1432 return ret;
1433}
1434
1435SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1436{
1437 struct rlimit new_rlim;
1438
1439 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1440 return -EFAULT;
1441 return do_prlimit(current, resource, &new_rlim, NULL);
1442}
1443
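A hedged userspace sketch of driving the new prlimit64 syscall added above. It assumes the C library already exposes SYS_prlimit64; the local struct mirrors the kernel's struct rlimit64, and passing pid 0 targets the calling task, matching the find_task_by_vpid() fallback above.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>

struct rlimit64_sketch {		/* mirrors the kernel's struct rlimit64 */
	uint64_t rlim_cur;
	uint64_t rlim_max;
};

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;	/* 0 == self */
	struct rlimit64_sketch old;

	/* new_rlim == NULL: read RLIMIT_NOFILE without changing it */
	if (syscall(SYS_prlimit64, pid, RLIMIT_NOFILE, NULL, &old) < 0) {
		perror("prlimit64");
		return 1;
	}
	printf("nofile: cur=%llu max=%llu\n",
	       (unsigned long long)old.rlim_cur,
	       (unsigned long long)old.rlim_max);
	return 0;
}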
1328/* 1444/*
1329 * It would make sense to put struct rusage in the task_struct, 1445 * It would make sense to put struct rusage in the task_struct,
1330 * except that would make the task_struct be *really big*. After 1446 * except that would make the task_struct be *really big*. After
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
181 181
182/* performance counters: */ 182/* performance counters: */
183cond_syscall(sys_perf_event_open); 183cond_syscall(sys_perf_event_open);
184
185/* fanotify! */
186cond_syscall(sys_fanotify_init);
187cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..f88552c6d227 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,16 +44,17 @@
44#include <linux/times.h> 44#include <linux/times.h>
45#include <linux/limits.h> 45#include <linux/limits.h>
46#include <linux/dcache.h> 46#include <linux/dcache.h>
47#include <linux/dnotify.h>
47#include <linux/syscalls.h> 48#include <linux/syscalls.h>
48#include <linux/vmstat.h> 49#include <linux/vmstat.h>
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/acpi.h> 51#include <linux/acpi.h>
51#include <linux/reboot.h> 52#include <linux/reboot.h>
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/slow-work.h>
54#include <linux/perf_event.h> 54#include <linux/perf_event.h>
55#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/processor.h> 60#include <asm/processor.h>
@@ -76,15 +77,16 @@
76#include <scsi/sg.h> 77#include <scsi/sg.h>
77#endif 78#endif
78 79
80#ifdef CONFIG_LOCKUP_DETECTOR
81#include <linux/nmi.h>
82#endif
83
79 84
80#if defined(CONFIG_SYSCTL) 85#if defined(CONFIG_SYSCTL)
81 86
82/* External variables not in a header file. */ 87/* External variables not in a header file. */
83extern int sysctl_overcommit_memory; 88extern int sysctl_overcommit_memory;
84extern int sysctl_overcommit_ratio; 89extern int sysctl_overcommit_ratio;
85extern int sysctl_panic_on_oom;
86extern int sysctl_oom_kill_allocating_task;
87extern int sysctl_oom_dump_tasks;
88extern int max_threads; 90extern int max_threads;
89extern int core_uses_pid; 91extern int core_uses_pid;
90extern int suid_dumpable; 92extern int suid_dumpable;
@@ -106,7 +108,7 @@ extern int blk_iopoll_enabled;
106#endif 108#endif
107 109
108/* Constants used for minimum and maximum */ 110/* Constants used for minimum and maximum */
109#ifdef CONFIG_DETECT_SOFTLOCKUP 111#ifdef CONFIG_LOCKUP_DETECTOR
110static int sixty = 60; 112static int sixty = 60;
111static int neg_one = -1; 113static int neg_one = -1;
112#endif 114#endif
@@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
130 132
131static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
132 134
135#ifdef CONFIG_INOTIFY_USER
136#include <linux/inotify.h>
137#endif
133#ifdef CONFIG_SPARC 138#ifdef CONFIG_SPARC
134#include <asm/system.h> 139#include <asm/system.h>
135#endif 140#endif
@@ -206,9 +211,6 @@ static struct ctl_table fs_table[];
206static struct ctl_table debug_table[]; 211static struct ctl_table debug_table[];
207static struct ctl_table dev_table[]; 212static struct ctl_table dev_table[];
208extern struct ctl_table random_table[]; 213extern struct ctl_table random_table[];
209#ifdef CONFIG_INOTIFY_USER
210extern struct ctl_table inotify_table[];
211#endif
212#ifdef CONFIG_EPOLL 214#ifdef CONFIG_EPOLL
213extern struct ctl_table epoll_table[]; 215extern struct ctl_table epoll_table[];
214#endif 216#endif
@@ -562,7 +564,7 @@ static struct ctl_table kern_table[] = {
562 .extra2 = &one, 564 .extra2 = &one,
563 }, 565 },
564#endif 566#endif
565#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 567#ifdef CONFIG_HOTPLUG
566 { 568 {
567 .procname = "hotplug", 569 .procname = "hotplug",
568 .data = &uevent_helper, 570 .data = &uevent_helper,
@@ -710,7 +712,34 @@ static struct ctl_table kern_table[] = {
710 .mode = 0444, 712 .mode = 0444,
711 .proc_handler = proc_dointvec, 713 .proc_handler = proc_dointvec,
712 }, 714 },
713#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 715#if defined(CONFIG_LOCKUP_DETECTOR)
716 {
717 .procname = "watchdog",
718 .data = &watchdog_enabled,
719 .maxlen = sizeof (int),
720 .mode = 0644,
721 .proc_handler = proc_dowatchdog_enabled,
722 },
723 {
724 .procname = "watchdog_thresh",
725 .data = &softlockup_thresh,
726 .maxlen = sizeof(int),
727 .mode = 0644,
728 .proc_handler = proc_dowatchdog_thresh,
729 .extra1 = &neg_one,
730 .extra2 = &sixty,
731 },
732 {
733 .procname = "softlockup_panic",
734 .data = &softlockup_panic,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = proc_dointvec_minmax,
738 .extra1 = &zero,
739 .extra2 = &one,
740 },
741#endif
742#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
714 { 743 {
715 .procname = "unknown_nmi_panic", 744 .procname = "unknown_nmi_panic",
716 .data = &unknown_nmi_panic, 745 .data = &unknown_nmi_panic,
@@ -813,26 +842,6 @@ static struct ctl_table kern_table[] = {
813 .proc_handler = proc_dointvec, 842 .proc_handler = proc_dointvec,
814 }, 843 },
815#endif 844#endif
816#ifdef CONFIG_DETECT_SOFTLOCKUP
817 {
818 .procname = "softlockup_panic",
819 .data = &softlockup_panic,
820 .maxlen = sizeof(int),
821 .mode = 0644,
822 .proc_handler = proc_dointvec_minmax,
823 .extra1 = &zero,
824 .extra2 = &one,
825 },
826 {
827 .procname = "softlockup_thresh",
828 .data = &softlockup_thresh,
829 .maxlen = sizeof(int),
830 .mode = 0644,
831 .proc_handler = proc_dosoftlockup_thresh,
832 .extra1 = &neg_one,
833 .extra2 = &sixty,
834 },
835#endif
836#ifdef CONFIG_DETECT_HUNG_TASK 845#ifdef CONFIG_DETECT_HUNG_TASK
837 { 846 {
838 .procname = "hung_task_panic", 847 .procname = "hung_task_panic",
@@ -906,13 +915,6 @@ static struct ctl_table kern_table[] = {
906 .proc_handler = proc_dointvec, 915 .proc_handler = proc_dointvec,
907 }, 916 },
908#endif 917#endif
909#ifdef CONFIG_SLOW_WORK
910 {
911 .procname = "slow-work",
912 .mode = 0555,
913 .child = slow_work_sysctls,
914 },
915#endif
916#ifdef CONFIG_PERF_EVENTS 918#ifdef CONFIG_PERF_EVENTS
917 { 919 {
918 .procname = "perf_event_paranoid", 920 .procname = "perf_event_paranoid",
@@ -1711,10 +1713,7 @@ static __init int sysctl_init(void)
1711{ 1713{
1712 sysctl_set_parent(NULL, root_table); 1714 sysctl_set_parent(NULL, root_table);
1713#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1715#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1714 { 1716 sysctl_check_table(current->nsproxy, root_table);
1715 int err;
1716 err = sysctl_check_table(current->nsproxy, root_table);
1717 }
1718#endif 1717#endif
1719 return 0; 1718 return 0;
1720} 1719}
diff --git a/kernel/time.c b/kernel/time.c
index 848b1c2ab09a..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
300} 300}
301EXPORT_SYMBOL(timespec_trunc); 301EXPORT_SYMBOL(timespec_trunc);
302 302
303#ifndef CONFIG_GENERIC_TIME
304/*
305 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
306 * and therefore only yields usec accuracy
307 */
308void getnstimeofday(struct timespec *tv)
309{
310 struct timeval x;
311
312 do_gettimeofday(&x);
313 tv->tv_sec = x.tv_sec;
314 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
315}
316EXPORT_SYMBOL_GPL(getnstimeofday);
317#endif
318
319/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 303/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
320 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 304 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
321 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 305 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
6 6
7config NO_HZ 7config NO_HZ
8 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
9 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 9 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
10 select TICK_ONESHOT 10 select TICK_ONESHOT
11 help 11 help
12 This option enables a tickless system: timer interrupts will 12 This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
15 15
16config HIGH_RES_TIMERS 16config HIGH_RES_TIMERS
17 bool "High Resolution Timer Support" 17 bool "High Resolution Timer Support"
18 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS 18 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
19 select TICK_ONESHOT 19 select TICK_ONESHOT
20 help 20 help
21 This option enables high resolution timer support. If your 21 This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 return max_nsecs - (max_nsecs >> 5); 531 return max_nsecs - (max_nsecs >> 5);
532} 532}
533 533
534#ifdef CONFIG_GENERIC_TIME 534#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
535 535
536/** 536/**
537 * clocksource_select - Select the best clocksource available 537 * clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
577 } 577 }
578} 578}
579 579
580#else /* CONFIG_GENERIC_TIME */ 580#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
581 581
582static inline void clocksource_select(void) { } 582static inline void clocksource_select(void) { }
583 583
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
639#define MAX_UPDATE_LENGTH 5 /* Seconds */ 639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640 640
641/** 641/**
642 * __clocksource_register_scale - Used to install new clocksources	642 * __clocksource_updatefreq_scale - Used to update a clocksource with a new freq
643 * @t: clocksource to be registered 643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz 644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale 645 * @freq: clocksource frequency (cycles per second) divided by scale
646 * 646 *
647 * Returns -EBUSY if registration fails, zero otherwise. 647 * This should only be called from the clocksource->enable() method.
648 * 648 *
649 * This *SHOULD NOT* be called directly! Please use the 649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions. 650 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
651 */ 651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{ 653{
654
655 /* 654 /*
656 * Ideally we want to use some of the limits used in 655 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed 656 * clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
662 NSEC_PER_SEC/scale, 661 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale); 662 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 663 cs->max_idle_ns = clocksource_max_deferment(cs);
664}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
666
667/**
668 * __clocksource_register_scale - Used to install new clocksources
669 * @t: clocksource to be registered
670 * @scale: Scale factor multiplied against freq to get clocksource hz
671 * @freq: clocksource frequency (cycles per second) divided by scale
672 *
673 * Returns -EBUSY if registration fails, zero otherwise.
674 *
675 * This *SHOULD NOT* be called directly! Please use the
676 * clocksource_register_hz() or clocksource_register_khz helper functions.
677 */
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{
680
681	/* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq);
665 683
 684	/* Add clocksource to the clocksource list */
666 mutex_lock(&clocksource_mutex); 685 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs); 686 clocksource_enqueue(cs);
668 clocksource_select(); 687 clocksource_select();
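A driver-side sketch of the split introduced above, with hypothetical my_* hardware helpers: registration still goes through clocksource_register_hz(), while a rate change discovered later is fed through the new update path from enable() rather than by re-registering.

#include <linux/clocksource.h>
#include <linux/init.h>

static u32 my_read_counter(void);	/* hypothetical hardware accessors */
static u32 my_query_rate(void);

static cycle_t my_cs_read(struct clocksource *cs)
{
	return (cycle_t)my_read_counter();
}

static int my_cs_enable(struct clocksource *cs)
{
	/* the input clock may have been reprogrammed while we were disabled;
	 * recompute mult/shift from enable(), as the kerneldoc above directs
	 * (the hz/khz wrappers it names boil down to this call) */
	__clocksource_updatefreq_scale(cs, 1, my_query_rate());
	return 0;
}

static struct clocksource my_cs = {
	.name	= "my-timer",
	.rating	= 300,
	.read	= my_cs_read,
	.enable	= my_cs_enable,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init my_cs_init(void)
{
	/* initialises mult/shift and max_idle_ns, then enqueues and selects */
	return clocksource_register_hz(&my_cs, 24000000);	/* 24 MHz input */
}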
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..3e216e01bbd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
325 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
326 326
327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { 328 arch_needs_cpu(cpu)) {
329 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
330 delta_jiffies = 1; 330 delta_jiffies = 1;
331 } else { 331 } else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
405 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
409 /*
410 * sched tick not stopped!
411 */
412 cpumask_clear_cpu(cpu, nohz_cpu_mask);
413 goto out;
414 }
415 409
416 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
417 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void)
780{ 774{
781 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 775 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
782 ktime_t now = ktime_get(); 776 ktime_t now = ktime_get();
783 u64 offset;
784 777
785 /* 778 /*
786 * Emulate tick processing via per-CPU hrtimers: 779 * Emulate tick processing via per-CPU hrtimers:
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void)
790 783
791 /* Get the next period (per cpu) */ 784 /* Get the next period (per cpu) */
792 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 785 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
793 offset = ktime_to_ns(tick_period) >> 1;
794 do_div(offset, num_possible_cpus());
795 offset *= smp_processor_id();
796 hrtimer_add_expires_ns(&ts->sched_timer, offset);
797 786
798 for (;;) { 787 for (;;) {
799 hrtimer_forward(&ts->sched_timer, now, tick_period); 788 hrtimer_forward(&ts->sched_timer, now, tick_period);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..49010d822f72 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
153 * - wall_to_monotonic is no longer the boot time, getboottime must be 153 * - wall_to_monotonic is no longer the boot time, getboottime must be
154 * used instead. 154 * used instead.
155 */ 155 */
156struct timespec xtime __attribute__ ((aligned (16))); 156static struct timespec xtime __attribute__ ((aligned (16)));
157struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
158static struct timespec total_sleep_time; 158static struct timespec total_sleep_time;
159 159
160/* 160/*
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond)
170{ 170{
171 xtime.tv_sec += leapsecond; 171 xtime.tv_sec += leapsecond;
172 wall_to_monotonic.tv_sec -= leapsecond; 172 wall_to_monotonic.tv_sec -= leapsecond;
173 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 173 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
174 timekeeper.mult);
174} 175}
175 176
176#ifdef CONFIG_GENERIC_TIME
177
178/** 177/**
179 * timekeeping_forward_now - update clock to the current time 178 * timekeeping_forward_now - update clock to the current time
180 * 179 *
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
328 timekeeper.ntp_error = 0; 327 timekeeper.ntp_error = 0;
329 ntp_clear(); 328 ntp_clear();
330 329
331 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 330 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
331 timekeeper.mult);
332 332
333 write_sequnlock_irqrestore(&xtime_lock, flags); 333 write_sequnlock_irqrestore(&xtime_lock, flags);
334 334
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
376 tick_clock_notify(); 376 tick_clock_notify();
377} 377}
378 378
379#else /* GENERIC_TIME */
380
381static inline void timekeeping_forward_now(void) { }
382
383/**
384 * ktime_get - get the monotonic time in ktime_t format
385 *
386 * returns the time in ktime_t format
387 */
388ktime_t ktime_get(void)
389{
390 struct timespec now;
391
392 ktime_get_ts(&now);
393
394 return timespec_to_ktime(now);
395}
396EXPORT_SYMBOL_GPL(ktime_get);
397
398/**
399 * ktime_get_ts - get the monotonic clock in timespec format
400 * @ts: pointer to timespec variable
401 *
402 * The function calculates the monotonic clock from the realtime
403 * clock and the wall_to_monotonic offset and stores the result
404 * in normalized timespec format in the variable pointed to by @ts.
405 */
406void ktime_get_ts(struct timespec *ts)
407{
408 struct timespec tomono;
409 unsigned long seq;
410
411 do {
412 seq = read_seqbegin(&xtime_lock);
413 getnstimeofday(ts);
414 tomono = wall_to_monotonic;
415
416 } while (read_seqretry(&xtime_lock, seq));
417
418 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
419 ts->tv_nsec + tomono.tv_nsec);
420}
421EXPORT_SYMBOL_GPL(ktime_get_ts);
422
423#endif /* !GENERIC_TIME */
424
425/** 379/**
426 * ktime_get_real - get the real (wall-) time in ktime_t format 380 * ktime_get_real - get the real (wall-) time in ktime_t format
427 * 381 *
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev)
579 533
580 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
581 ts = timespec_sub(ts, timekeeping_suspend_time); 535 ts = timespec_sub(ts, timekeeping_suspend_time);
582 xtime = timespec_add_safe(xtime, ts); 536 xtime = timespec_add(xtime, ts);
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 538 total_sleep_time = timespec_add(total_sleep_time, ts);
585 } 539 }
586 /* re-base the last cycle value */ 540 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset)
736static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 690static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
737{ 691{
738 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 692 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
693 u64 raw_nsecs;
739 694
740	/* If the offset is smaller than a shifted interval, do nothing */	695
741 if (offset < timekeeper.cycle_interval<<shift) 696 if (offset < timekeeper.cycle_interval<<shift)
@@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
752 second_overflow(); 707 second_overflow();
753 } 708 }
754 709
755 /* Accumulate into raw time */ 710 /* Accumulate raw time */
756 raw_time.tv_nsec += timekeeper.raw_interval << shift;; 711 raw_nsecs = timekeeper.raw_interval << shift;
757 while (raw_time.tv_nsec >= NSEC_PER_SEC) { 712 raw_nsecs += raw_time.tv_nsec;
758 raw_time.tv_nsec -= NSEC_PER_SEC; 713 if (raw_nsecs >= NSEC_PER_SEC) {
759 raw_time.tv_sec++; 714 u64 raw_secs = raw_nsecs;
715 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
716 raw_time.tv_sec += raw_secs;
760 } 717 }
718 raw_time.tv_nsec = raw_nsecs;
761 719
762 /* Accumulate error between NTP and clock interval */ 720 /* Accumulate error between NTP and clock interval */
763 timekeeper.ntp_error += tick_length << shift; 721 timekeeper.ntp_error += tick_length << shift;
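A worked pass through the new accumulation above, with illustrative numbers: if raw_time.tv_nsec was 900,000,000 and the shifted interval contributes another 2,350,000,000 ns, then raw_nsecs = 3,250,000,000 and the single do_div() yields

	raw_secs  = 3			(quotient of 3,250,000,000 / NSEC_PER_SEC)
	raw_nsecs = 250,000,000		(remainder)

so tv_sec advances by three and tv_nsec lands at 250,000,000, the same result the removed while-loop produced one second at a time.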
@@ -784,10 +742,11 @@ void update_wall_time(void)
784 return; 742 return;
785 743
786 clock = timekeeper.clock; 744 clock = timekeeper.clock;
787#ifdef CONFIG_GENERIC_TIME 745
788 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 746#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
789#else
790 offset = timekeeper.cycle_interval; 747 offset = timekeeper.cycle_interval;
748#else
749 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
791#endif 750#endif
792 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 751 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
793 752
@@ -856,7 +815,8 @@ void update_wall_time(void)
856 } 815 }
857 816
858 /* check to see if there is a new clocksource to use */ 817 /* check to see if there is a new clocksource to use */
859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 818 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
819 timekeeper.mult);
860} 820}
861 821
862/** 822/**
@@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime);
887 */ 847 */
888void monotonic_to_bootbased(struct timespec *ts) 848void monotonic_to_bootbased(struct timespec *ts)
889{ 849{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 850 *ts = timespec_add(*ts, total_sleep_time);
891} 851}
892EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 852EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
893 853
@@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void)
902 return xtime; 862 return xtime;
903} 863}
904 864
865struct timespec __get_wall_to_monotonic(void)
866{
867 return wall_to_monotonic;
868}
869
905struct timespec current_kernel_time(void) 870struct timespec current_kernel_time(void)
906{ 871{
907 struct timespec now; 872 struct timespec now;
diff --git a/kernel/timer.c b/kernel/timer.c
index efde11e197c4..97bf05baade7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/* 91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of 92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB for 93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * the new flag to indicate whether the timer is deferrable 94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
95 */ 100 */
96#define TBASE_DEFERRABLE_FLAG (0x1) 101#define TBASE_DEFERRABLE_FLAG (0x1)
97 102
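The expanded comment documents the pointer-tagging trick behind TBASE_DEFERRABLE_FLAG: because every tvec_base is at least 2-byte aligned, the low bit of the stored pointer is always zero and can carry the deferrable flag. A generic sketch of that idiom, with illustrative names rather than the in-tree tbase_*() helpers:

#define DEFERRABLE_FLAG		0x1UL

/* Tag an aligned pointer: the low bit is free because of the alignment. */
static inline void *tag_deferrable(void *base)
{
	return (void *)((unsigned long)base | DEFERRABLE_FLAG);
}

static inline int is_deferrable(const void *tagged)
{
	return (unsigned long)tagged & DEFERRABLE_FLAG;
}

/* Strip the tag before dereferencing. */
static inline void *untag_base(const void *tagged)
{
	return (void *)((unsigned long)tagged & ~DEFERRABLE_FLAG);
}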
@@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 326
322/** 327/**
323 * set_timer_slack - set the allowed slack for a timer 328 * set_timer_slack - set the allowed slack for a timer
329 * @timer: the timer to be modified
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding 330 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 * 331 *
326 * Set the amount of time, in jiffies, that a certain timer has 332 * Set the amount of time, in jiffies, that a certain timer has
@@ -692,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
692 cpu = smp_processor_id(); 698 cpu = smp_processor_id();
693 699
694#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 700#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
695 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 701 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
696 int preferred_cpu = get_nohz_load_balancer(); 702 cpu = get_nohz_timer_target();
697
698 if (preferred_cpu >= 0)
699 cpu = preferred_cpu;
700 }
701#endif 703#endif
702 new_base = per_cpu(tvec_bases, cpu); 704 new_base = per_cpu(tvec_bases, cpu);
703 705
@@ -1302,7 +1304,6 @@ void run_local_timers(void)
1302{ 1304{
1303 hrtimer_run_queues(); 1305 hrtimer_run_queues();
1304 raise_softirq(TIMER_SOFTIRQ); 1306 raise_softirq(TIMER_SOFTIRQ);
1305 softlockup_tick();
1306} 1307}
1307 1308
1308/* 1309/*
@@ -1763,3 +1764,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
1763} 1764}
1764 1765
1765EXPORT_SYMBOL(msleep_interruptible); 1766EXPORT_SYMBOL(msleep_interruptible);
1767
1768static int __sched do_usleep_range(unsigned long min, unsigned long max)
1769{
1770 ktime_t kmin;
1771 unsigned long delta;
1772
1773 kmin = ktime_set(0, min * NSEC_PER_USEC);
1774 delta = (max - min) * NSEC_PER_USEC;
1775 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1776}
1777
1778/**
1779 * usleep_range - Drop in replacement for udelay where wakeup is flexible
1780 * @min: Minimum time in usecs to sleep
1781 * @max: Maximum time in usecs to sleep
1782 */
1783void usleep_range(unsigned long min, unsigned long max)
1784{
1785 __set_current_state(TASK_UNINTERRUPTIBLE);
1786 do_usleep_range(min, max);
1787}
1788EXPORT_SYMBOL(usleep_range);
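usleep_range() gives callers in process context a sleeping alternative to udelay() when the exact wakeup time does not matter: any wakeup in [min, max] is acceptable, which lets the hrtimer core coalesce it with other pending timers. A hypothetical polling loop using it (the register layout and READY bit are made up for the example):

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/io.h>

/* Poll a hypothetical status register, sleeping 100-200us between reads. */
static int wait_for_ready(void __iomem *status_reg)
{
	int tries;

	for (tries = 0; tries < 50; tries++) {
		if (readl(status_reg) & 0x1)	/* hypothetical READY bit */
			return 0;
		usleep_range(100, 200);		/* may sleep: process context only */
	}
	return -ETIMEDOUT;
}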
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..538501c6ea50 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -153,7 +153,7 @@ config IRQSOFF_TRACER
153 bool "Interrupts-off Latency Tracer" 153 bool "Interrupts-off Latency Tracer"
154 default n 154 default n
155 depends on TRACE_IRQFLAGS_SUPPORT 155 depends on TRACE_IRQFLAGS_SUPPORT
156 depends on GENERIC_TIME 156 depends on !ARCH_USES_GETTIMEOFFSET
157 select TRACE_IRQFLAGS 157 select TRACE_IRQFLAGS
158 select GENERIC_TRACER 158 select GENERIC_TRACER
159 select TRACER_MAX_TRACE 159 select TRACER_MAX_TRACE
@@ -175,7 +175,7 @@ config IRQSOFF_TRACER
175config PREEMPT_TRACER 175config PREEMPT_TRACER
176 bool "Preemption-off Latency Tracer" 176 bool "Preemption-off Latency Tracer"
177 default n 177 default n
178 depends on GENERIC_TIME 178 depends on !ARCH_USES_GETTIMEOFFSET
179 depends on PREEMPT 179 depends on PREEMPT
180 select GENERIC_TRACER 180 select GENERIC_TRACER
181 select TRACER_MAX_TRACE 181 select TRACER_MAX_TRACE
@@ -194,15 +194,6 @@ config PREEMPT_TRACER
194 enabled. This option and the irqs-off timing option can be 194 enabled. This option and the irqs-off timing option can be
195 used together or separately.) 195 used together or separately.)
196 196
197config SYSPROF_TRACER
198 bool "Sysprof Tracer"
199 depends on X86
200 select GENERIC_TRACER
201 select CONTEXT_SWITCH_TRACER
202 help
203 This tracer provides the trace needed by the 'Sysprof' userspace
204 tool.
205
206config SCHED_TRACER 197config SCHED_TRACER
207 bool "Scheduling Latency Tracer" 198 bool "Scheduling Latency Tracer"
208 select GENERIC_TRACER 199 select GENERIC_TRACER
@@ -229,23 +220,6 @@ config FTRACE_SYSCALLS
229 help 220 help
230 Basic tracer to catch the syscall entry and exit events. 221 Basic tracer to catch the syscall entry and exit events.
231 222
232config BOOT_TRACER
233 bool "Trace boot initcalls"
234 select GENERIC_TRACER
235 select CONTEXT_SWITCH_TRACER
236 help
237 This tracer helps developers to optimize boot times: it records
238 the timings of the initcalls and traces key events and the identity
239 of tasks that can cause boot delays, such as context-switches.
240
241 Its aim is to be parsed by the scripts/bootgraph.pl tool to
242 produce pretty graphics about boot inefficiencies, giving a visual
243 representation of the delays during initcalls - but the raw
244 /debug/tracing/trace text output is readable too.
245
246 You must pass in initcall_debug and ftrace=initcall to the kernel
247 command line to enable this on bootup.
248
249config TRACE_BRANCH_PROFILING 223config TRACE_BRANCH_PROFILING
250 bool 224 bool
251 select GENERIC_TRACER 225 select GENERIC_TRACER
@@ -325,28 +299,6 @@ config BRANCH_TRACER
325 299
326 Say N if unsure. 300 Say N if unsure.
327 301
328config KSYM_TRACER
329 bool "Trace read and write access on kernel memory locations"
330 depends on HAVE_HW_BREAKPOINT
331 select TRACING
332 help
333 This tracer helps find read and write operations on any given kernel
334 symbol i.e. /proc/kallsyms.
335
336config PROFILE_KSYM_TRACER
337 bool "Profile all kernel memory accesses on 'watched' variables"
338 depends on KSYM_TRACER
339 help
340 This tracer profiles kernel accesses on variables watched through the
341 ksym tracer ftrace plugin. Depending upon the hardware, all read
342 and write operations on kernel variables can be monitored for
343 accesses.
344
345 The results will be displayed in:
346 /debugfs/tracing/profile_ksym
347
348 Say N if unsure.
349
350config STACK_TRACER 302config STACK_TRACER
351 bool "Trace max stack" 303 bool "Trace max stack"
352 depends on HAVE_FUNCTION_TRACER 304 depends on HAVE_FUNCTION_TRACER
@@ -371,37 +323,6 @@ config STACK_TRACER
371 323
372 Say N if unsure. 324 Say N if unsure.
373 325
374config KMEMTRACE
375 bool "Trace SLAB allocations"
376 select GENERIC_TRACER
377 help
378 kmemtrace provides tracing for slab allocator functions, such as
379 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
380 data is then fed to the userspace application in order to analyse
381 allocation hotspots, internal fragmentation and so on, making it
382 possible to see how well an allocator performs, as well as debug
383 and profile kernel code.
384
 385 This requires a userspace application to use. See
386 Documentation/trace/kmemtrace.txt for more information.
387
388 Saying Y will make the kernel somewhat larger and slower. However,
389 if you disable kmemtrace at run-time or boot-time, the performance
390 impact is minimal (depending on the arch the kernel is built for).
391
392 If unsure, say N.
393
394config WORKQUEUE_TRACER
395 bool "Trace workqueues"
396 select GENERIC_TRACER
397 help
398 The workqueue tracer provides some statistical information
399 about each cpu workqueue thread such as the number of the
400 works inserted and executed since their creation. It can help
401 to evaluate the amount of work each of them has to perform.
402 For example it can help a developer to decide whether he should
403 choose a per-cpu workqueue instead of a singlethreaded one.
404
405config BLK_DEV_IO_TRACE 326config BLK_DEV_IO_TRACE
406 bool "Support for tracing block IO actions" 327 bool "Support for tracing block IO actions"
407 depends on SYSFS 328 depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
30obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
31obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
33obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
34obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 33obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
35obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 34obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
36obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 35obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
38obj-$(CONFIG_NOP_TRACER) += trace_nop.o 37obj-$(CONFIG_NOP_TRACER) += trace_nop.o
39obj-$(CONFIG_STACK_TRACER) += trace_stack.o 38obj-$(CONFIG_STACK_TRACER) += trace_stack.o
40obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
47ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif 52endif
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
59obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_EVENT_TRACING) += power-traces.o
56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif
60 59
61libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 638711c17504..959f8d6c8cc1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 170 BLK_TC_ACT(BLK_TC_WRITE) };
171 171
172#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
173#define BLK_TC_RAHEAD BLK_TC_AHEAD
174
172/* The ilog2() calls fall out because they're constant */ 175/* The ilog2() calls fall out because they're constant */
173#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ 176#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
174 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) 177 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
175 178
176/* 179/*
177 * The worker for the various blk_add_trace*() types. Fills out a 180 * The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
194 return; 197 return;
195 198
196 what |= ddir_act[rw & WRITE]; 199 what |= ddir_act[rw & WRITE];
197 what |= MASK_TC_BIT(rw, BARRIER); 200 what |= MASK_TC_BIT(rw, HARDBARRIER);
198 what |= MASK_TC_BIT(rw, SYNCIO); 201 what |= MASK_TC_BIT(rw, SYNC);
199 what |= MASK_TC_BIT(rw, AHEAD); 202 what |= MASK_TC_BIT(rw, RAHEAD);
200 what |= MASK_TC_BIT(rw, META); 203 what |= MASK_TC_BIT(rw, META);
201 what |= MASK_TC_BIT(rw, DISCARD); 204 what |= MASK_TC_BIT(rw, DISCARD);
202 205
@@ -549,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
549} 552}
550EXPORT_SYMBOL_GPL(blk_trace_setup); 553EXPORT_SYMBOL_GPL(blk_trace_setup);
551 554
555#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
556static int compat_blk_trace_setup(struct request_queue *q, char *name,
557 dev_t dev, struct block_device *bdev,
558 char __user *arg)
559{
560 struct blk_user_trace_setup buts;
561 struct compat_blk_user_trace_setup cbuts;
562 int ret;
563
564 if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
565 return -EFAULT;
566
567 buts = (struct blk_user_trace_setup) {
568 .act_mask = cbuts.act_mask,
569 .buf_size = cbuts.buf_size,
570 .buf_nr = cbuts.buf_nr,
571 .start_lba = cbuts.start_lba,
572 .end_lba = cbuts.end_lba,
573 .pid = cbuts.pid,
574 };
575 memcpy(&buts.name, &cbuts.name, 32);
576
577 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
578 if (ret)
579 return ret;
580
581 if (copy_to_user(arg, &buts.name, 32)) {
582 blk_trace_remove(q);
583 return -EFAULT;
584 }
585
586 return 0;
587}
588#endif
589
552int blk_trace_startstop(struct request_queue *q, int start) 590int blk_trace_startstop(struct request_queue *q, int start)
553{ 591{
554 int ret; 592 int ret;
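compat_blk_trace_setup() above follows the usual compat-ioctl recipe: copy the 32-bit layout from userspace, widen it field by field into the native structure, then reuse the normal setup path. A reduced sketch of that widening step, with hypothetical structures standing in for blk_user_trace_setup and its compat twin:

#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

struct native_args {			/* hypothetical native layout */
	u64 start_lba;
	u64 end_lba;
	u32 pid;
};

struct compat_args {			/* hypothetical 32-bit layout */
	compat_u64 start_lba;
	compat_u64 end_lba;
	compat_u32 pid;
};

static int widen_args(struct native_args *out, const void __user *arg)
{
	struct compat_args c;

	if (copy_from_user(&c, arg, sizeof(c)))
		return -EFAULT;

	*out = (struct native_args) {
		.start_lba = c.start_lba,
		.end_lba   = c.end_lba,
		.pid       = c.pid,
	};
	return 0;
}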
@@ -601,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
601 if (!q) 639 if (!q)
602 return -ENXIO; 640 return -ENXIO;
603 641
642 lock_kernel();
604 mutex_lock(&bdev->bd_mutex); 643 mutex_lock(&bdev->bd_mutex);
605 644
606 switch (cmd) { 645 switch (cmd) {
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
608 bdevname(bdev, b); 647 bdevname(bdev, b);
609 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); 648 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
610 break; 649 break;
650#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
651 case BLKTRACESETUP32:
652 bdevname(bdev, b);
653 ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
654 break;
655#endif
611 case BLKTRACESTART: 656 case BLKTRACESTART:
612 start = 1; 657 start = 1;
613 case BLKTRACESTOP: 658 case BLKTRACESTOP:
@@ -622,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
622 } 667 }
623 668
624 mutex_unlock(&bdev->bd_mutex); 669 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
625 return ret; 671 return ret;
626} 672}
627 673
@@ -661,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
661 if (likely(!bt)) 707 if (likely(!bt))
662 return; 708 return;
663 709
664 if (blk_discard_rq(rq)) 710 if (rq->cmd_flags & REQ_DISCARD)
665 rw |= (1 << BIO_RW_DISCARD); 711 rw |= REQ_DISCARD;
712
713 if (rq->cmd_flags & REQ_SECURE)
714 rw |= REQ_SECURE;
666 715
667 if (blk_pc_request(rq)) { 716 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
668 what |= BLK_TC_ACT(BLK_TC_PC); 717 what |= BLK_TC_ACT(BLK_TC_PC);
669 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 718 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
670 what, rq->errors, rq->cmd_len, rq->cmd); 719 what, rq->errors, rq->cmd_len, rq->cmd);
@@ -925,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q,
925 if (likely(!bt)) 974 if (likely(!bt))
926 return; 975 return;
927 976
928 if (blk_pc_request(rq)) 977 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
929 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 978 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
930 BLK_TA_DRV_DATA, rq->errors, len, data); 979 BLK_TA_DRV_DATA, rq->errors, len, data);
931 else 980 else
@@ -1730,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
1730 int len = rq->cmd_len; 1779 int len = rq->cmd_len;
1731 unsigned char *cmd = rq->cmd; 1780 unsigned char *cmd = rq->cmd;
1732 1781
1733 if (!blk_pc_request(rq)) { 1782 if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
1734 buf[0] = '\0'; 1783 buf[0] = '\0';
1735 return; 1784 return;
1736 } 1785 }
@@ -1755,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1755 1804
1756 if (rw & WRITE) 1805 if (rw & WRITE)
1757 rwbs[i++] = 'W'; 1806 rwbs[i++] = 'W';
1758 else if (rw & 1 << BIO_RW_DISCARD) 1807 else if (rw & REQ_DISCARD)
1759 rwbs[i++] = 'D'; 1808 rwbs[i++] = 'D';
1760 else if (bytes) 1809 else if (bytes)
1761 rwbs[i++] = 'R'; 1810 rwbs[i++] = 'R';
1762 else 1811 else
1763 rwbs[i++] = 'N'; 1812 rwbs[i++] = 'N';
1764 1813
1765 if (rw & 1 << BIO_RW_AHEAD) 1814 if (rw & REQ_RAHEAD)
1766 rwbs[i++] = 'A'; 1815 rwbs[i++] = 'A';
1767 if (rw & 1 << BIO_RW_BARRIER) 1816 if (rw & REQ_HARDBARRIER)
1768 rwbs[i++] = 'B'; 1817 rwbs[i++] = 'B';
1769 if (rw & 1 << BIO_RW_SYNCIO) 1818 if (rw & REQ_SYNC)
1770 rwbs[i++] = 'S'; 1819 rwbs[i++] = 'S';
1771 if (rw & 1 << BIO_RW_META) 1820 if (rw & REQ_META)
1772 rwbs[i++] = 'M'; 1821 rwbs[i++] = 'M';
1822 if (rw & REQ_SECURE)
1823 rwbs[i++] = 'E';
1773 1824
1774 rwbs[i] = '\0'; 1825 rwbs[i] = '\0';
1775} 1826}
@@ -1779,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1779 int rw = rq->cmd_flags & 0x03; 1830 int rw = rq->cmd_flags & 0x03;
1780 int bytes; 1831 int bytes;
1781 1832
1782 if (blk_discard_rq(rq)) 1833 if (rq->cmd_flags & REQ_DISCARD)
1783 rw |= (1 << BIO_RW_DISCARD); 1834 rw |= REQ_DISCARD;
1835
1836 if (rq->cmd_flags & REQ_SECURE)
1837 rw |= REQ_SECURE;
1784 1838
1785 bytes = blk_rq_bytes(rq); 1839 bytes = blk_rq_bytes(rq);
1786 1840
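With the switch from BIO_RW_* bit numbers to REQ_* flag masks, blk_fill_rwbs() now tests cmd_flags directly and gains an 'E' for secure discard. A minimal sketch of decoding a flags word into the conventional rwbs string (a sketch only; it assumes the blk_fill_rwbs() declaration from blktrace_api.h is available, and the buffer is sized generously):

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/kernel.h>

/* Turn request flags into the trace notation, e.g. "WSM" or "RA". */
static void show_rw_flags(u32 rw, int bytes)
{
	char rwbs[8];			/* direction + modifier letters + NUL */

	blk_fill_rwbs(rwbs, rw, bytes);
	pr_info("request flags: %s\n", rwbs);
}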
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..fa7ece649fe1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
381{ 381{
382 struct ftrace_profile *rec = v; 382 struct ftrace_profile *rec = v;
383 char str[KSYM_SYMBOL_LEN]; 383 char str[KSYM_SYMBOL_LEN];
384 int ret = 0;
384#ifdef CONFIG_FUNCTION_GRAPH_TRACER 385#ifdef CONFIG_FUNCTION_GRAPH_TRACER
385 static DEFINE_MUTEX(mutex);
386 static struct trace_seq s; 386 static struct trace_seq s;
387 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev; 388 unsigned long long stddev;
389#endif 389#endif
390 mutex_lock(&ftrace_profile_lock);
391
392 /* we raced with function_profile_reset() */
393 if (unlikely(rec->counter == 0)) {
394 ret = -EBUSY;
395 goto out;
396 }
390 397
391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 398 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
392 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 399 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v)
408 do_div(stddev, (rec->counter - 1) * 1000); 415 do_div(stddev, (rec->counter - 1) * 1000);
409 } 416 }
410 417
411 mutex_lock(&mutex);
412 trace_seq_init(&s); 418 trace_seq_init(&s);
413 trace_print_graph_duration(rec->time, &s); 419 trace_print_graph_duration(rec->time, &s);
414 trace_seq_puts(&s, " "); 420 trace_seq_puts(&s, " ");
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v)
416 trace_seq_puts(&s, " "); 422 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s); 423 trace_print_graph_duration(stddev, &s);
418 trace_print_seq(m, &s); 424 trace_print_seq(m, &s);
419 mutex_unlock(&mutex);
420#endif 425#endif
421 seq_putc(m, '\n'); 426 seq_putc(m, '\n');
427out:
428 mutex_unlock(&ftrace_profile_lock);
422 429
423 return 0; 430 return ret;
424} 431}
425 432
426static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 433static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -1503,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1503 if (*pos > 0) 1510 if (*pos > 0)
1504 return t_hash_start(m, pos); 1511 return t_hash_start(m, pos);
1505 iter->flags |= FTRACE_ITER_PRINTALL; 1512 iter->flags |= FTRACE_ITER_PRINTALL;
1513 /* reset in case of seek/pread */
1514 iter->flags &= ~FTRACE_ITER_HASH;
1506 return iter; 1515 return iter;
1507 } 1516 }
1508 1517
@@ -1883,7 +1892,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1883 struct hlist_head *hhd; 1892 struct hlist_head *hhd;
1884 struct hlist_node *n; 1893 struct hlist_node *n;
1885 unsigned long key; 1894 unsigned long key;
1886 int resched;
1887 1895
1888 key = hash_long(ip, FTRACE_HASH_BITS); 1896 key = hash_long(ip, FTRACE_HASH_BITS);
1889 1897
@@ -1897,12 +1905,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1897 * period. This syncs the hash iteration and freeing of items 1905 * period. This syncs the hash iteration and freeing of items
1898 * on the hash. rcu_read_lock is too dangerous here. 1906 * on the hash. rcu_read_lock is too dangerous here.
1899 */ 1907 */
1900 resched = ftrace_preempt_disable(); 1908 preempt_disable_notrace();
1901 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1909 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1902 if (entry->ip == ip) 1910 if (entry->ip == ip)
1903 entry->ops->func(ip, parent_ip, &entry->data); 1911 entry->ops->func(ip, parent_ip, &entry->data);
1904 } 1912 }
1905 ftrace_preempt_enable(resched); 1913 preempt_enable_notrace();
1906} 1914}
1907 1915
1908static struct ftrace_ops trace_probe_ops __read_mostly = 1916static struct ftrace_ops trace_probe_ops __read_mostly =
@@ -2410,7 +2418,7 @@ static const struct file_operations ftrace_filter_fops = {
2410 .open = ftrace_filter_open, 2418 .open = ftrace_filter_open,
2411 .read = seq_read, 2419 .read = seq_read,
2412 .write = ftrace_filter_write, 2420 .write = ftrace_filter_write,
2413 .llseek = ftrace_regex_lseek, 2421 .llseek = no_llseek,
2414 .release = ftrace_filter_release, 2422 .release = ftrace_filter_release,
2415}; 2423};
2416 2424
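Several call sites here (and in ring_buffer.c below) replace the old ftrace_preempt_disable()/ftrace_preempt_enable() pair, which had to remember whether a reschedule was pending, with plain preempt_disable_notrace()/preempt_enable_notrace(). What remains is an ordinary preemption-disabled critical section that the function tracer will not recurse into; a minimal sketch:

#include <linux/preempt.h>

static void touch_tracing_state(void)
{
	/*
	 * The _notrace variants do not generate function-trace events of
	 * their own, so tracing code can use them without recursing.
	 */
	preempt_disable_notrace();

	/* ... access per-CPU or tracing state here ... */

	preempt_enable_notrace();
}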
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <linux/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
 20/* Select an alternative, minimalistic output instead of the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
46 struct trace_array *tr = kmemtrace_array;
47 struct kmemtrace_alloc_entry *entry;
48 struct ring_buffer_event *event;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
51 if (!event)
52 return;
53
54 entry = ring_buffer_event_data(event);
55 tracing_generic_entry_update(&entry->ent, 0, 0);
56
57 entry->ent.type = TRACE_KMEM_ALLOC;
58 entry->type_id = type_id;
59 entry->call_site = call_site;
60 entry->ptr = ptr;
61 entry->bytes_req = bytes_req;
62 entry->bytes_alloc = bytes_alloc;
63 entry->gfp_flags = gfp_flags;
64 entry->node = node;
65
66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
68
69 trace_wake_up();
70}
71
72static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
73 unsigned long call_site,
74 const void *ptr)
75{
76 struct ftrace_event_call *call = &event_kmem_free;
77 struct trace_array *tr = kmemtrace_array;
78 struct kmemtrace_free_entry *entry;
79 struct ring_buffer_event *event;
80
81 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
82 if (!event)
83 return;
84 entry = ring_buffer_event_data(event);
85 tracing_generic_entry_update(&entry->ent, 0, 0);
86
87 entry->ent.type = TRACE_KMEM_FREE;
88 entry->type_id = type_id;
89 entry->call_site = call_site;
90 entry->ptr = ptr;
91
92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
94
95 trace_wake_up();
96}
97
98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
100 const void *ptr,
101 size_t bytes_req,
102 size_t bytes_alloc,
103 gfp_t gfp_flags)
104{
105 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
106 bytes_req, bytes_alloc, gfp_flags, -1);
107}
108
109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
111 const void *ptr,
112 size_t bytes_req,
113 size_t bytes_alloc,
114 gfp_t gfp_flags)
115{
116 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
117 bytes_req, bytes_alloc, gfp_flags, -1);
118}
119
120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
122 const void *ptr,
123 size_t bytes_req,
124 size_t bytes_alloc,
125 gfp_t gfp_flags,
126 int node)
127{
128 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
129 bytes_req, bytes_alloc, gfp_flags, node);
130}
131
132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
134 const void *ptr,
135 size_t bytes_req,
136 size_t bytes_alloc,
137 gfp_t gfp_flags,
138 int node)
139{
140 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
141 bytes_req, bytes_alloc, gfp_flags, node);
142}
143
144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
146{
147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
148}
149
150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
152{
153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
154}
155
156static int kmemtrace_start_probes(void)
157{
158 int err;
159
160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
161 if (err)
162 return err;
163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
164 if (err)
165 return err;
166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
167 if (err)
168 return err;
169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
170 if (err)
171 return err;
172 err = register_trace_kfree(kmemtrace_kfree, NULL);
173 if (err)
174 return err;
175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
176
177 return err;
178}
179
180static void kmemtrace_stop_probes(void)
181{
182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
186 unregister_trace_kfree(kmemtrace_kfree, NULL);
187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
188}
189
190static int kmem_trace_init(struct trace_array *tr)
191{
192 kmemtrace_array = tr;
193
194 tracing_reset_online_cpus(tr);
195
196 kmemtrace_start_probes();
197
198 return 0;
199}
200
201static void kmem_trace_reset(struct trace_array *tr)
202{
203 kmemtrace_stop_probes();
204}
205
206static void kmemtrace_headers(struct seq_file *s)
207{
208 /* Don't need headers for the original kmemtrace output */
209 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
210 return;
211
212 seq_printf(s, "#\n");
213 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
214 " POINTER NODE CALLER\n");
215 seq_printf(s, "# FREE | | | | "
216 " | | | |\n");
217 seq_printf(s, "# |\n\n");
218}
219
220/*
221 * The following functions give the original output from kmemtrace,
222 * plus the origin CPU, since reordering occurs in-kernel now.
223 */
224
225#define KMEMTRACE_USER_ALLOC 0
226#define KMEMTRACE_USER_FREE 1
227
228struct kmemtrace_user_event {
229 u8 event_id;
230 u8 type_id;
231 u16 event_size;
232 u32 cpu;
233 u64 timestamp;
234 unsigned long call_site;
235 unsigned long ptr;
236};
237
238struct kmemtrace_user_event_alloc {
239 size_t bytes_req;
240 size_t bytes_alloc;
241 unsigned gfp_flags;
242 int node;
243};
244
245static enum print_line_t
246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
248{
249 struct trace_seq *s = &iter->seq;
250 struct kmemtrace_alloc_entry *entry;
251 int ret;
252
253 trace_assign_type(entry, iter->ent);
254
255 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
256 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
257 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
258 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
259 (unsigned long)entry->gfp_flags, entry->node);
260
261 if (!ret)
262 return TRACE_TYPE_PARTIAL_LINE;
263 return TRACE_TYPE_HANDLED;
264}
265
266static enum print_line_t
267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
269{
270 struct trace_seq *s = &iter->seq;
271 struct kmemtrace_free_entry *entry;
272 int ret;
273
274 trace_assign_type(entry, iter->ent);
275
276 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
277 entry->type_id, (void *)entry->call_site,
278 (unsigned long)entry->ptr);
279
280 if (!ret)
281 return TRACE_TYPE_PARTIAL_LINE;
282 return TRACE_TYPE_HANDLED;
283}
284
285static enum print_line_t
286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
288{
289 struct trace_seq *s = &iter->seq;
290 struct kmemtrace_alloc_entry *entry;
291 struct kmemtrace_user_event *ev;
292 struct kmemtrace_user_event_alloc *ev_alloc;
293
294 trace_assign_type(entry, iter->ent);
295
296 ev = trace_seq_reserve(s, sizeof(*ev));
297 if (!ev)
298 return TRACE_TYPE_PARTIAL_LINE;
299
300 ev->event_id = KMEMTRACE_USER_ALLOC;
301 ev->type_id = entry->type_id;
302 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
303 ev->cpu = iter->cpu;
304 ev->timestamp = iter->ts;
305 ev->call_site = entry->call_site;
306 ev->ptr = (unsigned long)entry->ptr;
307
308 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
309 if (!ev_alloc)
310 return TRACE_TYPE_PARTIAL_LINE;
311
312 ev_alloc->bytes_req = entry->bytes_req;
313 ev_alloc->bytes_alloc = entry->bytes_alloc;
314 ev_alloc->gfp_flags = entry->gfp_flags;
315 ev_alloc->node = entry->node;
316
317 return TRACE_TYPE_HANDLED;
318}
319
320static enum print_line_t
321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
323{
324 struct trace_seq *s = &iter->seq;
325 struct kmemtrace_free_entry *entry;
326 struct kmemtrace_user_event *ev;
327
328 trace_assign_type(entry, iter->ent);
329
330 ev = trace_seq_reserve(s, sizeof(*ev));
331 if (!ev)
332 return TRACE_TYPE_PARTIAL_LINE;
333
334 ev->event_id = KMEMTRACE_USER_FREE;
335 ev->type_id = entry->type_id;
336 ev->event_size = sizeof(*ev);
337 ev->cpu = iter->cpu;
338 ev->timestamp = iter->ts;
339 ev->call_site = entry->call_site;
340 ev->ptr = (unsigned long)entry->ptr;
341
342 return TRACE_TYPE_HANDLED;
343}
344
 345/* The following two provide a more minimalistic output */
346static enum print_line_t
347kmemtrace_print_alloc_compress(struct trace_iterator *iter)
348{
349 struct kmemtrace_alloc_entry *entry;
350 struct trace_seq *s = &iter->seq;
351 int ret;
352
353 trace_assign_type(entry, iter->ent);
354
355 /* Alloc entry */
356 ret = trace_seq_printf(s, " + ");
357 if (!ret)
358 return TRACE_TYPE_PARTIAL_LINE;
359
360 /* Type */
361 switch (entry->type_id) {
362 case KMEMTRACE_TYPE_KMALLOC:
363 ret = trace_seq_printf(s, "K ");
364 break;
365 case KMEMTRACE_TYPE_CACHE:
366 ret = trace_seq_printf(s, "C ");
367 break;
368 case KMEMTRACE_TYPE_PAGES:
369 ret = trace_seq_printf(s, "P ");
370 break;
371 default:
372 ret = trace_seq_printf(s, "? ");
373 }
374
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 /* Requested */
379 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
380 if (!ret)
381 return TRACE_TYPE_PARTIAL_LINE;
382
383 /* Allocated */
384 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
385 if (!ret)
386 return TRACE_TYPE_PARTIAL_LINE;
387
388 /* Flags
 389 * TODO: would be better to print the names of the GFP flags
390 */
391 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400 /* Node and call site*/
401 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
402 (void *)entry->call_site);
403 if (!ret)
404 return TRACE_TYPE_PARTIAL_LINE;
405
406 return TRACE_TYPE_HANDLED;
407}
408
409static enum print_line_t
410kmemtrace_print_free_compress(struct trace_iterator *iter)
411{
412 struct kmemtrace_free_entry *entry;
413 struct trace_seq *s = &iter->seq;
414 int ret;
415
416 trace_assign_type(entry, iter->ent);
417
418 /* Free entry */
419 ret = trace_seq_printf(s, " - ");
420 if (!ret)
421 return TRACE_TYPE_PARTIAL_LINE;
422
423 /* Type */
424 switch (entry->type_id) {
425 case KMEMTRACE_TYPE_KMALLOC:
426 ret = trace_seq_printf(s, "K ");
427 break;
428 case KMEMTRACE_TYPE_CACHE:
429 ret = trace_seq_printf(s, "C ");
430 break;
431 case KMEMTRACE_TYPE_PAGES:
432 ret = trace_seq_printf(s, "P ");
433 break;
434 default:
435 ret = trace_seq_printf(s, "? ");
436 }
437
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441 /* Skip requested/allocated/flags */
442 ret = trace_seq_printf(s, " ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445
446 /* Pointer to allocated */
447 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
448 if (!ret)
449 return TRACE_TYPE_PARTIAL_LINE;
450
451 /* Skip node and print call site*/
452 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
453 if (!ret)
454 return TRACE_TYPE_PARTIAL_LINE;
455
456 return TRACE_TYPE_HANDLED;
457}
458
459static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
460{
461 struct trace_entry *entry = iter->ent;
462
463 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
464 return TRACE_TYPE_UNHANDLED;
465
466 switch (entry->type) {
467 case TRACE_KMEM_ALLOC:
468 return kmemtrace_print_alloc_compress(iter);
469 case TRACE_KMEM_FREE:
470 return kmemtrace_print_free_compress(iter);
471 default:
472 return TRACE_TYPE_UNHANDLED;
473 }
474}
475
476static struct trace_event_functions kmem_trace_alloc_funcs = {
477 .trace = kmemtrace_print_alloc,
478 .binary = kmemtrace_print_alloc_user,
479};
480
481static struct trace_event kmem_trace_alloc = {
482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
487 .trace = kmemtrace_print_free,
488 .binary = kmemtrace_print_free_user,
489};
490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
496static struct tracer kmem_tracer __read_mostly = {
497 .name = "kmemtrace",
498 .init = kmem_trace_init,
499 .reset = kmem_trace_reset,
500 .print_line = kmemtrace_print_line,
501 .print_header = kmemtrace_headers,
502 .flags = &kmem_tracer_flags
503};
504
505void kmemtrace_init(void)
506{
507 /* earliest opportunity to start kmem tracing */
508}
509
510static int __init init_kmem_tracer(void)
511{
512 if (!register_ftrace_event(&kmem_trace_alloc)) {
513 pr_warning("Warning: could not register kmem events\n");
514 return 1;
515 }
516
517 if (!register_ftrace_event(&kmem_trace_free)) {
518 pr_warning("Warning: could not register kmem events\n");
519 return 1;
520 }
521
522 if (register_tracer(&kmem_tracer) != 0) {
523 pr_warning("Warning: could not register the kmem tracer\n");
524 return 1;
525 }
526
527 return 0;
528}
529device_initcall(init_kmem_tracer);
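The deleted tracer hooked the slab tracepoints through register_trace_kmalloc() and friends, as kmemtrace_start_probes() above shows. A stripped-down sketch of attaching to just the kmalloc tracepoint in the same way, with the probe reduced to a trace_printk() and error handling kept minimal:

#include <linux/module.h>
#include <linux/tracepoint.h>
#include <trace/events/kmem.h>

/* Probe signature mirrors the kmalloc tracepoint plus the private data pointer. */
static void probe_kmalloc(void *ignore, unsigned long call_site,
			  const void *ptr, size_t bytes_req,
			  size_t bytes_alloc, gfp_t gfp_flags)
{
	trace_printk("kmalloc %zu/%zu bytes at %pS\n",
		     bytes_req, bytes_alloc, (void *)call_site);
}

static int __init kmalloc_probe_init(void)
{
	return register_trace_kmalloc(probe_kmalloc, NULL);
}

static void __exit kmalloc_probe_exit(void)
{
	unregister_trace_kmalloc(probe_kmalloc, NULL);
	tracepoint_synchronize_unregister();
}

module_init(kmalloc_probe_init);
module_exit(kmalloc_probe_exit);
MODULE_LICENSE("GPL");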
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..492197e2f86c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
443 */ 443 */
444struct ring_buffer_per_cpu { 444struct ring_buffer_per_cpu {
445 int cpu; 445 int cpu;
446 atomic_t record_disabled;
446 struct ring_buffer *buffer; 447 struct ring_buffer *buffer;
447 spinlock_t reader_lock; /* serialize readers */ 448 spinlock_t reader_lock; /* serialize readers */
448 arch_spinlock_t lock; 449 arch_spinlock_t lock;
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu {
462 unsigned long read; 463 unsigned long read;
463 u64 write_stamp; 464 u64 write_stamp;
464 u64 read_stamp; 465 u64 read_stamp;
465 atomic_t record_disabled;
466}; 466};
467 467
468struct ring_buffer { 468struct ring_buffer {
@@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void)
2242 2242
2243#endif 2243#endif
2244 2244
2245static DEFINE_PER_CPU(int, rb_need_resched);
2246
2247/** 2245/**
2248 * ring_buffer_lock_reserve - reserve a part of the buffer 2246 * ring_buffer_lock_reserve - reserve a part of the buffer
2249 * @buffer: the ring buffer to reserve from 2247 * @buffer: the ring buffer to reserve from
@@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2264{ 2262{
2265 struct ring_buffer_per_cpu *cpu_buffer; 2263 struct ring_buffer_per_cpu *cpu_buffer;
2266 struct ring_buffer_event *event; 2264 struct ring_buffer_event *event;
2267 int cpu, resched; 2265 int cpu;
2268 2266
2269 if (ring_buffer_flags != RB_BUFFERS_ON) 2267 if (ring_buffer_flags != RB_BUFFERS_ON)
2270 return NULL; 2268 return NULL;
2271 2269
2272 /* If we are tracing schedule, we don't want to recurse */ 2270 /* If we are tracing schedule, we don't want to recurse */
2273 resched = ftrace_preempt_disable(); 2271 preempt_disable_notrace();
2274 2272
2275 if (atomic_read(&buffer->record_disabled)) 2273 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck; 2274 goto out_nocheck;
@@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2295 if (!event) 2293 if (!event)
2296 goto out; 2294 goto out;
2297 2295
2298 /*
2299 * Need to store resched state on this cpu.
2300 * Only the first needs to.
2301 */
2302
2303 if (preempt_count() == 1)
2304 per_cpu(rb_need_resched, cpu) = resched;
2305
2306 return event; 2296 return event;
2307 2297
2308 out: 2298 out:
2309 trace_recursive_unlock(); 2299 trace_recursive_unlock();
2310 2300
2311 out_nocheck: 2301 out_nocheck:
2312 ftrace_preempt_enable(resched); 2302 preempt_enable_notrace();
2313 return NULL; 2303 return NULL;
2314} 2304}
2315EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2305EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2355 2345
2356 trace_recursive_unlock(); 2346 trace_recursive_unlock();
2357 2347
2358 /* 2348 preempt_enable_notrace();
2359 * Only the last preempt count needs to restore preemption.
2360 */
2361 if (preempt_count() == 1)
2362 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2363 else
2364 preempt_enable_no_resched_notrace();
2365 2349
2366 return 0; 2350 return 0;
2367} 2351}
@@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2469 2453
2470 trace_recursive_unlock(); 2454 trace_recursive_unlock();
2471 2455
2472 /* 2456 preempt_enable_notrace();
2473 * Only the last preempt count needs to restore preemption.
2474 */
2475 if (preempt_count() == 1)
2476 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2477 else
2478 preempt_enable_no_resched_notrace();
2479 2457
2480} 2458}
2481EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2459EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2501 struct ring_buffer_event *event; 2479 struct ring_buffer_event *event;
2502 void *body; 2480 void *body;
2503 int ret = -EBUSY; 2481 int ret = -EBUSY;
2504 int cpu, resched; 2482 int cpu;
2505 2483
2506 if (ring_buffer_flags != RB_BUFFERS_ON) 2484 if (ring_buffer_flags != RB_BUFFERS_ON)
2507 return -EBUSY; 2485 return -EBUSY;
2508 2486
2509 resched = ftrace_preempt_disable(); 2487 preempt_disable_notrace();
2510 2488
2511 if (atomic_read(&buffer->record_disabled)) 2489 if (atomic_read(&buffer->record_disabled))
2512 goto out; 2490 goto out;
@@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2536 2514
2537 ret = 0; 2515 ret = 0;
2538 out: 2516 out:
2539 ftrace_preempt_enable(resched); 2517 preempt_enable_notrace();
2540 2518
2541 return ret; 2519 return ret;
2542} 2520}
@@ -3007,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
3007 2985
3008static void rb_advance_iter(struct ring_buffer_iter *iter) 2986static void rb_advance_iter(struct ring_buffer_iter *iter)
3009{ 2987{
3010 struct ring_buffer *buffer;
3011 struct ring_buffer_per_cpu *cpu_buffer; 2988 struct ring_buffer_per_cpu *cpu_buffer;
3012 struct ring_buffer_event *event; 2989 struct ring_buffer_event *event;
3013 unsigned length; 2990 unsigned length;
3014 2991
3015 cpu_buffer = iter->cpu_buffer; 2992 cpu_buffer = iter->cpu_buffer;
3016 buffer = cpu_buffer->buffer;
3017 2993
3018 /* 2994 /*
3019 * Check if we are at the end of the buffer. 2995 * Check if we are at the end of the buffer.
@@ -3868,6 +3844,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3868 rpos = reader->read; 3844 rpos = reader->read;
3869 pos += size; 3845 pos += size;
3870 3846
3847 if (rpos >= commit)
3848 break;
3849
3871 event = rb_reader_event(cpu_buffer); 3850 event = rb_reader_event(cpu_buffer);
3872 size = rb_event_length(event); 3851 size = rb_event_length(event);
3873 } while (len > size); 3852 } while (len > size);
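With the per-CPU resched bookkeeping gone, ring_buffer_lock_reserve() now simply disables preemption (notrace) and ring_buffer_unlock_commit() re-enables it, so a writer's job stays reserve, fill, commit; the removed kmemtrace code earlier in this diff is one real user. A generic sketch with an illustrative payload struct:

#include <linux/errno.h>
#include <linux/ring_buffer.h>

struct sample_payload {
	unsigned long value;
};

/* Write one record into an existing struct ring_buffer. */
static int write_sample(struct ring_buffer *buffer, unsigned long value)
{
	struct ring_buffer_event *event;
	struct sample_payload *p;

	/* Reserves space; preemption stays disabled until the commit below. */
	event = ring_buffer_lock_reserve(buffer, sizeof(*p));
	if (!event)
		return -EBUSY;

	p = ring_buffer_event_data(event);
	p->value = value;

	return ring_buffer_unlock_commit(buffer, event);
}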
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..9ec59f541156 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
104static cpumask_var_t __read_mostly tracing_buffer_mask; 104cpumask_var_t __read_mostly tracing_buffer_mask;
105
106#define for_each_tracing_cpu(cpu) \
107 for_each_cpu(cpu, tracing_buffer_mask)
108 105
109/* 106/*
110 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 107 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 341/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME; 344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
348 345
349static int trace_stop_count; 346static int trace_stop_count;
350static DEFINE_SPINLOCK(tracing_start_lock); 347static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +425,7 @@ static const char *trace_options[] = {
428 "latency-format", 425 "latency-format",
429 "sleep-time", 426 "sleep-time",
430 "graph-time", 427 "graph-time",
428 "record-cmd",
431 NULL 429 NULL
432}; 430};
433 431
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
659 return; 657 return;
660 658
661 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
660 if (!current_trace->use_max_tr) {
661 WARN_ON_ONCE(1);
662 return;
663 }
662 arch_spin_lock(&ftrace_max_lock); 664 arch_spin_lock(&ftrace_max_lock);
663 665
664 tr->buffer = max_tr.buffer; 666 tr->buffer = max_tr.buffer;
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
685 return; 687 return;
686 688
687 WARN_ON_ONCE(!irqs_disabled()); 689 WARN_ON_ONCE(!irqs_disabled());
690 if (!current_trace->use_max_tr) {
691 WARN_ON_ONCE(1);
692 return;
693 }
694
688 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
689 696
690 ftrace_disable_cpu(); 697 ftrace_disable_cpu();
@@ -729,18 +736,11 @@ __acquires(kernel_lock)
729 return -1; 736 return -1;
730 } 737 }
731 738
732 if (strlen(type->name) > MAX_TRACER_SIZE) { 739 if (strlen(type->name) >= MAX_TRACER_SIZE) {
733 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); 740 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
734 return -1; 741 return -1;
735 } 742 }
736 743
737 /*
738 * When this gets called we hold the BKL which means that
739 * preemption is disabled. Various trace selftests however
740 * need to disable and enable preemption for successful tests.
741 * So we drop the BKL here and grab it after the tests again.
742 */
743 unlock_kernel();
744 mutex_lock(&trace_types_lock); 744 mutex_lock(&trace_types_lock);
745 745
746 tracing_selftest_running = true; 746 tracing_selftest_running = true;
@@ -822,7 +822,6 @@ __acquires(kernel_lock)
822#endif 822#endif
823 823
824 out_unlock: 824 out_unlock:
825 lock_kernel();
826 return ret; 825 return ret;
827} 826}
828 827
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1331 1330
1332#endif /* CONFIG_STACKTRACE */ 1331#endif /* CONFIG_STACKTRACE */
1333 1332
1334static void
1335ftrace_trace_special(void *__tr,
1336 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1337 int pc)
1338{
1339 struct ftrace_event_call *call = &event_special;
1340 struct ring_buffer_event *event;
1341 struct trace_array *tr = __tr;
1342 struct ring_buffer *buffer = tr->buffer;
1343 struct special_entry *entry;
1344
1345 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1346 sizeof(*entry), 0, pc);
1347 if (!event)
1348 return;
1349 entry = ring_buffer_event_data(event);
1350 entry->arg1 = arg1;
1351 entry->arg2 = arg2;
1352 entry->arg3 = arg3;
1353
1354 if (!filter_check_discard(call, entry, buffer, event))
1355 trace_buffer_unlock_commit(buffer, event, 0, pc);
1356}
1357
1358void
1359__trace_special(void *__tr, void *__data,
1360 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1361{
1362 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1363}
1364
1365void
1366ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1367{
1368 struct trace_array *tr = &global_trace;
1369 struct trace_array_cpu *data;
1370 unsigned long flags;
1371 int cpu;
1372 int pc;
1373
1374 if (tracing_disabled)
1375 return;
1376
1377 pc = preempt_count();
1378 local_irq_save(flags);
1379 cpu = raw_smp_processor_id();
1380 data = tr->data[cpu];
1381
1382 if (likely(atomic_inc_return(&data->disabled) == 1))
1383 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1384
1385 atomic_dec(&data->disabled);
1386 local_irq_restore(flags);
1387}
1388
1389/** 1333/**
1390 * trace_vbprintk - write binary msg to tracing buffer 1334 * trace_vbprintk - write binary msg to tracing buffer
1391 * 1335 *
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1404 struct bprint_entry *entry; 1348 struct bprint_entry *entry;
1405 unsigned long flags; 1349 unsigned long flags;
1406 int disable; 1350 int disable;
1407 int resched;
1408 int cpu, len = 0, size, pc; 1351 int cpu, len = 0, size, pc;
1409 1352
1410 if (unlikely(tracing_selftest_running || tracing_disabled)) 1353 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1414 pause_graph_tracing(); 1357 pause_graph_tracing();
1415 1358
1416 pc = preempt_count(); 1359 pc = preempt_count();
1417 resched = ftrace_preempt_disable(); 1360 preempt_disable_notrace();
1418 cpu = raw_smp_processor_id(); 1361 cpu = raw_smp_processor_id();
1419 data = tr->data[cpu]; 1362 data = tr->data[cpu];
1420 1363
@@ -1452,7 +1395,7 @@ out_unlock:
1452 1395
1453out: 1396out:
1454 atomic_dec_return(&data->disabled); 1397 atomic_dec_return(&data->disabled);
1455 ftrace_preempt_enable(resched); 1398 preempt_enable_notrace();
1456 unpause_graph_tracing(); 1399 unpause_graph_tracing();
1457 1400
1458 return len; 1401 return len;
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1539} 1482}
1540EXPORT_SYMBOL_GPL(trace_vprintk); 1483EXPORT_SYMBOL_GPL(trace_vprintk);
1541 1484
1542enum trace_file_type {
1543 TRACE_FILE_LAT_FMT = 1,
1544 TRACE_FILE_ANNOTATE = 2,
1545};
1546
1547static void trace_iterator_increment(struct trace_iterator *iter) 1485static void trace_iterator_increment(struct trace_iterator *iter)
1548{ 1486{
1549 /* Don't allow ftrace to trace into the ring buffers */ 1487 /* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1641} 1579}
1642 1580
1643/* Find the next real entry, and increment the iterator to the next entry */ 1581/* Find the next real entry, and increment the iterator to the next entry */
1644static void *find_next_entry_inc(struct trace_iterator *iter) 1582void *trace_find_next_entry_inc(struct trace_iterator *iter)
1645{ 1583{
1646 iter->ent = __find_next_entry(iter, &iter->cpu, 1584 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts); 1585 &iter->lost_events, &iter->ts);
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1676 return NULL; 1614 return NULL;
1677 1615
1678 if (iter->idx < 0) 1616 if (iter->idx < 0)
1679 ent = find_next_entry_inc(iter); 1617 ent = trace_find_next_entry_inc(iter);
1680 else 1618 else
1681 ent = iter; 1619 ent = iter;
1682 1620
1683 while (ent && iter->idx < i) 1621 while (ent && iter->idx < i)
1684 ent = find_next_entry_inc(iter); 1622 ent = trace_find_next_entry_inc(iter);
1685 1623
1686 iter->pos = *pos; 1624 iter->pos = *pos;
1687 1625
1688 return ent; 1626 return ent;
1689} 1627}
1690 1628
1691static void tracing_iter_reset(struct trace_iterator *iter, int cpu) 1629void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1692{ 1630{
1693 struct trace_array *tr = iter->tr; 1631 struct trace_array *tr = iter->tr;
1694 struct ring_buffer_event *event; 1632 struct ring_buffer_event *event;
@@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter)
2049} 1987}
2050 1988
2051/* Called with trace_event_read_lock() held. */ 1989/* Called with trace_event_read_lock() held. */
2052static enum print_line_t print_trace_line(struct trace_iterator *iter) 1990enum print_line_t print_trace_line(struct trace_iterator *iter)
2053{ 1991{
2054 enum print_line_t ret; 1992 enum print_line_t ret;
2055 1993
@@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = {
2394 .open = show_traces_open, 2332 .open = show_traces_open,
2395 .read = seq_read, 2333 .read = seq_read,
2396 .release = seq_release, 2334 .release = seq_release,
2335 .llseek = seq_lseek,
2397}; 2336};
2398 2337
2399/* 2338/*
@@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = {
2487 .open = tracing_open_generic, 2426 .open = tracing_open_generic,
2488 .read = tracing_cpumask_read, 2427 .read = tracing_cpumask_read,
2489 .write = tracing_cpumask_write, 2428 .write = tracing_cpumask_write,
2429 .llseek = generic_file_llseek,
2490}; 2430};
2491 2431
2492static int tracing_trace_options_show(struct seq_file *m, void *v) 2432static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2562 trace_flags |= mask; 2502 trace_flags |= mask;
2563 else 2503 else
2564 trace_flags &= ~mask; 2504 trace_flags &= ~mask;
2505
2506 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled);
2565} 2508}
2566 2509
2567static ssize_t 2510static ssize_t
@@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2653static const struct file_operations tracing_readme_fops = { 2596static const struct file_operations tracing_readme_fops = {
2654 .open = tracing_open_generic, 2597 .open = tracing_open_generic,
2655 .read = tracing_readme_read, 2598 .read = tracing_readme_read,
2599 .llseek = generic_file_llseek,
2656}; 2600};
2657 2601
2658static ssize_t 2602static ssize_t
@@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2703static const struct file_operations tracing_saved_cmdlines_fops = { 2647static const struct file_operations tracing_saved_cmdlines_fops = {
2704 .open = tracing_open_generic, 2648 .open = tracing_open_generic,
2705 .read = tracing_saved_cmdlines_read, 2649 .read = tracing_saved_cmdlines_read,
2650 .llseek = generic_file_llseek,
2706}; 2651};
2707 2652
2708static ssize_t 2653static ssize_t
@@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
2798 if (ret < 0) 2743 if (ret < 0)
2799 return ret; 2744 return ret;
2800 2745
2746 if (!current_trace->use_max_tr)
2747 goto out;
2748
2801 ret = ring_buffer_resize(max_tr.buffer, size); 2749 ret = ring_buffer_resize(max_tr.buffer, size);
2802 if (ret < 0) { 2750 if (ret < 0) {
2803 int r; 2751 int r;
@@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
2825 return ret; 2773 return ret;
2826 } 2774 }
2827 2775
2776 max_tr.entries = size;
2777 out:
2828 global_trace.entries = size; 2778 global_trace.entries = size;
2829 2779
2830 return ret; 2780 return ret;
2831} 2781}
2832 2782
2783
2833/** 2784/**
2834 * tracing_update_buffers - used by tracing facility to expand ring buffers 2785 * tracing_update_buffers - used by tracing facility to expand ring buffers
2835 * 2786 *
@@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf)
2890 trace_branch_disable(); 2841 trace_branch_disable();
2891 if (current_trace && current_trace->reset) 2842 if (current_trace && current_trace->reset)
2892 current_trace->reset(tr); 2843 current_trace->reset(tr);
2893 2844 if (current_trace && current_trace->use_max_tr) {
2845 /*
2846 * We don't free the ring buffer. instead, resize it because
2847 * The max_tr ring buffer has some state (e.g. ring->clock) and
2848 * we want preserve it.
2849 */
2850 ring_buffer_resize(max_tr.buffer, 1);
2851 max_tr.entries = 1;
2852 }
2894 destroy_trace_option_files(topts); 2853 destroy_trace_option_files(topts);
2895 2854
2896 current_trace = t; 2855 current_trace = t;
2897 2856
2898 topts = create_trace_option_files(current_trace); 2857 topts = create_trace_option_files(current_trace);
2858 if (current_trace->use_max_tr) {
2859 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
2860 if (ret < 0)
2861 goto out;
2862 max_tr.entries = global_trace.entries;
2863 }
2899 2864
2900 if (t->init) { 2865 if (t->init) {
2901 ret = tracer_init(t, tr); 2866 ret = tracer_init(t, tr);
@@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3032 if (iter->trace->pipe_open) 2997 if (iter->trace->pipe_open)
3033 iter->trace->pipe_open(iter); 2998 iter->trace->pipe_open(iter);
3034 2999
3000 nonseekable_open(inode, filp);
3035out: 3001out:
3036 mutex_unlock(&trace_types_lock); 3002 mutex_unlock(&trace_types_lock);
3037 return ret; 3003 return ret;
@@ -3211,7 +3177,7 @@ waitagain:
3211 3177
3212 trace_event_read_lock(); 3178 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file); 3179 trace_access_lock(iter->cpu_file);
3214 while (find_next_entry_inc(iter) != NULL) { 3180 while (trace_find_next_entry_inc(iter) != NULL) {
3215 enum print_line_t ret; 3181 enum print_line_t ret;
3216 int len = iter->seq.len; 3182 int len = iter->seq.len;
3217 3183
@@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3294 if (ret != TRACE_TYPE_NO_CONSUME) 3260 if (ret != TRACE_TYPE_NO_CONSUME)
3295 trace_consume(iter); 3261 trace_consume(iter);
3296 rem -= count; 3262 rem -= count;
3297 if (!find_next_entry_inc(iter)) { 3263 if (!trace_find_next_entry_inc(iter)) {
3298 rem = 0; 3264 rem = 0;
3299 iter->ent = NULL; 3265 iter->ent = NULL;
3300 break; 3266 break;
@@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3350 if (ret <= 0) 3316 if (ret <= 0)
3351 goto out_err; 3317 goto out_err;
3352 3318
3353 if (!iter->ent && !find_next_entry_inc(iter)) { 3319 if (!iter->ent && !trace_find_next_entry_inc(iter)) {
3354 ret = -EFAULT; 3320 ret = -EFAULT;
3355 goto out_err; 3321 goto out_err;
3356 } 3322 }
@@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3477 } 3443 }
3478 3444
3479 tracing_start(); 3445 tracing_start();
3480 max_tr.entries = global_trace.entries;
3481 mutex_unlock(&trace_types_lock); 3446 mutex_unlock(&trace_types_lock);
3482 3447
3483 return cnt; 3448 return cnt;
@@ -3498,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3498 size_t cnt, loff_t *fpos) 3463 size_t cnt, loff_t *fpos)
3499{ 3464{
3500 char *buf; 3465 char *buf;
3466 size_t written;
3501 3467
3502 if (tracing_disabled) 3468 if (tracing_disabled)
3503 return -EINVAL; 3469 return -EINVAL;
@@ -3519,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3519 } else 3485 } else
3520 buf[cnt] = '\0'; 3486 buf[cnt] = '\0';
3521 3487
3522 cnt = mark_printk("%s", buf); 3488 written = mark_printk("%s", buf);
3523 kfree(buf); 3489 kfree(buf);
3524 *fpos += cnt; 3490 *fpos += written;
3525 3491
3526 return cnt; 3492 /* don't tell userspace we wrote more - it might confuse them */
3493 if (written > cnt)
3494 written = cnt;
3495
3496 return written;
3527} 3497}
3528 3498
3529static int tracing_clock_show(struct seq_file *m, void *v) 3499static int tracing_clock_show(struct seq_file *m, void *v)
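
tracing_mark_write() above now keeps the internally reported byte count in `written`, advances the file position by it, but clamps the value returned to userspace to the count the caller actually asked for. A small, hedged userspace analogue of that clamp; the demo_* names are invented, and demo_backend_write() merely pretends to add a prefix the way a tracing back end might.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Pretend back end that "writes" more bytes than it was handed. */
static size_t demo_backend_write(const char *buf)
{
	return strlen("marker: ") + strlen(buf);
}

/* Advance the offset by what was really written, but never report
 * more than the caller's count back to them. */
static ssize_t demo_write(const char *buf, size_t cnt, long *fpos)
{
	size_t written = demo_backend_write(buf);

	*fpos += written;
	if (written > cnt)
		written = cnt;
	return (ssize_t)written;
}

int main(void)
{
	long pos = 0;
	const char *msg = "hello";
	ssize_t ret = demo_write(msg, strlen(msg), &pos);

	printf("returned %ld (asked for %zu), offset now %ld\n",
	       (long)ret, strlen(msg), pos);
	return 0;
}
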
@@ -3590,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = {
3590 .open = tracing_open_generic, 3560 .open = tracing_open_generic,
3591 .read = tracing_max_lat_read, 3561 .read = tracing_max_lat_read,
3592 .write = tracing_max_lat_write, 3562 .write = tracing_max_lat_write,
3563 .llseek = generic_file_llseek,
3593}; 3564};
3594 3565
3595static const struct file_operations tracing_ctrl_fops = { 3566static const struct file_operations tracing_ctrl_fops = {
3596 .open = tracing_open_generic, 3567 .open = tracing_open_generic,
3597 .read = tracing_ctrl_read, 3568 .read = tracing_ctrl_read,
3598 .write = tracing_ctrl_write, 3569 .write = tracing_ctrl_write,
3570 .llseek = generic_file_llseek,
3599}; 3571};
3600 3572
3601static const struct file_operations set_tracer_fops = { 3573static const struct file_operations set_tracer_fops = {
3602 .open = tracing_open_generic, 3574 .open = tracing_open_generic,
3603 .read = tracing_set_trace_read, 3575 .read = tracing_set_trace_read,
3604 .write = tracing_set_trace_write, 3576 .write = tracing_set_trace_write,
3577 .llseek = generic_file_llseek,
3605}; 3578};
3606 3579
3607static const struct file_operations tracing_pipe_fops = { 3580static const struct file_operations tracing_pipe_fops = {
@@ -3610,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = {
3610 .read = tracing_read_pipe, 3583 .read = tracing_read_pipe,
3611 .splice_read = tracing_splice_read_pipe, 3584 .splice_read = tracing_splice_read_pipe,
3612 .release = tracing_release_pipe, 3585 .release = tracing_release_pipe,
3586 .llseek = no_llseek,
3613}; 3587};
3614 3588
3615static const struct file_operations tracing_entries_fops = { 3589static const struct file_operations tracing_entries_fops = {
3616 .open = tracing_open_generic, 3590 .open = tracing_open_generic,
3617 .read = tracing_entries_read, 3591 .read = tracing_entries_read,
3618 .write = tracing_entries_write, 3592 .write = tracing_entries_write,
3593 .llseek = generic_file_llseek,
3619}; 3594};
3620 3595
3621static const struct file_operations tracing_mark_fops = { 3596static const struct file_operations tracing_mark_fops = {
3622 .open = tracing_open_generic, 3597 .open = tracing_open_generic,
3623 .write = tracing_mark_write, 3598 .write = tracing_mark_write,
3599 .llseek = generic_file_llseek,
3624}; 3600};
3625 3601
3626static const struct file_operations trace_clock_fops = { 3602static const struct file_operations trace_clock_fops = {
@@ -3926,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3926static const struct file_operations tracing_stats_fops = { 3902static const struct file_operations tracing_stats_fops = {
3927 .open = tracing_open_generic, 3903 .open = tracing_open_generic,
3928 .read = tracing_stats_read, 3904 .read = tracing_stats_read,
3905 .llseek = generic_file_llseek,
3929}; 3906};
3930 3907
3931#ifdef CONFIG_DYNAMIC_FTRACE 3908#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3962,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3962static const struct file_operations tracing_dyn_info_fops = { 3939static const struct file_operations tracing_dyn_info_fops = {
3963 .open = tracing_open_generic, 3940 .open = tracing_open_generic,
3964 .read = tracing_read_dyn_info, 3941 .read = tracing_read_dyn_info,
3942 .llseek = generic_file_llseek,
3965}; 3943};
3966#endif 3944#endif
3967 3945
@@ -4115,6 +4093,7 @@ static const struct file_operations trace_options_fops = {
4115 .open = tracing_open_generic, 4093 .open = tracing_open_generic,
4116 .read = trace_options_read, 4094 .read = trace_options_read,
4117 .write = trace_options_write, 4095 .write = trace_options_write,
4096 .llseek = generic_file_llseek,
4118}; 4097};
4119 4098
4120static ssize_t 4099static ssize_t
@@ -4166,6 +4145,7 @@ static const struct file_operations trace_options_core_fops = {
4166 .open = tracing_open_generic, 4145 .open = tracing_open_generic,
4167 .read = trace_options_core_read, 4146 .read = trace_options_core_read,
4168 .write = trace_options_core_write, 4147 .write = trace_options_core_write,
4148 .llseek = generic_file_llseek,
4169}; 4149};
4170 4150
4171struct dentry *trace_create_file(const char *name, 4151struct dentry *trace_create_file(const char *name,
@@ -4355,9 +4335,6 @@ static __init int tracer_init_debugfs(void)
4355 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4335 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4356 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4336 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4357#endif 4337#endif
4358#ifdef CONFIG_SYSPROF_TRACER
4359 init_tracer_sysprof_debugfs(d_tracer);
4360#endif
4361 4338
4362 create_trace_options_dir(); 4339 create_trace_options_dir();
4363 4340
@@ -4414,7 +4391,7 @@ static struct notifier_block trace_die_notifier = {
4414 */ 4391 */
4415#define KERN_TRACE KERN_EMERG 4392#define KERN_TRACE KERN_EMERG
4416 4393
4417static void 4394void
4418trace_printk_seq(struct trace_seq *s) 4395trace_printk_seq(struct trace_seq *s)
4419{ 4396{
4420 /* Probably should print a warning here. */ 4397 /* Probably should print a warning here. */
@@ -4429,6 +4406,13 @@ trace_printk_seq(struct trace_seq *s)
4429 trace_seq_init(s); 4406 trace_seq_init(s);
4430} 4407}
4431 4408
4409void trace_init_global_iter(struct trace_iterator *iter)
4410{
4411 iter->tr = &global_trace;
4412 iter->trace = current_trace;
4413 iter->cpu_file = TRACE_PIPE_ALL_CPU;
4414}
4415
4432static void 4416static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) 4417__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4434{ 4418{
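
trace_init_global_iter() above gathers the boilerplate for pointing a trace_iterator at the global trace array, and __ftrace_dump() then works through iter.tr instead of touching global_trace directly, which also lets other dump paths reuse the helper. As an illustrative sketch only, here is the same initialise-once, use-the-iterator shape in plain C with invented structures:

#include <stdio.h>

/* Invented miniature of a trace array and an iterator over it. */
struct demo_array {
	const char *name;
	int disabled[4];	/* one counter per "cpu" */
};

struct demo_iter {
	struct demo_array *tr;
	int cpu_file;		/* -1 == all cpus */
};

static struct demo_array global_array = { .name = "global" };

/* One place sets up the iterator; everyone else only uses it. */
static void demo_init_global_iter(struct demo_iter *iter)
{
	iter->tr = &global_array;
	iter->cpu_file = -1;
}

static void demo_dump(void)
{
	struct demo_iter iter;

	demo_init_global_iter(&iter);

	/* From here on, only the iterator is touched, never global_array. */
	for (int cpu = 0; cpu < 4; cpu++)
		iter.tr->disabled[cpu]++;

	printf("dumping %s, cpu_file=%d\n", iter.tr->name, iter.cpu_file);

	for (int cpu = 0; cpu < 4; cpu++)
		iter.tr->disabled[cpu]--;
}

int main(void)
{
	demo_dump();
	return 0;
}
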
@@ -4454,8 +4438,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4454 if (disable_tracing) 4438 if (disable_tracing)
4455 ftrace_kill(); 4439 ftrace_kill();
4456 4440
4441 trace_init_global_iter(&iter);
4442
4457 for_each_tracing_cpu(cpu) { 4443 for_each_tracing_cpu(cpu) {
4458 atomic_inc(&global_trace.data[cpu]->disabled); 4444 atomic_inc(&iter.tr->data[cpu]->disabled);
4459 } 4445 }
4460 4446
4461 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 4447 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4504,7 +4490,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4504 iter.iter_flags |= TRACE_FILE_LAT_FMT; 4490 iter.iter_flags |= TRACE_FILE_LAT_FMT;
4505 iter.pos = -1; 4491 iter.pos = -1;
4506 4492
4507 if (find_next_entry_inc(&iter) != NULL) { 4493 if (trace_find_next_entry_inc(&iter) != NULL) {
4508 int ret; 4494 int ret;
4509 4495
4510 ret = print_trace_line(&iter); 4496 ret = print_trace_line(&iter);
@@ -4526,7 +4512,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4526 trace_flags |= old_userobj; 4512 trace_flags |= old_userobj;
4527 4513
4528 for_each_tracing_cpu(cpu) { 4514 for_each_tracing_cpu(cpu) {
4529 atomic_dec(&global_trace.data[cpu]->disabled); 4515 atomic_dec(&iter.tr->data[cpu]->disabled);
4530 } 4516 }
4531 tracing_on(); 4517 tracing_on();
4532 } 4518 }
@@ -4575,16 +4561,14 @@ __init static int tracer_alloc_buffers(void)
4575 4561
4576 4562
4577#ifdef CONFIG_TRACER_MAX_TRACE 4563#ifdef CONFIG_TRACER_MAX_TRACE
4578 max_tr.buffer = ring_buffer_alloc(ring_buf_size, 4564 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
4579 TRACE_BUFFER_FLAGS);
4580 if (!max_tr.buffer) { 4565 if (!max_tr.buffer) {
4581 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4566 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4582 WARN_ON(1); 4567 WARN_ON(1);
4583 ring_buffer_free(global_trace.buffer); 4568 ring_buffer_free(global_trace.buffer);
4584 goto out_free_cpumask; 4569 goto out_free_cpumask;
4585 } 4570 }
4586 max_tr.entries = ring_buffer_size(max_tr.buffer); 4571 max_tr.entries = 1;
4587 WARN_ON(max_tr.entries != global_trace.entries);
4588#endif 4572#endif
4589 4573
4590 /* Allocate the first page for all buffers */ 4574 /* Allocate the first page for all buffers */
@@ -4597,9 +4581,6 @@ __init static int tracer_alloc_buffers(void)
4597 4581
4598 register_tracer(&nop_trace); 4582 register_tracer(&nop_trace);
4599 current_trace = &nop_trace; 4583 current_trace = &nop_trace;
4600#ifdef CONFIG_BOOT_TRACER
4601 register_tracer(&boot_tracer);
4602#endif
4603 /* All seems OK, enable tracing */ 4584 /* All seems OK, enable tracing */
4604 tracing_disabled = 0; 4585 tracing_disabled = 0;
4605 4586
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..d39b3c5454a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h>
13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h> 12#include <linux/hw_breakpoint.h>
15
16#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
18 15
@@ -25,30 +22,17 @@ enum trace_type {
25 TRACE_STACK, 22 TRACE_STACK,
26 TRACE_PRINT, 23 TRACE_PRINT,
27 TRACE_BPRINT, 24 TRACE_BPRINT,
28 TRACE_SPECIAL,
29 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
30 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
31 TRACE_BRANCH, 27 TRACE_BRANCH,
32 TRACE_BOOT_CALL,
33 TRACE_BOOT_RET,
34 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 30 TRACE_USER_STACK,
37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE,
39 TRACE_BLK, 31 TRACE_BLK,
40 TRACE_KSYM,
41 32
42 __TRACE_LAST_TYPE, 33 __TRACE_LAST_TYPE,
43}; 34};
44 35
45enum kmemtrace_type_id {
46 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49};
50
51extern struct tracer boot_tracer;
52 36
53#undef __field 37#undef __field
54#define __field(type, item) type item; 38#define __field(type, item) type item;
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void);
204 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 188 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
205 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 189 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
206 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 190 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
207 IF_ASSIGN(var, ent, struct special_entry, 0); \
208 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 191 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
209 TRACE_MMIO_RW); \ 192 TRACE_MMIO_RW); \
210 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 193 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
211 TRACE_MMIO_MAP); \ 194 TRACE_MMIO_MAP); \
212 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
213 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
214 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ 195 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
215 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ 196 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
216 TRACE_GRAPH_ENT); \ 197 TRACE_GRAPH_ENT); \
217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 198 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
218 TRACE_GRAPH_RET); \ 199 TRACE_GRAPH_RET); \
219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
220 TRACE_KMEM_ALLOC); \
221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
222 TRACE_KMEM_FREE); \
223 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
224 __ftrace_bad_type(); \ 200 __ftrace_bad_type(); \
225 } while (0) 201 } while (0)
226 202
@@ -298,6 +274,7 @@ struct tracer {
298 struct tracer *next; 274 struct tracer *next;
299 int print_max; 275 int print_max;
300 struct tracer_flags *flags; 276 struct tracer_flags *flags;
277 int use_max_tr;
301}; 278};
302 279
303 280
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
318 const struct file_operations *fops); 295 const struct file_operations *fops);
319 296
320struct dentry *tracing_init_dentry(void); 297struct dentry *tracing_init_dentry(void);
321void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
322 298
323struct ring_buffer_event; 299struct ring_buffer_event;
324 300
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
338struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 314struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
339 int *ent_cpu, u64 *ent_ts); 315 int *ent_cpu, u64 *ent_ts);
340 316
317int trace_empty(struct trace_iterator *iter);
318
319void *trace_find_next_entry_inc(struct trace_iterator *iter);
320
321void trace_init_global_iter(struct trace_iterator *iter);
322
323void tracing_iter_reset(struct trace_iterator *iter, int cpu);
324
341void default_wait_pipe(struct trace_iterator *iter); 325void default_wait_pipe(struct trace_iterator *iter);
342void poll_wait_pipe(struct trace_iterator *iter); 326void poll_wait_pipe(struct trace_iterator *iter);
343 327
@@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
355 struct task_struct *wakee, 339 struct task_struct *wakee,
356 struct task_struct *cur, 340 struct task_struct *cur,
357 unsigned long flags, int pc); 341 unsigned long flags, int pc);
358void trace_special(struct trace_array *tr,
359 struct trace_array_cpu *data,
360 unsigned long arg1,
361 unsigned long arg2,
362 unsigned long arg3, int pc);
363void trace_function(struct trace_array *tr, 342void trace_function(struct trace_array *tr,
364 unsigned long ip, 343 unsigned long ip,
365 unsigned long parent_ip, 344 unsigned long parent_ip,
@@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void);
380int register_tracer(struct tracer *type); 359int register_tracer(struct tracer *type);
381void unregister_tracer(struct tracer *type); 360void unregister_tracer(struct tracer *type);
382int is_tracing_stopped(void); 361int is_tracing_stopped(void);
362enum trace_file_type {
363 TRACE_FILE_LAT_FMT = 1,
364 TRACE_FILE_ANNOTATE = 2,
365};
366
367extern cpumask_var_t __read_mostly tracing_buffer_mask;
383 368
384extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); 369#define for_each_tracing_cpu(cpu) \
370 for_each_cpu(cpu, tracing_buffer_mask)
385 371
386extern unsigned long nsecs_to_usecs(unsigned long nsecs); 372extern unsigned long nsecs_to_usecs(unsigned long nsecs);
387 373
@@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
452 struct trace_array *tr); 438 struct trace_array *tr);
453extern int trace_selftest_startup_sched_switch(struct tracer *trace, 439extern int trace_selftest_startup_sched_switch(struct tracer *trace,
454 struct trace_array *tr); 440 struct trace_array *tr);
455extern int trace_selftest_startup_sysprof(struct tracer *trace,
456 struct trace_array *tr);
457extern int trace_selftest_startup_branch(struct tracer *trace, 441extern int trace_selftest_startup_branch(struct tracer *trace,
458 struct trace_array *tr); 442 struct trace_array *tr);
459extern int trace_selftest_startup_ksym(struct tracer *trace,
460 struct trace_array *tr);
461#endif /* CONFIG_FTRACE_STARTUP_TEST */ 443#endif /* CONFIG_FTRACE_STARTUP_TEST */
462 444
463extern void *head_page(struct trace_array_cpu *data); 445extern void *head_page(struct trace_array_cpu *data);
@@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr,
471 unsigned long ip, const char *fmt, va_list args); 453 unsigned long ip, const char *fmt, va_list args);
472int trace_array_printk(struct trace_array *tr, 454int trace_array_printk(struct trace_array *tr,
473 unsigned long ip, const char *fmt, ...); 455 unsigned long ip, const char *fmt, ...);
456void trace_printk_seq(struct trace_seq *s);
457enum print_line_t print_trace_line(struct trace_iterator *iter);
474 458
475extern unsigned long trace_flags; 459extern unsigned long trace_flags;
476 460
@@ -617,6 +601,7 @@ enum trace_iterator_flags {
617 TRACE_ITER_LATENCY_FMT = 0x20000, 601 TRACE_ITER_LATENCY_FMT = 0x20000,
618 TRACE_ITER_SLEEP_TIME = 0x40000, 602 TRACE_ITER_SLEEP_TIME = 0x40000,
619 TRACE_ITER_GRAPH_TIME = 0x80000, 603 TRACE_ITER_GRAPH_TIME = 0x80000,
604 TRACE_ITER_RECORD_CMD = 0x100000,
620}; 605};
621 606
622/* 607/*
@@ -628,54 +613,6 @@ enum trace_iterator_flags {
628 613
629extern struct tracer nop_trace; 614extern struct tracer nop_trace;
630 615
631/**
632 * ftrace_preempt_disable - disable preemption scheduler safe
633 *
634 * When tracing can happen inside the scheduler, there exists
635 * cases that the tracing might happen before the need_resched
636 * flag is checked. If this happens and the tracer calls
637 * preempt_enable (after a disable), a schedule might take place
638 * causing an infinite recursion.
639 *
640 * To prevent this, we read the need_resched flag before
641 * disabling preemption. When we want to enable preemption we
642 * check the flag, if it is set, then we call preempt_enable_no_resched.
643 * Otherwise, we call preempt_enable.
644 *
645	 * The rationale for doing the above is that if need_resched is set
646 * and we have yet to reschedule, we are either in an atomic location
647 * (where we do not need to check for scheduling) or we are inside
648 * the scheduler and do not want to resched.
649 */
650static inline int ftrace_preempt_disable(void)
651{
652 int resched;
653
654 resched = need_resched();
655 preempt_disable_notrace();
656
657 return resched;
658}
659
660/**
661 * ftrace_preempt_enable - enable preemption scheduler safe
662 * @resched: the return value from ftrace_preempt_disable
663 *
664 * This is a scheduler safe way to enable preemption and not miss
665	 * any preemption checks. The earlier disable saved the preemption state.
666 * If resched is set, then we are either inside an atomic or
667 * are inside the scheduler (we would have already scheduled
668 * otherwise). In this case, we do not want to call normal
669 * preempt_enable, but preempt_enable_no_resched instead.
670 */
671static inline void ftrace_preempt_enable(int resched)
672{
673 if (resched)
674 preempt_enable_no_resched_notrace();
675 else
676 preempt_enable_notrace();
677}
678
679#ifdef CONFIG_BRANCH_TRACER 616#ifdef CONFIG_BRANCH_TRACER
680extern int enable_branch_tracing(struct trace_array *tr); 617extern int enable_branch_tracing(struct trace_array *tr);
681extern void disable_branch_tracing(void); 618extern void disable_branch_tracing(void);
@@ -766,6 +703,8 @@ struct filter_pred {
766 int pop_n; 703 int pop_n;
767}; 704};
768 705
706extern struct list_head ftrace_common_fields;
707
769extern enum regex_type 708extern enum regex_type
770filter_parse_regex(char *buff, int len, char **search, int *not); 709filter_parse_regex(char *buff, int len, char **search, int *not);
771extern void print_event_filter(struct ftrace_event_call *call, 710extern void print_event_filter(struct ftrace_event_call *call,
@@ -795,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
795 return 0; 734 return 0;
796} 735}
797 736
737extern void trace_event_enable_cmd_record(bool enable);
738
798extern struct mutex event_mutex; 739extern struct mutex event_mutex;
799extern struct list_head ftrace_events; 740extern struct list_head ftrace_events;
800 741
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12#include <linux/time.h>
13
14#include "trace.h"
15#include "trace_output.h"
16
17static struct trace_array *boot_trace;
18static bool pre_initcalls_finished;
19
20/* Tells the boot tracer that the pre_smp_initcalls are finished.
21 * So we are ready.
22 * It doesn't enable sched events tracing, however.
23 * You have to call enable_boot_trace to do so.
24 */
25void start_boot_trace(void)
26{
27 pre_initcalls_finished = true;
28}
29
30void enable_boot_trace(void)
31{
32 if (boot_trace && pre_initcalls_finished)
33 tracing_start_sched_switch_record();
34}
35
36void disable_boot_trace(void)
37{
38 if (boot_trace && pre_initcalls_finished)
39 tracing_stop_sched_switch_record();
40}
41
42static int boot_trace_init(struct trace_array *tr)
43{
44 boot_trace = tr;
45
46 if (!tr)
47 return 0;
48
49 tracing_reset_online_cpus(tr);
50
51 tracing_sched_switch_assign_trace(tr);
52 return 0;
53}
54
55static enum print_line_t
56initcall_call_print_line(struct trace_iterator *iter)
57{
58 struct trace_entry *entry = iter->ent;
59 struct trace_seq *s = &iter->seq;
60 struct trace_boot_call *field;
61 struct boot_trace_call *call;
62 u64 ts;
63 unsigned long nsec_rem;
64 int ret;
65
66 trace_assign_type(field, entry);
67 call = &field->boot_call;
68 ts = iter->ts;
69 nsec_rem = do_div(ts, NSEC_PER_SEC);
70
71 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
72 (unsigned long)ts, nsec_rem, call->func, call->caller);
73
74 if (!ret)
75 return TRACE_TYPE_PARTIAL_LINE;
76 else
77 return TRACE_TYPE_HANDLED;
78}
79
80static enum print_line_t
81initcall_ret_print_line(struct trace_iterator *iter)
82{
83 struct trace_entry *entry = iter->ent;
84 struct trace_seq *s = &iter->seq;
85 struct trace_boot_ret *field;
86 struct boot_trace_ret *init_ret;
87 u64 ts;
88 unsigned long nsec_rem;
89 int ret;
90
91 trace_assign_type(field, entry);
92 init_ret = &field->boot_ret;
93 ts = iter->ts;
94 nsec_rem = do_div(ts, NSEC_PER_SEC);
95
96 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
97 "returned %d after %llu msecs\n",
98 (unsigned long) ts,
99 nsec_rem,
100 init_ret->func, init_ret->result, init_ret->duration);
101
102 if (!ret)
103 return TRACE_TYPE_PARTIAL_LINE;
104 else
105 return TRACE_TYPE_HANDLED;
106}
107
108static enum print_line_t initcall_print_line(struct trace_iterator *iter)
109{
110 struct trace_entry *entry = iter->ent;
111
112 switch (entry->type) {
113 case TRACE_BOOT_CALL:
114 return initcall_call_print_line(iter);
115 case TRACE_BOOT_RET:
116 return initcall_ret_print_line(iter);
117 default:
118 return TRACE_TYPE_UNHANDLED;
119 }
120}
121
122struct tracer boot_tracer __read_mostly =
123{
124 .name = "initcall",
125 .init = boot_trace_init,
126 .reset = tracing_reset_online_cpus,
127 .print_line = initcall_print_line,
128};
129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{
132 struct ftrace_event_call *call = &event_boot_call;
133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace;
137
138 if (!tr || !pre_initcalls_finished)
139 return;
140
141 /* Get its name now since this function could
142 * disappear because it is in the .init section.
143 */
144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable();
146
147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
149 sizeof(*entry), 0, 0);
150 if (!event)
151 goto out;
152 entry = ring_buffer_event_data(event);
153 entry->boot_call = *bt;
154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
156 out:
157 preempt_enable();
158}
159
160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
161{
162 struct ftrace_event_call *call = &event_boot_ret;
163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
165 struct trace_boot_ret *entry;
166 struct trace_array *tr = boot_trace;
167
168 if (!tr || !pre_initcalls_finished)
169 return;
170
171 sprint_symbol(bt->func, (unsigned long)fn);
172 preempt_disable();
173
174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
176 sizeof(*entry), 0, 0);
177 if (!event)
178 goto out;
179 entry = ring_buffer_event_data(event);
180 entry->boot_ret = *bt;
181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
183 out:
184 preempt_enable();
185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
56 */ 55 */
57u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
58{ 57{
59 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
60} 59}
61 60
62 61
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
151); 151);
152 152
153/* 153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry: 154 * Stack-trace entry:
172 */ 155 */
173 156
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
271 __entry->map_id, __entry->opcode) 254 __entry->map_id, __entry->opcode)
272); 255);
273 256
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301 257
302#define TRACE_FUNC_SIZE 30 258#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20 259#define TRACE_FILE_SIZE 20
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 274 __entry->func, __entry->file, __entry->correct)
319); 275);
320 276
321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
322
323 TRACE_KMEM_ALLOC,
324
325 F_STRUCT(
326 __field( enum kmemtrace_type_id, type_id )
327 __field( unsigned long, call_site )
328 __field( const void *, ptr )
329 __field( size_t, bytes_req )
330 __field( size_t, bytes_alloc )
331 __field( gfp_t, gfp_flags )
332 __field( int, node )
333 ),
334
335 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
336 " flags:%x node:%d",
337 __entry->type_id, __entry->call_site, __entry->ptr,
338 __entry->bytes_req, __entry->bytes_alloc,
339 __entry->gfp_flags, __entry->node)
340);
341
342FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
343
344 TRACE_KMEM_FREE,
345
346 F_STRUCT(
347 __field( enum kmemtrace_type_id, type_id )
348 __field( unsigned long, call_site )
349 __field( const void *, ptr )
350 ),
351
352 F_printk("type:%u call_site:%lx ptr:%p",
353 __entry->type_id, __entry->call_site, __entry->ptr)
354);
355
356FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
357
358 TRACE_KSYM,
359
360 F_STRUCT(
361 __field( unsigned long, ip )
362 __field( unsigned char, type )
363 __array( char , cmd, TASK_COMM_LEN )
364 __field( unsigned long, addr )
365 ),
366
367 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
368 (void *)__entry->ip, (unsigned int)__entry->type,
369 (void *)__entry->addr, __entry->cmd)
370);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 8a2b73f7c068..31cc4cb0dbf2 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,8 +9,6 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
13
14static char *perf_trace_buf[4]; 12static char *perf_trace_buf[4];
15 13
16/* 14/*
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
56 } 54 }
57 } 55 }
58 56
59 if (tp_event->class->reg) 57 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret) 58 if (ret)
67 goto fail; 59 goto fail;
68 60
@@ -96,11 +88,11 @@ int perf_trace_init(struct perf_event *p_event)
96 mutex_lock(&event_mutex); 88 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) { 89 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id && 90 if (tp_event->event.type == event_id &&
99 tp_event->class && 91 tp_event->class && tp_event->class->reg &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
94 if (ret)
95 module_put(tp_event->mod);
104 break; 96 break;
105 } 97 }
106 } 98 }
@@ -138,18 +130,13 @@ void perf_trace_destroy(struct perf_event *p_event)
138 if (--tp_event->perf_refcount > 0) 130 if (--tp_event->perf_refcount > 0)
139 goto out; 131 goto out;
140 132
141 if (tp_event->class->reg) 133 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147 134
148 /* 135 /*
149 * Ensure our callback won't be called anymore. See 136 * Ensure our callback won't be called anymore. The buffers
150 * tracepoint_probe_unregister() and __DO_TRACE(). 137 * will be freed after that.
151 */ 138 */
152 synchronize_sched(); 139 tracepoint_synchronize_unregister();
153 140
154 free_percpu(tp_event->perf_events); 141 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL; 142 tp_event->perf_events = NULL;
@@ -161,6 +148,7 @@ void perf_trace_destroy(struct perf_event *p_event)
161 } 148 }
162 } 149 }
163out: 150out:
151 module_put(tp_event->mod);
164 mutex_unlock(&event_mutex); 152 mutex_unlock(&event_mutex);
165} 153}
166 154
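
The trace_event_perf.c hunks above balance the module reference taken with try_module_get(): it is now dropped both when perf_trace_event_init() fails and, on the success path, when perf_trace_destroy() tears the event down. A hedged userspace sketch of keeping such a reference count balanced across the error and teardown paths; the demo_* names are invented.

#include <stdio.h>

/* Invented stand-in for a module reference count. */
static int demo_mod_refs;

static void demo_module_get(void) { demo_mod_refs++; }
static void demo_module_put(void) { demo_mod_refs--; }

/* Pretend event init that can fail. */
static int demo_event_init(int should_fail)
{
	return should_fail ? -1 : 0;
}

/*
 * The reference taken before init must be dropped again both on the
 * init error path and on destroy, so the count ends balanced either way.
 */
static int demo_perf_init(int should_fail)
{
	int ret;

	demo_module_get();
	ret = demo_event_init(should_fail);
	if (ret)
		demo_module_put();	/* error path: drop the reference */
	return ret;
}

static void demo_perf_destroy(void)
{
	demo_module_put();		/* teardown: drop the reference */
}

int main(void)
{
	if (demo_perf_init(1))		/* failing init */
		printf("init failed, refs = %d\n", demo_mod_refs);

	if (!demo_perf_init(0)) {	/* successful init ... */
		demo_perf_destroy();	/* ... later destroyed */
		printf("after destroy, refs = %d\n", demo_mod_refs);
	}
	return 0;
}
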
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..4c758f146328 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,6 +28,7 @@
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields);
31 32
32struct list_head * 33struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call) 34trace_get_fields(struct ftrace_event_call *event_call)
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call)
37 return event_call->class->get_fields(event_call); 38 return event_call->class->get_fields(event_call);
38} 39}
39 40
40int trace_define_field(struct ftrace_event_call *call, const char *type, 41static int __trace_define_field(struct list_head *head, const char *type,
41 const char *name, int offset, int size, int is_signed, 42 const char *name, int offset, int size,
42 int filter_type) 43 int is_signed, int filter_type)
43{ 44{
44 struct ftrace_event_field *field; 45 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
49 46
50 field = kzalloc(sizeof(*field), GFP_KERNEL); 47 field = kzalloc(sizeof(*field), GFP_KERNEL);
51 if (!field) 48 if (!field)
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
68 field->size = size; 65 field->size = size;
69 field->is_signed = is_signed; 66 field->is_signed = is_signed;
70 67
71 head = trace_get_fields(call);
72 list_add(&field->link, head); 68 list_add(&field->link, head);
73 69
74 return 0; 70 return 0;
@@ -80,17 +76,32 @@ err:
80 76
81 return -ENOMEM; 77 return -ENOMEM;
82} 78}
79
80int trace_define_field(struct ftrace_event_call *call, const char *type,
81 const char *name, int offset, int size, int is_signed,
82 int filter_type)
83{
84 struct list_head *head;
85
86 if (WARN_ON(!call->class))
87 return 0;
88
89 head = trace_get_fields(call);
90 return __trace_define_field(head, type, name, offset, size,
91 is_signed, filter_type);
92}
83EXPORT_SYMBOL_GPL(trace_define_field); 93EXPORT_SYMBOL_GPL(trace_define_field);
84 94
85#define __common_field(type, item) \ 95#define __common_field(type, item) \
86 ret = trace_define_field(call, #type, "common_" #item, \ 96 ret = __trace_define_field(&ftrace_common_fields, #type, \
87 offsetof(typeof(ent), item), \ 97 "common_" #item, \
88 sizeof(ent.item), \ 98 offsetof(typeof(ent), item), \
89 is_signed_type(type), FILTER_OTHER); \ 99 sizeof(ent.item), \
100 is_signed_type(type), FILTER_OTHER); \
90 if (ret) \ 101 if (ret) \
91 return ret; 102 return ret;
92 103
93static int trace_define_common_fields(struct ftrace_event_call *call) 104static int trace_define_common_fields(void)
94{ 105{
95 int ret; 106 int ret;
96 struct trace_entry ent; 107 struct trace_entry ent;
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call)
130} 141}
131EXPORT_SYMBOL_GPL(trace_event_raw_init); 142EXPORT_SYMBOL_GPL(trace_event_raw_init);
132 143
144int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
145{
146 switch (type) {
147 case TRACE_REG_REGISTER:
148 return tracepoint_probe_register(call->name,
149 call->class->probe,
150 call);
151 case TRACE_REG_UNREGISTER:
152 tracepoint_probe_unregister(call->name,
153 call->class->probe,
154 call);
155 return 0;
156
157#ifdef CONFIG_PERF_EVENTS
158 case TRACE_REG_PERF_REGISTER:
159 return tracepoint_probe_register(call->name,
160 call->class->perf_probe,
161 call);
162 case TRACE_REG_PERF_UNREGISTER:
163 tracepoint_probe_unregister(call->name,
164 call->class->perf_probe,
165 call);
166 return 0;
167#endif
168 }
169 return 0;
170}
171EXPORT_SYMBOL_GPL(ftrace_event_reg);
172
173void trace_event_enable_cmd_record(bool enable)
174{
175 struct ftrace_event_call *call;
176
177 mutex_lock(&event_mutex);
178 list_for_each_entry(call, &ftrace_events, list) {
179 if (!(call->flags & TRACE_EVENT_FL_ENABLED))
180 continue;
181
182 if (enable) {
183 tracing_start_cmdline_record();
184 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
185 } else {
186 tracing_stop_cmdline_record();
187 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
188 }
189 }
190 mutex_unlock(&event_mutex);
191}
192
133static int ftrace_event_enable_disable(struct ftrace_event_call *call, 193static int ftrace_event_enable_disable(struct ftrace_event_call *call,
134 int enable) 194 int enable)
135{ 195{
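
ftrace_event_reg() above gives ordinary trace events a single class->reg() callback that switches on the TRACE_REG_* request, so the later hunks can drop the open-coded tracepoint_probe_register()/unregister() fallbacks and simply require class->reg. A hedged userspace sketch of that dispatch shape; the demo_* names are invented and a plain flag stands in for probe registration.

#include <stdio.h>

enum demo_reg { DEMO_REG_REGISTER, DEMO_REG_UNREGISTER };

/* Invented stand-in for an event class with a probe callback. */
struct demo_event {
	const char *name;
	void (*probe)(const char *name);
	int registered;
};

static void demo_probe(const char *name)
{
	printf("probe fired for %s\n", name);
}

/* One reg() callback dispatching on the request type. */
static int demo_event_reg(struct demo_event *ev, enum demo_reg type)
{
	switch (type) {
	case DEMO_REG_REGISTER:
		ev->registered = 1;
		return 0;
	case DEMO_REG_UNREGISTER:
		ev->registered = 0;
		return 0;
	}
	return 0;
}

int main(void)
{
	struct demo_event ev = { .name = "sched_switch", .probe = demo_probe };

	demo_event_reg(&ev, DEMO_REG_REGISTER);
	if (ev.registered)
		ev.probe(ev.name);
	demo_event_reg(&ev, DEMO_REG_UNREGISTER);
	printf("registered = %d\n", ev.registered);
	return 0;
}
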
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
139 case 0: 199 case 0:
140 if (call->flags & TRACE_EVENT_FL_ENABLED) { 200 if (call->flags & TRACE_EVENT_FL_ENABLED) {
141 call->flags &= ~TRACE_EVENT_FL_ENABLED; 201 call->flags &= ~TRACE_EVENT_FL_ENABLED;
142 tracing_stop_cmdline_record(); 202 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
143 if (call->class->reg) 203 tracing_stop_cmdline_record();
144 call->class->reg(call, TRACE_REG_UNREGISTER); 204 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
145 else 205 }
146 tracepoint_probe_unregister(call->name, 206 call->class->reg(call, TRACE_REG_UNREGISTER);
147 call->class->probe,
148 call);
149 } 207 }
150 break; 208 break;
151 case 1: 209 case 1:
152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 210 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
153 tracing_start_cmdline_record(); 211 if (trace_flags & TRACE_ITER_RECORD_CMD) {
154 if (call->class->reg) 212 tracing_start_cmdline_record();
155 ret = call->class->reg(call, TRACE_REG_REGISTER); 213 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
156 else 214 }
157 ret = tracepoint_probe_register(call->name, 215 ret = call->class->reg(call, TRACE_REG_REGISTER);
158 call->class->probe,
159 call);
160 if (ret) { 216 if (ret) {
161 tracing_stop_cmdline_record(); 217 tracing_stop_cmdline_record();
162 pr_info("event trace: Could not enable event " 218 pr_info("event trace: Could not enable event "
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
194 mutex_lock(&event_mutex); 250 mutex_lock(&event_mutex);
195 list_for_each_entry(call, &ftrace_events, list) { 251 list_for_each_entry(call, &ftrace_events, list) {
196 252
197 if (!call->name || !call->class || 253 if (!call->name || !call->class || !call->class->reg)
198 (!call->class->probe && !call->class->reg))
199 continue; 254 continue;
200 255
201 if (match && 256 if (match &&
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
321 * The ftrace subsystem is for showing formats only. 376 * The ftrace subsystem is for showing formats only.
322 * They can not be enabled or disabled via the event files. 377 * They can not be enabled or disabled via the event files.
323 */ 378 */
324 if (call->class && (call->class->probe || call->class->reg)) 379 if (call->class && call->class->reg)
325 return call; 380 return call;
326 } 381 }
327 382
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
474 529
475 mutex_lock(&event_mutex); 530 mutex_lock(&event_mutex);
476 list_for_each_entry(call, &ftrace_events, list) { 531 list_for_each_entry(call, &ftrace_events, list) {
477 if (!call->name || !call->class || 532 if (!call->name || !call->class || !call->class->reg)
478 (!call->class->probe && !call->class->reg))
479 continue; 533 continue;
480 534
481 if (system && strcmp(call->class->system, system) != 0) 535 if (system && strcmp(call->class->system, system) != 0)
@@ -544,85 +598,165 @@ out:
544 return ret; 598 return ret;
545} 599}
546 600
547static ssize_t 601enum {
548event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 602 FORMAT_HEADER = 1,
549 loff_t *ppos) 603 FORMAT_PRINTFMT = 2,
604};
605
606static void *f_next(struct seq_file *m, void *v, loff_t *pos)
550{ 607{
551 struct ftrace_event_call *call = filp->private_data; 608 struct ftrace_event_call *call = m->private;
552 struct ftrace_event_field *field; 609 struct ftrace_event_field *field;
553 struct list_head *head; 610 struct list_head *head;
554 struct trace_seq *s;
555 int common_field_count = 5;
556 char *buf;
557 int r = 0;
558 611
559 if (*ppos) 612 (*pos)++;
560 return 0;
561 613
562 s = kmalloc(sizeof(*s), GFP_KERNEL); 614 switch ((unsigned long)v) {
563 if (!s) 615 case FORMAT_HEADER:
564 return -ENOMEM; 616 head = &ftrace_common_fields;
565 617
566 trace_seq_init(s); 618 if (unlikely(list_empty(head)))
619 return NULL;
567 620
568 trace_seq_printf(s, "name: %s\n", call->name); 621 field = list_entry(head->prev, struct ftrace_event_field, link);
569 trace_seq_printf(s, "ID: %d\n", call->event.type); 622 return field;
570 trace_seq_printf(s, "format:\n"); 623
624 case FORMAT_PRINTFMT:
625 /* all done */
626 return NULL;
627 }
571 628
572 head = trace_get_fields(call); 629 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
574 /*
575 * Smartly shows the array type(except dynamic array).
576 * Normal:
577 * field:TYPE VAR
578 * If TYPE := TYPE[LEN], it is shown:
579 * field:TYPE VAR[LEN]
580 */
581 const char *array_descriptor = strchr(field->type, '[');
582 630
583 if (!strncmp(field->type, "__data_loc", 10)) 631 /*
584 array_descriptor = NULL; 632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
585 636
586 if (!array_descriptor) { 637 field = v;
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" 638 /*
588 "\tsize:%u;\tsigned:%d;\n", 639 * If this is a common field, and at the end of the list, then
589 field->type, field->name, field->offset, 640 * continue with main list.
590 field->size, !!field->is_signed); 641 */
591 } else { 642 if (field->link.prev == &ftrace_common_fields) {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" 643 if (unlikely(list_empty(head)))
593 "\tsize:%u;\tsigned:%d;\n", 644 return NULL;
594 (int)(array_descriptor - field->type), 645 field = list_entry(head->prev, struct ftrace_event_field, link);
595 field->type, field->name, 646 /* Set the LSB to notify f_show to print an extra newline */
596 array_descriptor, field->offset, 647 field = (struct ftrace_event_field *)
597 field->size, !!field->is_signed); 648 ((unsigned long)field | 1);
598 } 649 return field;
650 }
599 651
600 if (--common_field_count == 0) 652 /* If we are done tell f_show to print the format */
601 r = trace_seq_printf(s, "\n"); 653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT;
602 655
603 if (!r) 656 field = list_entry(field->link.prev, struct ftrace_event_field, link);
604 break;
605 }
606 657
607 if (r) 658 return field;
608 r = trace_seq_printf(s, "\nprint fmt: %s\n", 659}
609 call->print_fmt);
610 660
611 if (!r) { 661static void *f_start(struct seq_file *m, loff_t *pos)
612 /* 662{
613 * ug! The format output is bigger than a PAGE!! 663 loff_t l = 0;
614 */ 664 void *p;
615 buf = "FORMAT TOO BIG\n"; 665
616 r = simple_read_from_buffer(ubuf, cnt, ppos, 666 /* Start by showing the header */
617 buf, strlen(buf)); 667 if (!*pos)
618 goto out; 668 return (void *)FORMAT_HEADER;
669
670 p = (void *)FORMAT_HEADER;
671 do {
672 p = f_next(m, p, &l);
673 } while (p && l < *pos);
674
675 return p;
676}
677
678static int f_show(struct seq_file *m, void *v)
679{
680 struct ftrace_event_call *call = m->private;
681 struct ftrace_event_field *field;
682 const char *array_descriptor;
683
684 switch ((unsigned long)v) {
685 case FORMAT_HEADER:
686 seq_printf(m, "name: %s\n", call->name);
687 seq_printf(m, "ID: %d\n", call->event.type);
688 seq_printf(m, "format:\n");
689 return 0;
690
691 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt);
694 return 0;
619 } 695 }
620 696
621 r = simple_read_from_buffer(ubuf, cnt, ppos, 697 /*
622 s->buffer, s->len); 698 * To separate common fields from event fields, the
623 out: 699 * LSB is set on the first event field. Clear it and
624 kfree(s); 700 * print a newline if it is set.
625 return r; 701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v;
708
709 /*
710 * Smartly shows the array type(except dynamic array).
711 * Normal:
712 * field:TYPE VAR
713 * If TYPE := TYPE[LEN], it is shown:
714 * field:TYPE VAR[LEN]
715 */
716 array_descriptor = strchr(field->type, '[');
717
718 if (!strncmp(field->type, "__data_loc", 10))
719 array_descriptor = NULL;
720
721 if (!array_descriptor)
722 seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
723 field->type, field->name, field->offset,
724 field->size, !!field->is_signed);
725 else
726 seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
727 (int)(array_descriptor - field->type),
728 field->type, field->name,
729 array_descriptor, field->offset,
730 field->size, !!field->is_signed);
731
732 return 0;
733}
734
735static void f_stop(struct seq_file *m, void *p)
736{
737}
738
739static const struct seq_operations trace_format_seq_ops = {
740 .start = f_start,
741 .next = f_next,
742 .stop = f_stop,
743 .show = f_show,
744};
745
746static int trace_format_open(struct inode *inode, struct file *file)
747{
748 struct ftrace_event_call *call = inode->i_private;
749 struct seq_file *m;
750 int ret;
751
752 ret = seq_open(file, &trace_format_seq_ops);
753 if (ret < 0)
754 return ret;
755
756 m = file->private_data;
757 m->private = call;
758
759 return 0;
626} 760}
627 761
628static ssize_t 762static ssize_t
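
The f_start()/f_next()/f_show() rewrite above streams the format file through seq_file and walks two lists, common fields first and then the event's own fields, while smuggling a marker in bit 0 of the cursor pointer so f_show() knows when to emit the separating blank line; that only works because the field structures are word-aligned, leaving bit 0 of any valid pointer clear. A standalone, hedged sketch of the pointer-tagging trick with invented names:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_field {
	const char *name;
};

#define TAG_BIT ((uintptr_t)1)

/* Set the least-significant bit of a pointer as an out-of-band flag. */
static void *tag(void *p)
{
	return (void *)((uintptr_t)p | TAG_BIT);
}

static int is_tagged(void *p)
{
	return ((uintptr_t)p & TAG_BIT) != 0;
}

static void *untag(void *p)
{
	return (void *)((uintptr_t)p & ~TAG_BIT);
}

static void show(void *v)
{
	/* A tagged pointer means "print a separator first"; clear the
	 * bit before dereferencing. */
	if (is_tagged(v)) {
		putchar('\n');
		v = untag(v);
	}
	printf("field: %s\n", ((struct demo_field *)v)->name);
}

int main(void)
{
	struct demo_field *common = malloc(sizeof(*common));
	struct demo_field *first_event = malloc(sizeof(*first_event));

	common->name = "common_pid";
	first_event->name = "prev_comm";

	show(common);			/* common field, no separator */
	show(tag(first_event));		/* first event field, separator printed */

	free(common);
	free(first_event);
	return 0;
}

The trick avoids carrying a second "print a newline now" flag through the seq_file cursor, at the cost of remembering to clear the bit before every dereference.
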
@@ -820,8 +954,10 @@ static const struct file_operations ftrace_enable_fops = {
820}; 954};
821 955
822static const struct file_operations ftrace_event_format_fops = { 956static const struct file_operations ftrace_event_format_fops = {
823 .open = tracing_open_generic, 957 .open = trace_format_open,
824 .read = event_format_read, 958 .read = seq_read,
959 .llseek = seq_lseek,
960 .release = seq_release,
825}; 961};
826 962
827static const struct file_operations ftrace_event_id_fops = { 963static const struct file_operations ftrace_event_id_fops = {
@@ -963,35 +1099,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 return -1; 1099 return -1;
964 } 1100 }
965 1101
966 if (call->class->probe || call->class->reg) 1102 if (call->class->reg)
967 trace_create_file("enable", 0644, call->dir, call, 1103 trace_create_file("enable", 0644, call->dir, call,
968 enable); 1104 enable);
969 1105
970#ifdef CONFIG_PERF_EVENTS 1106#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg)) 1107 if (call->event.type && call->class->reg)
972 trace_create_file("id", 0444, call->dir, call, 1108 trace_create_file("id", 0444, call->dir, call,
973 id); 1109 id);
974#endif 1110#endif
975 1111
976 if (call->class->define_fields) { 1112 /*
977 /* 1113 * Other events may have the same class. Only update
978 * Other events may have the same class. Only update 1114 * the fields if they are not already defined.
979 * the fields if they are not already defined. 1115 */
980 */ 1116 head = trace_get_fields(call);
981 head = trace_get_fields(call); 1117 if (list_empty(head)) {
982 if (list_empty(head)) { 1118 ret = call->class->define_fields(call);
983 ret = trace_define_common_fields(call); 1119 if (ret < 0) {
984 if (!ret) 1120 pr_warning("Could not initialize trace point"
985 ret = call->class->define_fields(call); 1121 " events/%s\n", call->name);
986 if (ret < 0) { 1122 return ret;
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
991 } 1123 }
992 trace_create_file("filter", 0644, call->dir, call,
993 filter);
994 } 1124 }
1125 trace_create_file("filter", 0644, call->dir, call,
1126 filter);
995 1127
996 trace_create_file("format", 0444, call->dir, call, 1128 trace_create_file("format", 0444, call->dir, call,
997 format); 1129 format);
@@ -999,11 +1131,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
999 return 0; 1131 return 0;
1000} 1132}
1001 1133
1002static int __trace_add_event_call(struct ftrace_event_call *call) 1134static int
1135__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1136 const struct file_operations *id,
1137 const struct file_operations *enable,
1138 const struct file_operations *filter,
1139 const struct file_operations *format)
1003{ 1140{
1004 struct dentry *d_events; 1141 struct dentry *d_events;
1005 int ret; 1142 int ret;
1006 1143
1144 /* The linker may leave blanks */
1007 if (!call->name) 1145 if (!call->name)
1008 return -EINVAL; 1146 return -EINVAL;
1009 1147
@@ -1011,8 +1149,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1011 ret = call->class->raw_init(call); 1149 ret = call->class->raw_init(call);
1012 if (ret < 0) { 1150 if (ret < 0) {
1013 if (ret != -ENOSYS) 1151 if (ret != -ENOSYS)
1014 pr_warning("Could not initialize trace " 1152 pr_warning("Could not initialize trace events/%s\n",
1015 "events/%s\n", call->name); 1153 call->name);
1016 return ret; 1154 return ret;
1017 } 1155 }
1018 } 1156 }
@@ -1021,11 +1159,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1021 if (!d_events) 1159 if (!d_events)
1022 return -ENOENT; 1160 return -ENOENT;
1023 1161
1024 ret = event_create_dir(call, d_events, &ftrace_event_id_fops, 1162 ret = event_create_dir(call, d_events, id, enable, filter, format);
1025 &ftrace_enable_fops, &ftrace_event_filter_fops,
1026 &ftrace_event_format_fops);
1027 if (!ret) 1163 if (!ret)
1028 list_add(&call->list, &ftrace_events); 1164 list_add(&call->list, &ftrace_events);
1165 call->mod = mod;
1029 1166
1030 return ret; 1167 return ret;
1031} 1168}
@@ -1035,7 +1172,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
1035{ 1172{
1036 int ret; 1173 int ret;
1037 mutex_lock(&event_mutex); 1174 mutex_lock(&event_mutex);
1038 ret = __trace_add_event_call(call); 1175 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1176 &ftrace_enable_fops,
1177 &ftrace_event_filter_fops,
1178 &ftrace_event_format_fops);
1039 mutex_unlock(&event_mutex); 1179 mutex_unlock(&event_mutex);
1040 return ret; 1180 return ret;
1041} 1181}
@@ -1152,8 +1292,6 @@ static void trace_module_add_events(struct module *mod)
1152{ 1292{
1153 struct ftrace_module_file_ops *file_ops = NULL; 1293 struct ftrace_module_file_ops *file_ops = NULL;
1154 struct ftrace_event_call *call, *start, *end; 1294 struct ftrace_event_call *call, *start, *end;
1155 struct dentry *d_events;
1156 int ret;
1157 1295
1158 start = mod->trace_events; 1296 start = mod->trace_events;
1159 end = mod->trace_events + mod->num_trace_events; 1297 end = mod->trace_events + mod->num_trace_events;
@@ -1161,38 +1299,14 @@ static void trace_module_add_events(struct module *mod)
1161 if (start == end) 1299 if (start == end)
1162 return; 1300 return;
1163 1301
1164 d_events = event_trace_events_dir(); 1302 file_ops = trace_create_file_ops(mod);
1165 if (!d_events) 1303 if (!file_ops)
1166 return; 1304 return;
1167 1305
1168 for_each_event(call, start, end) { 1306 for_each_event(call, start, end) {
1169 /* The linker may leave blanks */ 1307 __trace_add_event_call(call, mod,
1170 if (!call->name)
1171 continue;
1172 if (call->class->raw_init) {
1173 ret = call->class->raw_init(call);
1174 if (ret < 0) {
1175 if (ret != -ENOSYS)
1176 pr_warning("Could not initialize trace "
1177 "point events/%s\n", call->name);
1178 continue;
1179 }
1180 }
1181 /*
1182 * This module has events, create file ops for this module
1183 * if not already done.
1184 */
1185 if (!file_ops) {
1186 file_ops = trace_create_file_ops(mod);
1187 if (!file_ops)
1188 return;
1189 }
1190 call->mod = mod;
1191 ret = event_create_dir(call, d_events,
1192 &file_ops->id, &file_ops->enable, 1308 &file_ops->id, &file_ops->enable,
1193 &file_ops->filter, &file_ops->format); 1309 &file_ops->filter, &file_ops->format);
1194 if (!ret)
1195 list_add(&call->list, &ftrace_events);
1196 } 1310 }
1197} 1311}
1198 1312
@@ -1319,25 +1433,14 @@ static __init int event_trace_init(void)
1319 trace_create_file("enable", 0644, d_events, 1433 trace_create_file("enable", 0644, d_events,
1320 NULL, &ftrace_system_enable_fops); 1434 NULL, &ftrace_system_enable_fops);
1321 1435
1436 if (trace_define_common_fields())
1437 pr_warning("tracing: Failed to allocate common fields");
1438
1322 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1439 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1323 /* The linker may leave blanks */ 1440 __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1324 if (!call->name)
1325 continue;
1326 if (call->class->raw_init) {
1327 ret = call->class->raw_init(call);
1328 if (ret < 0) {
1329 if (ret != -ENOSYS)
1330 pr_warning("Could not initialize trace "
1331 "point events/%s\n", call->name);
1332 continue;
1333 }
1334 }
1335 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1336 &ftrace_enable_fops, 1441 &ftrace_enable_fops,
1337 &ftrace_event_filter_fops, 1442 &ftrace_event_filter_fops,
1338 &ftrace_event_format_fops); 1443 &ftrace_event_format_fops);
1339 if (!ret)
1340 list_add(&call->list, &ftrace_events);
1341 } 1444 }
1342 1445
1343 while (true) { 1446 while (true) {
@@ -1524,12 +1627,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1524 struct ftrace_entry *entry; 1627 struct ftrace_entry *entry;
1525 unsigned long flags; 1628 unsigned long flags;
1526 long disabled; 1629 long disabled;
1527 int resched;
1528 int cpu; 1630 int cpu;
1529 int pc; 1631 int pc;
1530 1632
1531 pc = preempt_count(); 1633 pc = preempt_count();
1532 resched = ftrace_preempt_disable(); 1634 preempt_disable_notrace();
1533 cpu = raw_smp_processor_id(); 1635 cpu = raw_smp_processor_id();
1534 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1636 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1535 1637
@@ -1551,7 +1653,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1551 1653
1552 out: 1654 out:
1553 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1655 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1554 ftrace_preempt_enable(resched); 1656 preempt_enable_notrace();
1555} 1657}
1556 1658
1557static struct ftrace_ops trace_ops __initdata = 1659static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system,
497} 497}
498 498
499static struct ftrace_event_field * 499static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500__find_event_field(struct list_head *head, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
504 503
505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) { 504 list_for_each_entry(field, head, link) {
507 if (!strcmp(field->name, name)) 505 if (!strcmp(field->name, name))
508 return field; 506 return field;
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
511 return NULL; 509 return NULL;
512} 510}
513 511
512static struct ftrace_event_field *
513find_event_field(struct ftrace_event_call *call, char *name)
514{
515 struct ftrace_event_field *field;
516 struct list_head *head;
517
518 field = __find_event_field(&ftrace_common_fields, name);
519 if (field)
520 return field;
521
522 head = trace_get_fields(call);
523 return __find_event_field(head, name);
524}
525
514static void filter_free_pred(struct filter_pred *pred) 526static void filter_free_pred(struct filter_pred *pred)
515{ 527{
516 if (!pred) 528 if (!pred)
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system)
627 int err; 639 int err;
628 640
629 list_for_each_entry(call, &ftrace_events, list) { 641 list_for_each_entry(call, &ftrace_events, list) {
630 if (!call->class || !call->class->define_fields)
631 continue;
632
633 if (strcmp(call->class->system, system->name) != 0) 642 if (strcmp(call->class->system, system->name) != 0)
634 continue; 643 continue;
635 644
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
646 struct ftrace_event_call *call; 655 struct ftrace_event_call *call;
647 656
648 list_for_each_entry(call, &ftrace_events, list) { 657 list_for_each_entry(call, &ftrace_events, list) {
649 if (!call->class || !call->class->define_fields)
650 continue;
651
652 if (strcmp(call->class->system, system->name) != 0) 658 if (strcmp(call->class->system, system->name) != 0)
653 continue; 659 continue;
654 660
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system,
1251 list_for_each_entry(call, &ftrace_events, list) { 1257 list_for_each_entry(call, &ftrace_events, list) {
1252 struct event_filter *filter = call->filter; 1258 struct event_filter *filter = call->filter;
1253 1259
1254 if (!call->class || !call->class->define_fields)
1255 continue;
1256
1257 if (strcmp(call->class->system, system->name) != 0) 1260 if (strcmp(call->class->system, system->name) != 0)
1258 continue; 1261 continue;
1259 1262
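The filter change above splits the field lookup in two: __find_event_field() walks one list, and find_event_field() now searches the shared ftrace_common_fields list before the event's own fields from trace_get_fields(call). A standalone sketch of that lookup order, using plain arrays instead of the kernel's list_head lists (illustrative only, not part of the patch):

#include <stdio.h>
#include <string.h>

struct field { const char *name; };

/* Search one table; mirrors what __find_event_field() does on a list. */
static const struct field *find_in(const struct field *tbl, int n,
                                   const char *name)
{
        for (int i = 0; i < n; i++)
                if (strcmp(tbl[i].name, name) == 0)
                        return &tbl[i];
        return NULL;
}

int main(void)
{
        static const struct field common[] = { { "common_pid" }, { "common_type" } };
        static const struct field event[]  = { { "ip" }, { "parent_ip" } };
        const char *want = "common_pid";

        /* Common fields first, then the event-specific ones. */
        const struct field *f = find_in(common, 2, want);
        if (!f)
                f = find_in(event, 2, want);
        printf("%s -> %s\n", want, f ? "found" : "missing");
        return 0;
}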
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 8536e2a65969..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
125 125
126#include "trace_entries.h" 126#include "trace_entries.h"
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->class->fields);
131 return 0;
132}
133
134#undef __entry 128#undef __entry
135#define __entry REC 129#define __entry REC
136 130
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
158struct ftrace_event_class event_class_ftrace_##call = { \ 152struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 153 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 154 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \ 155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 156}; \
163 \ 157 \
164struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
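With the field lists set up elsewhere, the ftrace-internal event classes no longer need a raw_init callback at all: the fields list is initialized statically via LIST_HEAD_INIT instead of at runtime. A standalone illustration of that static-initialization idea; the struct and macro below only mirror the kernel's list_head/LIST_HEAD_INIT, and the class name is made up:

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
#define LIST_HEAD_INIT(name) { &(name), &(name) }

struct event_class {
        const char *system;
        struct list_head fields;
};

/* Build-time initialization: no runtime init hook required. */
static struct event_class example_class = {
        .system = "ftrace",
        .fields = LIST_HEAD_INIT(example_class.fields),
};

int main(void)
{
        printf("fields list starts empty: %s\n",
               example_class.fields.next == &example_class.fields ? "yes" : "no");
        return 0;
}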
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..6f233698518e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter,
507 * if the output fails. 507 * if the output fails.
508 */ 508 */
509 data->ent = *curr; 509 data->ent = *curr;
510 data->ret = *next; 510 /*
511 * If the next event is not a return type, then
512 * we only care about what type it is. Otherwise we can
513 * safely copy the entire event.
514 */
515 if (next->ent.type == TRACE_GRAPH_RET)
516 data->ret = *next;
517 else
518 data->ret.ent.type = next->ent.type;
511 } 519 }
512 } 520 }
513 521
@@ -641,7 +649,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
641 649
642 /* Print nsecs (we don't want to exceed 7 numbers) */ 650 /* Print nsecs (we don't want to exceed 7 numbers) */
643 if (len < 7) { 651 if (len < 7) {
644 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); 652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
653 nsecs_rem);
645 ret = trace_seq_printf(s, ".%s", nsecs_str); 654 ret = trace_seq_printf(s, ".%s", nsecs_str);
646 if (!ret) 655 if (!ret)
647 return TRACE_TYPE_PARTIAL_LINE; 656 return TRACE_TYPE_PARTIAL_LINE;
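The one-line change above clamps the snprintf() size to the destination buffer as well as to the remaining digit budget, so a small "len" can no longer make 8 - len exceed sizeof(nsecs_str). A standalone illustration of the clamp (plain C; the buffer is kept small like the kernel's):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        char nsecs_str[5];              /* small destination buffer */
        unsigned long len = 2;          /* digits already used for usecs */
        unsigned long nsecs_rem = 123;

        /* 8 - len would be 6 here, larger than the buffer; the clamp wins. */
        snprintf(nsecs_str, MIN(sizeof(nsecs_str), 8UL - len), "%03lu", nsecs_rem);
        printf(".%s\n", nsecs_str);     /* prints .123, never overruns nsecs_str */
        return 0;
}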
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..73a6b0601f2e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly =
649#endif 649#endif
650 .open = irqsoff_trace_open, 650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close, 651 .close = irqsoff_trace_close,
652 .use_max_tr = 1,
652}; 653};
653# define register_irqsoff(trace) register_tracer(&trace) 654# define register_irqsoff(trace) register_tracer(&trace)
654#else 655#else
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly =
681#endif 682#endif
682 .open = irqsoff_trace_open, 683 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close, 684 .close = irqsoff_trace_close,
685 .use_max_tr = 1,
684}; 686};
685# define register_preemptoff(trace) register_tracer(&trace) 687# define register_preemptoff(trace) register_tracer(&trace)
686#else 688#else
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
715#endif 717#endif
716 .open = irqsoff_trace_open, 718 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close, 719 .close = irqsoff_trace_close,
720 .use_max_tr = 1,
718}; 721};
719 722
720# define register_preemptirqsoff(trace) register_tracer(&trace) 723# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..7b8ecd751d93
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,136 @@
1/*
2 * kdb helper for dumping the ftrace buffer
3 *
4 * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
5 *
6 * ftrace_dump_buf based on ftrace_dump:
7 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
8 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
9 *
10 */
11#include <linux/init.h>
12#include <linux/kgdb.h>
13#include <linux/kdb.h>
14#include <linux/ftrace.h>
15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h"
18#include "trace_output.h"
19
20static void ftrace_dump_buf(int skip_lines, long cpu_file)
21{
22 /* use static because iter can be a bit big for the stack */
23 static struct trace_iterator iter;
24 unsigned int old_userobj;
25 int cnt = 0, cpu;
26
27 trace_init_global_iter(&iter);
28
29 for_each_tracing_cpu(cpu) {
30 atomic_inc(&iter.tr->data[cpu]->disabled);
31 }
32
33 old_userobj = trace_flags;
34
35 /* don't look at user memory in panic mode */
36 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
37
38 kdb_printf("Dumping ftrace buffer:\n");
39
40 /* reset all but tr, trace, and overruns */
41 memset(&iter.seq, 0,
42 sizeof(struct trace_iterator) -
43 offsetof(struct trace_iterator, seq));
44 iter.iter_flags |= TRACE_FILE_LAT_FMT;
45 iter.pos = -1;
46
47 if (cpu_file == TRACE_PIPE_ALL_CPU) {
48 for_each_tracing_cpu(cpu) {
49 iter.buffer_iter[cpu] =
50 ring_buffer_read_prepare(iter.tr->buffer, cpu);
51 ring_buffer_read_start(iter.buffer_iter[cpu]);
52 tracing_iter_reset(&iter, cpu);
53 }
54 } else {
55 iter.cpu_file = cpu_file;
56 iter.buffer_iter[cpu_file] =
57 ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
58 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
59 tracing_iter_reset(&iter, cpu_file);
60 }
61 if (!trace_empty(&iter))
62 trace_find_next_entry_inc(&iter);
63 while (!trace_empty(&iter)) {
64 if (!cnt)
65 kdb_printf("---------------------------------\n");
66 cnt++;
67
68 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
69 print_trace_line(&iter);
70 if (!skip_lines)
71 trace_printk_seq(&iter.seq);
72 else
73 skip_lines--;
74 if (KDB_FLAG(CMD_INTERRUPT))
75 goto out;
76 }
77
78 if (!cnt)
79 kdb_printf(" (ftrace buffer empty)\n");
80 else
81 kdb_printf("---------------------------------\n");
82
83out:
84 trace_flags = old_userobj;
85
86 for_each_tracing_cpu(cpu) {
87 atomic_dec(&iter.tr->data[cpu]->disabled);
88 }
89
90 for_each_tracing_cpu(cpu)
91 if (iter.buffer_iter[cpu])
92 ring_buffer_read_finish(iter.buffer_iter[cpu]);
93}
94
95/*
96 * kdb_ftdump - Dump the ftrace log buffer
97 */
98static int kdb_ftdump(int argc, const char **argv)
99{
100 int skip_lines = 0;
101 long cpu_file;
102 char *cp;
103
104 if (argc > 2)
105 return KDB_ARGCOUNT;
106
107 if (argc) {
108 skip_lines = simple_strtol(argv[1], &cp, 0);
109 if (*cp)
110 skip_lines = 0;
111 }
112
113 if (argc == 2) {
114 cpu_file = simple_strtol(argv[2], &cp, 0);
115 if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
116 !cpu_online(cpu_file))
117 return KDB_BADINT;
118 } else {
119 cpu_file = TRACE_PIPE_ALL_CPU;
120 }
121
122 kdb_trap_printk++;
123 ftrace_dump_buf(skip_lines, cpu_file);
124 kdb_trap_printk--;
125
126 return 0;
127}
128
129static __init int kdb_ftrace_register(void)
130{
131 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
132 "Dump ftrace log", 0, KDB_REPEAT_NONE);
133 return 0;
134}
135
136late_initcall(kdb_ftrace_register);
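The new trace_kdb.c wires an "ftdump" command into kdb: from the debugger prompt, "ftdump" dumps the whole ftrace buffer, "ftdump 20" skips the first 20 lines, and "ftdump 20 1" additionally restricts the dump to CPU 1. For orientation, a minimal kdb command registered the same way; kernel context assumed, and the "hello" command and its output are hypothetical, not part of the patch:

#include <linux/init.h>
#include <linux/kdb.h>

static int kdb_hello(int argc, const char **argv)
{
        if (argc > 1)
                return KDB_ARGCOUNT;
        kdb_printf("hello from kdb%s%s\n", argc ? ", " : "", argc ? argv[1] : "");
        return 0;
}

static __init int kdb_hello_register(void)
{
        kdb_register_repeat("hello", kdb_hello, "[name]",
                            "Print a greeting", 0, KDB_REPEAT_NONE);
        return 0;
}
late_initcall(kdb_hello_register);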
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..544301d29dee 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,8 @@
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <linux/uaccess.h>
33#include <asm/bitsperlong.h> 35#include <asm/bitsperlong.h>
34 36
35#include "trace.h" 37#include "trace.h"
@@ -38,6 +40,7 @@
38#define MAX_TRACE_ARGS 128 40#define MAX_TRACE_ARGS 128
39#define MAX_ARGSTR_LEN 63 41#define MAX_ARGSTR_LEN 63
40#define MAX_EVENT_NAME_LEN 64 42#define MAX_EVENT_NAME_LEN 64
43#define MAX_STRING_SIZE PATH_MAX
41#define KPROBE_EVENT_SYSTEM "kprobes" 44#define KPROBE_EVENT_SYSTEM "kprobes"
42 45
43/* Reserved field names */ 46/* Reserved field names */
@@ -58,14 +61,16 @@ const char *reserved_field_names[] = {
58}; 61};
59 62
60/* Printing function type */ 63/* Printing function type */
61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); 64typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
65 void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type 66#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type 67#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64 68
65/* Printing in basic type function template */ 69/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 70#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 71static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\ 72 const char *name, \
73 void *data, void *ent)\
69{ \ 74{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 75 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \ 76} \
@@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) 85DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) 86DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82 87
88/* data_rloc: data relative location, compatible with u32 */
89#define make_data_rloc(len, roffs) \
90 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
91#define get_rloc_len(dl) ((u32)(dl) >> 16)
92#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
93
94static inline void *get_rloc_data(u32 *dl)
95{
96 return (u8 *)dl + get_rloc_offs(*dl);
97}
98
99/* For data_loc conversion */
100static inline void *get_loc_data(u32 *dl, void *ent)
101{
102 return (u8 *)ent + get_rloc_offs(*dl);
103}
104
105/*
106 * Convert data_rloc to data_loc:
107 * data_rloc stores the offset from data_rloc itself, but data_loc
108 * stores the offset from event entry.
109 */
110#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
111
112/* For defining macros, define string/string_size types */
113typedef u32 string;
114typedef u32 string_size;
115
116/* Print type function for string type */
117static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
118 const char *name,
119 void *data, void *ent)
120{
121 int len = *(u32 *)data >> 16;
122
123 if (!len)
124 return trace_seq_printf(s, " %s=(fault)", name);
125 else
126 return trace_seq_printf(s, " %s=\"%s\"", name,
127 (const char *)get_loc_data(data, ent));
128}
129static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
130
83/* Data fetch function type */ 131/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 132typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85 133
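The block above packs a dynamic argument's length and offset into a single u32: length in the upper 16 bits, offset in the lower 16, first relative to the slot itself (data_rloc) and later rebased to the event entry via convert_rloc_to_loc(). A standalone illustration of the arithmetic (plain C mirroring the macros above; the concrete numbers are made up):

#include <stdio.h>
#include <stdint.h>

#define make_data_rloc(len, roffs) (((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
#define get_rloc_len(dl)           ((uint32_t)(dl) >> 16)
#define get_rloc_offs(dl)          ((uint32_t)(dl) & 0xffff)

int main(void)
{
        uint32_t dl = make_data_rloc(12, 40);   /* 12-byte string, 40 bytes past the slot */

        printf("len=%u offs=%u\n", get_rloc_len(dl), get_rloc_offs(dl));

        /*
         * convert_rloc_to_loc(): rebase the offset from "relative to this
         * slot" to "relative to the event entry" by adding the slot's own
         * position inside the entry (say the slot sits at offset 16).
         */
        uint32_t loc = dl + 16;
        printf("entry-relative offs=%u, len unchanged=%u\n",
               get_rloc_offs(loc), get_rloc_len(loc));
        return 0;
}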
@@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
94 return fprm->fn(regs, fprm->data, dest); 142 return fprm->fn(regs, fprm->data, dest);
95} 143}
96 144
97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type 145#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
98/* 146/*
99 * Define macro for basic types - we don't need to define s* types, because 147 * Define macro for basic types - we don't need to define s* types, because
100 * we have to care only about bitwidth at recording time. 148 * we have to care only about bitwidth at recording time.
101 */ 149 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \ 150#define DEFINE_BASIC_FETCH_FUNCS(method) \
103DEFINE_FETCH_##kind(u8) \ 151DEFINE_FETCH_##method(u8) \
104DEFINE_FETCH_##kind(u16) \ 152DEFINE_FETCH_##method(u16) \
105DEFINE_FETCH_##kind(u32) \ 153DEFINE_FETCH_##method(u32) \
106DEFINE_FETCH_##kind(u64) 154DEFINE_FETCH_##method(u64)
107 155
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ 156#define CHECK_FETCH_FUNCS(method, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \ 157 (((FETCH_FUNC_NAME(method, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \ 158 (FETCH_FUNC_NAME(method, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \ 159 (FETCH_FUNC_NAME(method, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn)) 160 (FETCH_FUNC_NAME(method, u64) == fn) || \
161 (FETCH_FUNC_NAME(method, string) == fn) || \
162 (FETCH_FUNC_NAME(method, string_size) == fn)) \
163 && (fn != NULL))
113 164
114/* Data fetch function templates */ 165/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \ 166#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 167static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \ 168 void *offset, void *dest) \
118{ \ 169{ \
119 *(type *)dest = (type)regs_get_register(regs, \ 170 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \ 171 (unsigned int)((unsigned long)offset)); \
121} 172}
122DEFINE_BASIC_FETCH_FUNCS(reg) 173DEFINE_BASIC_FETCH_FUNCS(reg)
174/* No string on the register */
175#define fetch_reg_string NULL
176#define fetch_reg_string_size NULL
123 177
124#define DEFINE_FETCH_stack(type) \ 178#define DEFINE_FETCH_stack(type) \
125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 179static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
129 (unsigned int)((unsigned long)offset)); \ 183 (unsigned int)((unsigned long)offset)); \
130} 184}
131DEFINE_BASIC_FETCH_FUNCS(stack) 185DEFINE_BASIC_FETCH_FUNCS(stack)
186/* No string on the stack entry */
187#define fetch_stack_string NULL
188#define fetch_stack_string_size NULL
132 189
133#define DEFINE_FETCH_retval(type) \ 190#define DEFINE_FETCH_retval(type) \
134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 191static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
137 *(type *)dest = (type)regs_return_value(regs); \ 194 *(type *)dest = (type)regs_return_value(regs); \
138} 195}
139DEFINE_BASIC_FETCH_FUNCS(retval) 196DEFINE_BASIC_FETCH_FUNCS(retval)
197/* No string on the retval */
198#define fetch_retval_string NULL
199#define fetch_retval_string_size NULL
140 200
141#define DEFINE_FETCH_memory(type) \ 201#define DEFINE_FETCH_memory(type) \
142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 202static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
149 *(type *)dest = retval; \ 209 *(type *)dest = retval; \
150} 210}
151DEFINE_BASIC_FETCH_FUNCS(memory) 211DEFINE_BASIC_FETCH_FUNCS(memory)
212/*
213 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
214 * length and relative data location.
215 */
216static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
217 void *addr, void *dest)
218{
219 long ret;
220 int maxlen = get_rloc_len(*(u32 *)dest);
221 u8 *dst = get_rloc_data(dest);
222 u8 *src = addr;
223 mm_segment_t old_fs = get_fs();
224 if (!maxlen)
225 return;
226 /*
227 * Try to get string again, since the string can be changed while
228 * probing.
229 */
230 set_fs(KERNEL_DS);
231 pagefault_disable();
232 do
233 ret = __copy_from_user_inatomic(dst++, src++, 1);
234 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
235 dst[-1] = '\0';
236 pagefault_enable();
237 set_fs(old_fs);
238
239 if (ret < 0) { /* Failed to fetch string */
240 ((u8 *)get_rloc_data(dest))[0] = '\0';
241 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
242 } else
243 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
244 get_rloc_offs(*(u32 *)dest));
245}
246/* Return the length of the string -- including the null terminating byte */
247static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
248 void *addr, void *dest)
249{
250 int ret, len = 0;
251 u8 c;
252 mm_segment_t old_fs = get_fs();
253
254 set_fs(KERNEL_DS);
255 pagefault_disable();
256 do {
257 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
258 len++;
259 } while (c && ret == 0 && len < MAX_STRING_SIZE);
260 pagefault_enable();
261 set_fs(old_fs);
262
263 if (ret < 0) /* Failed to check the length */
264 *(u32 *)dest = 0;
265 else
266 *(u32 *)dest = len;
267}
152 268
153/* Memory fetching by symbol */ 269/* Memory fetching by symbol */
154struct symbol_cache { 270struct symbol_cache {
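The memory string fetchers above copy or measure at most a bounded number of bytes, one byte at a time through __copy_from_user_inatomic() with page faults disabled, stopping at the terminating NUL or on a fault, which keeps them safe to run from a kprobe handler. A standalone analogue of the bounded length count using ordinary memory reads (illustrative only; the bound here stands in for MAX_STRING_SIZE/PATH_MAX):

#include <stdio.h>

#define MAX_STRING_SIZE 4096

/* Count a string's length including the terminating NUL, never reading
 * past the bound; mirrors the loop in FETCH_FUNC_NAME(memory, string_size). */
static int bounded_strsize(const char *addr)
{
        int len = 0;
        char c;

        do {
                c = addr[len];
                len++;
        } while (c && len < MAX_STRING_SIZE);
        return len;
}

int main(void)
{
        printf("%d\n", bounded_strsize("probe"));       /* prints 6 */
        return 0;
}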
@@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
203 *(type *)dest = 0; \ 319 *(type *)dest = 0; \
204} 320}
205DEFINE_BASIC_FETCH_FUNCS(symbol) 321DEFINE_BASIC_FETCH_FUNCS(symbol)
322DEFINE_FETCH_symbol(string)
323DEFINE_FETCH_symbol(string_size)
206 324
207/* Dereference memory access function */ 325/* Dereference memory access function */
208struct deref_fetch_param { 326struct deref_fetch_param {
@@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
224 *(type *)dest = 0; \ 342 *(type *)dest = 0; \
225} 343}
226DEFINE_BASIC_FETCH_FUNCS(deref) 344DEFINE_BASIC_FETCH_FUNCS(deref)
345DEFINE_FETCH_deref(string)
346DEFINE_FETCH_deref(string_size)
227 347
228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 348static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
229{ 349{
230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) 350 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
231 free_deref_fetch_param(data->orig.data); 351 free_deref_fetch_param(data->orig.data);
232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) 352 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
233 free_symbol_cache(data->orig.data); 353 free_symbol_cache(data->orig.data);
234 kfree(data); 354 kfree(data);
235} 355}
@@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) 360#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) 361#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242 362
243#define ASSIGN_FETCH_FUNC(kind, type) \ 363/* Fetch types */
244 .kind = FETCH_FUNC_NAME(kind, type) 364enum {
245 365 FETCH_MTD_reg = 0,
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ 366 FETCH_MTD_stack,
247 {.name = #ptype, \ 367 FETCH_MTD_retval,
248 .size = sizeof(ftype), \ 368 FETCH_MTD_memory,
249 .is_signed = sign, \ 369 FETCH_MTD_symbol,
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \ 370 FETCH_MTD_deref,
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \ 371 FETCH_MTD_END,
252ASSIGN_FETCH_FUNC(reg, ftype), \ 372};
253ASSIGN_FETCH_FUNC(stack, ftype), \ 373
254ASSIGN_FETCH_FUNC(retval, ftype), \ 374#define ASSIGN_FETCH_FUNC(method, type) \
255ASSIGN_FETCH_FUNC(memory, ftype), \ 375 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
256ASSIGN_FETCH_FUNC(symbol, ftype), \ 376
257ASSIGN_FETCH_FUNC(deref, ftype), \ 377#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
378 {.name = _name, \
379 .size = _size, \
380 .is_signed = sign, \
381 .print = PRINT_TYPE_FUNC_NAME(ptype), \
382 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
383 .fmttype = _fmttype, \
384 .fetch = { \
385ASSIGN_FETCH_FUNC(reg, ftype), \
386ASSIGN_FETCH_FUNC(stack, ftype), \
387ASSIGN_FETCH_FUNC(retval, ftype), \
388ASSIGN_FETCH_FUNC(memory, ftype), \
389ASSIGN_FETCH_FUNC(symbol, ftype), \
390ASSIGN_FETCH_FUNC(deref, ftype), \
391 } \
258 } 392 }
259 393
394#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
395 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
396
397#define FETCH_TYPE_STRING 0
398#define FETCH_TYPE_STRSIZE 1
399
260/* Fetch type information table */ 400/* Fetch type information table */
261static const struct fetch_type { 401static const struct fetch_type {
262 const char *name; /* Name of type */ 402 const char *name; /* Name of type */
@@ -264,14 +404,16 @@ static const struct fetch_type {
264 int is_signed; /* Signed flag */ 404 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */ 405 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Format string */ 406
407 const char *fmttype; /* Name in format file */
267 /* Fetch functions */ 408 /* Fetch functions */
268 fetch_func_t reg; 409 fetch_func_t fetch[FETCH_MTD_END];
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = { 410} fetch_type_table[] = {
411 /* Special types */
412 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
413 sizeof(u32), 1, "__data_loc char[]"),
414 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
415 string_size, sizeof(u32), 0, "u32"),
416 /* Basic types */
275 ASSIGN_FETCH_TYPE(u8, u8, 0), 417 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0), 418 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0), 419 ASSIGN_FETCH_TYPE(u32, u32, 0),
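Replacing the per-method struct members with a fetch[] array indexed by FETCH_MTD_* lets the string and string_size entries share the same table layout, and lets get_fetch_size_function() in the next hunk map a fetch function to its size-measuring counterpart purely by index. A standalone sketch of table-indexed dispatch (plain C; the method names and toy fetch functions are made up):

#include <stdio.h>

enum { MTD_REG, MTD_STACK, MTD_MEMORY, MTD_END };

typedef long (*fetch_fn)(long ctx);

static long fetch_reg(long ctx)    { return ctx + 1; }
static long fetch_stack(long ctx)  { return ctx + 2; }
static long fetch_memory(long ctx) { return ctx + 3; }

struct fetch_type {
        const char *name;
        fetch_fn fetch[MTD_END];        /* indexed by method, not by member name */
};

static const struct fetch_type example_type = {
        .name  = "u64",
        .fetch = {
                [MTD_REG]    = fetch_reg,
                [MTD_STACK]  = fetch_stack,
                [MTD_MEMORY] = fetch_memory,
        },
};

int main(void)
{
        printf("%s via memory: %ld\n", example_type.name,
               example_type.fetch[MTD_MEMORY](10));
        return 0;
}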
@@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
302 *(unsigned long *)dest = kernel_stack_pointer(regs); 444 *(unsigned long *)dest = kernel_stack_pointer(regs);
303} 445}
304 446
447static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
448 fetch_func_t orig_fn)
449{
450 int i;
451
452 if (type != &fetch_type_table[FETCH_TYPE_STRING])
453 return NULL; /* Only string type needs size function */
454 for (i = 0; i < FETCH_MTD_END; i++)
455 if (type->fetch[i] == orig_fn)
456 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
457
458 WARN_ON(1); /* This should not happen */
459 return NULL;
460}
461
305/** 462/**
306 * Kprobe event core functions 463 * Kprobe event core functions
307 */ 464 */
308 465
309struct probe_arg { 466struct probe_arg {
310 struct fetch_param fetch; 467 struct fetch_param fetch;
468 struct fetch_param fetch_size;
311 unsigned int offset; /* Offset from argument entry */ 469 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */ 470 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */ 471 const char *comm; /* Command of this argument */
@@ -356,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
356static int kretprobe_dispatcher(struct kretprobe_instance *ri, 514static int kretprobe_dispatcher(struct kretprobe_instance *ri,
357 struct pt_regs *regs); 515 struct pt_regs *regs);
358 516
359/* Check the name is good for event/group */ 517/* Check the name is good for event/group/fields */
360static int check_event_name(const char *name) 518static int is_good_name(const char *name)
361{ 519{
362 if (!isalpha(*name) && *name != '_') 520 if (!isalpha(*name) && *name != '_')
363 return 0; 521 return 0;
@@ -399,7 +557,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
399 else 557 else
400 tp->rp.kp.pre_handler = kprobe_dispatcher; 558 tp->rp.kp.pre_handler = kprobe_dispatcher;
401 559
402 if (!event || !check_event_name(event)) { 560 if (!event || !is_good_name(event)) {
403 ret = -EINVAL; 561 ret = -EINVAL;
404 goto error; 562 goto error;
405 } 563 }
@@ -409,7 +567,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
409 if (!tp->call.name) 567 if (!tp->call.name)
410 goto error; 568 goto error;
411 569
412 if (!group || !check_event_name(group)) { 570 if (!group || !is_good_name(group)) {
413 ret = -EINVAL; 571 ret = -EINVAL;
414 goto error; 572 goto error;
415 } 573 }
@@ -429,9 +587,9 @@ error:
429 587
430static void free_probe_arg(struct probe_arg *arg) 588static void free_probe_arg(struct probe_arg *arg)
431{ 589{
432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) 590 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data); 591 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) 592 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
435 free_symbol_cache(arg->fetch.data); 593 free_symbol_cache(arg->fetch.data);
436 kfree(arg->name); 594 kfree(arg->name);
437 kfree(arg->comm); 595 kfree(arg->comm);
@@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
548 706
549 if (strcmp(arg, "retval") == 0) { 707 if (strcmp(arg, "retval") == 0) {
550 if (is_return) 708 if (is_return)
551 f->fn = t->retval; 709 f->fn = t->fetch[FETCH_MTD_retval];
552 else 710 else
553 ret = -EINVAL; 711 ret = -EINVAL;
554 } else if (strncmp(arg, "stack", 5) == 0) { 712 } else if (strncmp(arg, "stack", 5) == 0) {
@@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
562 if (ret || param > PARAM_MAX_STACK) 720 if (ret || param > PARAM_MAX_STACK)
563 ret = -EINVAL; 721 ret = -EINVAL;
564 else { 722 else {
565 f->fn = t->stack; 723 f->fn = t->fetch[FETCH_MTD_stack];
566 f->data = (void *)param; 724 f->data = (void *)param;
567 } 725 }
568 } else 726 } else
@@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
588 case '%': /* named register */ 746 case '%': /* named register */
589 ret = regs_query_register_offset(arg + 1); 747 ret = regs_query_register_offset(arg + 1);
590 if (ret >= 0) { 748 if (ret >= 0) {
591 f->fn = t->reg; 749 f->fn = t->fetch[FETCH_MTD_reg];
592 f->data = (void *)(unsigned long)ret; 750 f->data = (void *)(unsigned long)ret;
593 ret = 0; 751 ret = 0;
594 } 752 }
@@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
598 ret = strict_strtoul(arg + 1, 0, &param); 756 ret = strict_strtoul(arg + 1, 0, &param);
599 if (ret) 757 if (ret)
600 break; 758 break;
601 f->fn = t->memory; 759 f->fn = t->fetch[FETCH_MTD_memory];
602 f->data = (void *)param; 760 f->data = (void *)param;
603 } else { 761 } else {
604 ret = split_symbol_offset(arg + 1, &offset); 762 ret = split_symbol_offset(arg + 1, &offset);
@@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
606 break; 764 break;
607 f->data = alloc_symbol_cache(arg + 1, offset); 765 f->data = alloc_symbol_cache(arg + 1, offset);
608 if (f->data) 766 if (f->data)
609 f->fn = t->symbol; 767 f->fn = t->fetch[FETCH_MTD_symbol];
610 } 768 }
611 break; 769 break;
612 case '+': /* deref memory */ 770 case '+': /* deref memory */
@@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
636 if (ret) 794 if (ret)
637 kfree(dprm); 795 kfree(dprm);
638 else { 796 else {
639 f->fn = t->deref; 797 f->fn = t->fetch[FETCH_MTD_deref];
640 f->data = (void *)dprm; 798 f->data = (void *)dprm;
641 } 799 }
642 } 800 }
643 break; 801 break;
644 } 802 }
645 if (!ret && !f->fn) 803 if (!ret && !f->fn) { /* Parsed, but no fetch method found */
804 pr_info("%s type has no corresponding fetch method.\n",
805 t->name);
646 ret = -EINVAL; 806 ret = -EINVAL;
807 }
647 return ret; 808 return ret;
648} 809}
649 810
@@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return) 813 struct probe_arg *parg, int is_return)
653{ 814{
654 const char *t; 815 const char *t;
816 int ret;
655 817
656 if (strlen(arg) > MAX_ARGSTR_LEN) { 818 if (strlen(arg) > MAX_ARGSTR_LEN) {
657 pr_info("Argument is too long.: %s\n", arg); 819 pr_info("Argument is too long.: %s\n", arg);
@@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
674 } 836 }
675 parg->offset = tp->size; 837 parg->offset = tp->size;
676 tp->size += parg->type->size; 838 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 839 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
840 if (ret >= 0) {
841 parg->fetch_size.fn = get_fetch_size_function(parg->type,
842 parg->fetch.fn);
843 parg->fetch_size.data = parg->fetch.data;
844 }
845 return ret;
678} 846}
679 847
680/* Return 1 if name is reserved or already used by another argument */ 848/* Return 1 if name is reserved or already used by another argument */
@@ -715,7 +883,7 @@ static int create_trace_probe(int argc, char **argv)
715 int i, ret = 0; 883 int i, ret = 0;
716 int is_return = 0, is_delete = 0; 884 int is_return = 0, is_delete = 0;
717 char *symbol = NULL, *event = NULL, *group = NULL; 885 char *symbol = NULL, *event = NULL, *group = NULL;
718 char *arg, *tmp; 886 char *arg;
719 unsigned long offset = 0; 887 unsigned long offset = 0;
720 void *addr = NULL; 888 void *addr = NULL;
721 char buf[MAX_EVENT_NAME_LEN]; 889 char buf[MAX_EVENT_NAME_LEN];
@@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv)
757 pr_info("Delete command needs an event name.\n"); 925 pr_info("Delete command needs an event name.\n");
758 return -EINVAL; 926 return -EINVAL;
759 } 927 }
928 mutex_lock(&probe_lock);
760 tp = find_probe_event(event, group); 929 tp = find_probe_event(event, group);
761 if (!tp) { 930 if (!tp) {
931 mutex_unlock(&probe_lock);
762 pr_info("Event %s/%s doesn't exist.\n", group, event); 932 pr_info("Event %s/%s doesn't exist.\n", group, event);
763 return -ENOENT; 933 return -ENOENT;
764 } 934 }
765 /* delete an event */ 935 /* delete an event */
766 unregister_trace_probe(tp); 936 unregister_trace_probe(tp);
767 free_trace_probe(tp); 937 free_trace_probe(tp);
938 mutex_unlock(&probe_lock);
768 return 0; 939 return 0;
769 } 940 }
770 941
@@ -821,26 +992,36 @@ static int create_trace_probe(int argc, char **argv)
821 /* parse arguments */ 992 /* parse arguments */
822 ret = 0; 993 ret = 0;
823 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 994 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
995 /* Increment count for freeing args in error case */
996 tp->nr_args++;
997
824 /* Parse argument name */ 998 /* Parse argument name */
825 arg = strchr(argv[i], '='); 999 arg = strchr(argv[i], '=');
826 if (arg) 1000 if (arg) {
827 *arg++ = '\0'; 1001 *arg++ = '\0';
828 else 1002 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1003 } else {
829 arg = argv[i]; 1004 arg = argv[i];
1005 /* If argument name is omitted, set "argN" */
1006 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
1007 tp->args[i].name = kstrdup(buf, GFP_KERNEL);
1008 }
830 1009
831 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
832 if (!tp->args[i].name) { 1010 if (!tp->args[i].name) {
833 pr_info("Failed to allocate argument%d name '%s'.\n", 1011 pr_info("Failed to allocate argument[%d] name.\n", i);
834 i, argv[i]);
835 ret = -ENOMEM; 1012 ret = -ENOMEM;
836 goto error; 1013 goto error;
837 } 1014 }
838 tmp = strchr(tp->args[i].name, ':'); 1015
839 if (tmp) 1016 if (!is_good_name(tp->args[i].name)) {
840 *tmp = '_'; /* convert : to _ */ 1017 pr_info("Invalid argument[%d] name: %s\n",
1018 i, tp->args[i].name);
1019 ret = -EINVAL;
1020 goto error;
1021 }
841 1022
842 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 1023 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
843 pr_info("Argument%d name '%s' conflicts with " 1024 pr_info("Argument[%d] name '%s' conflicts with "
844 "another field.\n", i, argv[i]); 1025 "another field.\n", i, argv[i]);
845 ret = -EINVAL; 1026 ret = -EINVAL;
846 goto error; 1027 goto error;
@@ -849,12 +1030,9 @@ static int create_trace_probe(int argc, char **argv)
849 /* Parse fetch argument */ 1030 /* Parse fetch argument */
850 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 1031 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
851 if (ret) { 1032 if (ret) {
852 pr_info("Parse error at argument%d. (%d)\n", i, ret); 1033 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
853 kfree(tp->args[i].name);
854 goto error; 1034 goto error;
855 } 1035 }
856
857 tp->nr_args++;
858 } 1036 }
859 1037
860 ret = register_trace_probe(tp); 1038 ret = register_trace_probe(tp);
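Two details in the argument-parsing hunks above: tp->nr_args is now incremented before the argument is parsed, so the common error path frees every slot that has been touched (including a partially set up one), and an argument given without a name gets an auto-generated "argN". A standalone sketch of the count-first cleanup pattern and the fallback naming (plain C; the inputs are made up):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct arg { char *name; };

int main(void)
{
        const char *input[3] = { "fd", "", NULL };  /* "": name omitted, NULL: parse error */
        struct arg args[3] = { { 0 } };
        int nr_args = 0, i;

        for (i = 0; i < 3; i++) {
                char buf[16];

                nr_args++;                      /* count first ... */
                if (!input[i])
                        goto error;             /* ... so cleanup covers this slot too */
                snprintf(buf, sizeof(buf), "arg%d", i + 1);
                args[i].name = malloc(16);
                if (!args[i].name)
                        goto error;
                strcpy(args[i].name, input[i][0] ? input[i] : buf);
        }
error:
        for (i = 0; i < nr_args; i++) {
                printf("slot %d: %s\n", i, args[i].name ? args[i].name : "(unset)");
                free(args[i].name);             /* free(NULL) is a no-op */
        }
        return 0;
}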
@@ -1043,6 +1221,54 @@ static const struct file_operations kprobe_profile_ops = {
1043 .release = seq_release, 1221 .release = seq_release,
1044}; 1222};
1045 1223
1224/* Sum up total data length for dynamic arrays (strings) */
1225static __kprobes int __get_data_size(struct trace_probe *tp,
1226 struct pt_regs *regs)
1227{
1228 int i, ret = 0;
1229 u32 len;
1230
1231 for (i = 0; i < tp->nr_args; i++)
1232 if (unlikely(tp->args[i].fetch_size.fn)) {
1233 call_fetch(&tp->args[i].fetch_size, regs, &len);
1234 ret += len;
1235 }
1236
1237 return ret;
1238}
1239
1240/* Store the value of each argument */
1241static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
1242 struct pt_regs *regs,
1243 u8 *data, int maxlen)
1244{
1245 int i;
1246 u32 end = tp->size;
1247 u32 *dl; /* Data (relative) location */
1248
1249 for (i = 0; i < tp->nr_args; i++) {
1250 if (unlikely(tp->args[i].fetch_size.fn)) {
1251 /*
1252 * First, we set the relative location and
1253 * maximum data length to *dl
1254 */
1255 dl = (u32 *)(data + tp->args[i].offset);
1256 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
1257 /* Then try to fetch string or dynamic array data */
1258 call_fetch(&tp->args[i].fetch, regs, dl);
1259 /* Reduce maximum length */
1260 end += get_rloc_len(*dl);
1261 maxlen -= get_rloc_len(*dl);
1262 /* Trick here, convert data_rloc to data_loc */
1263 *dl = convert_rloc_to_loc(*dl,
1264 ent_size + tp->args[i].offset);
1265 } else
1266 /* Just fetching data normally */
1267 call_fetch(&tp->args[i].fetch, regs,
1268 data + tp->args[i].offset);
1269 }
1270}
1271
1046/* Kprobe handler */ 1272/* Kprobe handler */
1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1273static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1048{ 1274{
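With string arguments in play, the record is now sized in two passes: __get_data_size() sums the dynamic lengths, the ring-buffer event is reserved for the fixed slots plus that total, and store_trace_args() writes each argument, appending string payloads after the fixed area and leaving a packed length/offset word in the argument's slot. A standalone simulation of that layout (plain C; the event-header offset that ent_size accounts for in the real code is left out):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
        const char *s = "probe";                /* one dynamic "string" argument */
        uint32_t fixed = sizeof(uint32_t);      /* one 4-byte slot in the fixed area */
        uint32_t dsize = (uint32_t)strlen(s) + 1;
        uint8_t entry[64];

        /* Pass 1: total record size = fixed slots + dynamic payload. */
        uint32_t total = fixed + dsize;

        /* Pass 2: append the payload and record (length << 16 | offset). */
        memcpy(entry + fixed, s, dsize);
        uint32_t slot = (dsize << 16) | fixed;
        memcpy(entry, &slot, sizeof(slot));

        printf("total=%u len=%u offs=%u data=%s\n", total,
               slot >> 16, slot & 0xffffu, (char *)(entry + (slot & 0xffffu)));
        return 0;
}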
@@ -1050,8 +1276,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1050 struct kprobe_trace_entry_head *entry; 1276 struct kprobe_trace_entry_head *entry;
1051 struct ring_buffer_event *event; 1277 struct ring_buffer_event *event;
1052 struct ring_buffer *buffer; 1278 struct ring_buffer *buffer;
1053 u8 *data; 1279 int size, dsize, pc;
1054 int size, i, pc;
1055 unsigned long irq_flags; 1280 unsigned long irq_flags;
1056 struct ftrace_event_call *call = &tp->call; 1281 struct ftrace_event_call *call = &tp->call;
1057 1282
@@ -1060,7 +1285,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1060 local_save_flags(irq_flags); 1285 local_save_flags(irq_flags);
1061 pc = preempt_count(); 1286 pc = preempt_count();
1062 1287
1063 size = sizeof(*entry) + tp->size; 1288 dsize = __get_data_size(tp, regs);
1289 size = sizeof(*entry) + tp->size + dsize;
1064 1290
1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1291 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1066 size, irq_flags, pc); 1292 size, irq_flags, pc);
@@ -1069,9 +1295,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1069 1295
1070 entry = ring_buffer_event_data(event); 1296 entry = ring_buffer_event_data(event);
1071 entry->ip = (unsigned long)kp->addr; 1297 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1]; 1298 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1073 for (i = 0; i < tp->nr_args; i++)
1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1075 1299
1076 if (!filter_current_check_discard(buffer, call, entry, event)) 1300 if (!filter_current_check_discard(buffer, call, entry, event))
1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1301 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1085,15 +1309,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1085 struct kretprobe_trace_entry_head *entry; 1309 struct kretprobe_trace_entry_head *entry;
1086 struct ring_buffer_event *event; 1310 struct ring_buffer_event *event;
1087 struct ring_buffer *buffer; 1311 struct ring_buffer *buffer;
1088 u8 *data; 1312 int size, pc, dsize;
1089 int size, i, pc;
1090 unsigned long irq_flags; 1313 unsigned long irq_flags;
1091 struct ftrace_event_call *call = &tp->call; 1314 struct ftrace_event_call *call = &tp->call;
1092 1315
1093 local_save_flags(irq_flags); 1316 local_save_flags(irq_flags);
1094 pc = preempt_count(); 1317 pc = preempt_count();
1095 1318
1096 size = sizeof(*entry) + tp->size; 1319 dsize = __get_data_size(tp, regs);
1320 size = sizeof(*entry) + tp->size + dsize;
1097 1321
1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1322 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1099 size, irq_flags, pc); 1323 size, irq_flags, pc);
@@ -1103,9 +1327,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1103 entry = ring_buffer_event_data(event); 1327 entry = ring_buffer_event_data(event);
1104 entry->func = (unsigned long)tp->rp.kp.addr; 1328 entry->func = (unsigned long)tp->rp.kp.addr;
1105 entry->ret_ip = (unsigned long)ri->ret_addr; 1329 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1]; 1330 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1107 for (i = 0; i < tp->nr_args; i++)
1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1109 1331
1110 if (!filter_current_check_discard(buffer, call, entry, event)) 1332 if (!filter_current_check_discard(buffer, call, entry, event))
1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1333 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1137,7 +1359,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1137 data = (u8 *)&field[1]; 1359 data = (u8 *)&field[1];
1138 for (i = 0; i < tp->nr_args; i++) 1360 for (i = 0; i < tp->nr_args; i++)
1139 if (!tp->args[i].type->print(s, tp->args[i].name, 1361 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset)) 1362 data + tp->args[i].offset, field))
1141 goto partial; 1363 goto partial;
1142 1364
1143 if (!trace_seq_puts(s, "\n")) 1365 if (!trace_seq_puts(s, "\n"))
@@ -1179,7 +1401,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1179 data = (u8 *)&field[1]; 1401 data = (u8 *)&field[1];
1180 for (i = 0; i < tp->nr_args; i++) 1402 for (i = 0; i < tp->nr_args; i++)
1181 if (!tp->args[i].type->print(s, tp->args[i].name, 1403 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset)) 1404 data + tp->args[i].offset, field))
1183 goto partial; 1405 goto partial;
1184 1406
1185 if (!trace_seq_puts(s, "\n")) 1407 if (!trace_seq_puts(s, "\n"))
@@ -1214,11 +1436,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1214 } 1436 }
1215} 1437}
1216 1438
1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1218{
1219 return 0;
1220}
1221
1222#undef DEFINE_FIELD 1439#undef DEFINE_FIELD
1223#define DEFINE_FIELD(type, item, name, is_signed) \ 1440#define DEFINE_FIELD(type, item, name, is_signed) \
1224 do { \ 1441 do { \
@@ -1239,7 +1456,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1456 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1240 /* Set argument names as fields */ 1457 /* Set argument names as fields */
1241 for (i = 0; i < tp->nr_args; i++) { 1458 for (i = 0; i < tp->nr_args; i++) {
1242 ret = trace_define_field(event_call, tp->args[i].type->name, 1459 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1243 tp->args[i].name, 1460 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset, 1461 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size, 1462 tp->args[i].type->size,
@@ -1261,7 +1478,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1478 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1262 /* Set argument names as fields */ 1479 /* Set argument names as fields */
1263 for (i = 0; i < tp->nr_args; i++) { 1480 for (i = 0; i < tp->nr_args; i++) {
1264 ret = trace_define_field(event_call, tp->args[i].type->name, 1481 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1265 tp->args[i].name, 1482 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset, 1483 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size, 1484 tp->args[i].type->size,
@@ -1301,8 +1518,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1518 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1302 1519
1303 for (i = 0; i < tp->nr_args; i++) { 1520 for (i = 0; i < tp->nr_args; i++) {
1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", 1521 if (strcmp(tp->args[i].type->name, "string") == 0)
1305 tp->args[i].name); 1522 pos += snprintf(buf + pos, LEN_OR_ZERO,
1523 ", __get_str(%s)",
1524 tp->args[i].name);
1525 else
1526 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1527 tp->args[i].name);
1306 } 1528 }
1307 1529
1308#undef LEN_OR_ZERO 1530#undef LEN_OR_ZERO
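For string-typed arguments the generated print fmt now references the payload through __get_str(name) instead of REC->name, in line with how dynamic-array fields are printed elsewhere. A standalone sketch of assembling such a format string (plain C; the field names, types, and the fixed part of the format are made up):

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *names[] = { "fd", "path" };
        const char *types[] = { "u64", "string" };
        char buf[128];
        int pos = 0;

        pos += snprintf(buf + pos, sizeof(buf) - pos, "\"fd=%%Lx path=%%s\"");
        for (int i = 0; i < 2; i++) {
                if (strcmp(types[i], "string") == 0)
                        pos += snprintf(buf + pos, sizeof(buf) - pos,
                                        ", __get_str(%s)", names[i]);
                else
                        pos += snprintf(buf + pos, sizeof(buf) - pos,
                                        ", REC->%s", names[i]);
        }
        printf("%s\n", buf);    /* "fd=%Lx path=%s", REC->fd, __get_str(path) */
        return 0;
}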
@@ -1339,11 +1561,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1339 struct ftrace_event_call *call = &tp->call; 1561 struct ftrace_event_call *call = &tp->call;
1340 struct kprobe_trace_entry_head *entry; 1562 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head; 1563 struct hlist_head *head;
1342 u8 *data; 1564 int size, __size, dsize;
1343 int size, __size, i;
1344 int rctx; 1565 int rctx;
1345 1566
1346 __size = sizeof(*entry) + tp->size; 1567 dsize = __get_data_size(tp, regs);
1568 __size = sizeof(*entry) + tp->size + dsize;
1347 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1569 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1348 size -= sizeof(u32); 1570 size -= sizeof(u32);
1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1571 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1355,9 +1577,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1355 return; 1577 return;
1356 1578
1357 entry->ip = (unsigned long)kp->addr; 1579 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1]; 1580 memset(&entry[1], 0, dsize);
1359 for (i = 0; i < tp->nr_args; i++) 1581 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1361 1582
1362 head = this_cpu_ptr(call->perf_events); 1583 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1584 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
@@ -1371,11 +1592,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1371 struct ftrace_event_call *call = &tp->call; 1592 struct ftrace_event_call *call = &tp->call;
1372 struct kretprobe_trace_entry_head *entry; 1593 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head; 1594 struct hlist_head *head;
1374 u8 *data; 1595 int size, __size, dsize;
1375 int size, __size, i;
1376 int rctx; 1596 int rctx;
1377 1597
1378 __size = sizeof(*entry) + tp->size; 1598 dsize = __get_data_size(tp, regs);
1599 __size = sizeof(*entry) + tp->size + dsize;
1379 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1600 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1380 size -= sizeof(u32); 1601 size -= sizeof(u32);
1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1602 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1388,9 +1609,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1388 1609
1389 entry->func = (unsigned long)tp->rp.kp.addr; 1610 entry->func = (unsigned long)tp->rp.kp.addr;
1390 entry->ret_ip = (unsigned long)ri->ret_addr; 1611 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1]; 1612 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1392 for (i = 0; i < tp->nr_args; i++)
1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1394 1613
1395 head = this_cpu_ptr(call->perf_events); 1614 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1615 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
@@ -1486,15 +1705,12 @@ static int register_probe_event(struct trace_probe *tp)
1486 int ret; 1705 int ret;
1487 1706
1488 /* Initialize ftrace_event_call */ 1707 /* Initialize ftrace_event_call */
1708 INIT_LIST_HEAD(&call->class->fields);
1489 if (probe_is_return(tp)) { 1709 if (probe_is_return(tp)) {
1490 INIT_LIST_HEAD(&call->class->fields);
1491 call->event.funcs = &kretprobe_funcs; 1710 call->event.funcs = &kretprobe_funcs;
1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields; 1711 call->class->define_fields = kretprobe_event_define_fields;
1494 } else { 1712 } else {
1495 INIT_LIST_HEAD(&call->class->fields);
1496 call->event.funcs = &kprobe_funcs; 1713 call->event.funcs = &kprobe_funcs;
1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields; 1714 call->class->define_fields = kprobe_event_define_fields;
1499 } 1715 }
1500 if (set_print_fmt(tp) < 0) 1716 if (set_print_fmt(tp) < 0)
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37#define KSYM_TRACER_OP_LEN 3 /* rw- */
38
39struct trace_ksym {
40 struct perf_event **ksym_hbp;
41 struct perf_event_attr attr;
42#ifdef CONFIG_PROFILE_KSYM_TRACER
43 atomic64_t counter;
44#endif
45 struct hlist_node ksym_hlist;
46};
47
48static struct trace_array *ksym_trace_array;
49
50static unsigned int ksym_tracing_enabled;
51
52static HLIST_HEAD(ksym_filter_head);
53
54static DEFINE_MUTEX(ksym_tracer_mutex);
55
56#ifdef CONFIG_PROFILE_KSYM_TRACER
57
58#define MAX_UL_INT 0xffffffff
59
60void ksym_collect_stats(unsigned long hbp_hit_addr)
61{
62 struct hlist_node *node;
63 struct trace_ksym *entry;
64
65 rcu_read_lock();
66 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
67 if (entry->attr.bp_addr == hbp_hit_addr) {
68 atomic64_inc(&entry->counter);
69 break;
70 }
71 }
72 rcu_read_unlock();
73}
74#endif /* CONFIG_PROFILE_KSYM_TRACER */
75
76void ksym_hbp_handler(struct perf_event *hbp, int nmi,
77 struct perf_sample_data *data,
78 struct pt_regs *regs)
79{
80 struct ring_buffer_event *event;
81 struct ksym_trace_entry *entry;
82 struct ring_buffer *buffer;
83 int pc;
84
85 if (!ksym_tracing_enabled)
86 return;
87
88 buffer = ksym_trace_array->buffer;
89
90 pc = preempt_count();
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
93 sizeof(*entry), 0, pc);
94 if (!event)
95 return;
96
97 entry = ring_buffer_event_data(event);
98 entry->ip = instruction_pointer(regs);
99 entry->type = hw_breakpoint_type(hbp);
100 entry->addr = hw_breakpoint_addr(hbp);
101 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
102
103#ifdef CONFIG_PROFILE_KSYM_TRACER
104 ksym_collect_stats(hw_breakpoint_addr(hbp));
105#endif /* CONFIG_PROFILE_KSYM_TRACER */
106
107 trace_buffer_unlock_commit(buffer, event, 0, pc);
108}
109
110/* Valid access types are represented as
111 *
112 * rw- : Set Read/Write Access Breakpoint
113 * -w- : Set Write Access Breakpoint
114 * --- : Clear Breakpoints
115 * --x : Set Execution Break points (Not available yet)
116 *
117 */
118static int ksym_trace_get_access_type(char *str)
119{
120 int access = 0;
121
122 if (str[0] == 'r')
123 access |= HW_BREAKPOINT_R;
124
125 if (str[1] == 'w')
126 access |= HW_BREAKPOINT_W;
127
128 if (str[2] == 'x')
129 access |= HW_BREAKPOINT_X;
130
131 switch (access) {
132 case HW_BREAKPOINT_R:
133 case HW_BREAKPOINT_W:
134 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
135 return access;
136 default:
137 return -EINVAL;
138 }
139}
140
141/*
142 * There can be several possible malformed requests and we attempt to capture
143 * all of them. We enumerate some of the rules
144 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
145 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
146 * <module>:<ksym_name>:<op>.
147 * 2. No delimiter symbol ':' in the input string
148 * 3. Spurious operator symbols or symbols not in their respective positions
149 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
150 * 5. Kernel symbol not a part of /proc/kallsyms
151 * 6. Duplicate requests
152 */
153static int parse_ksym_trace_str(char *input_string, char **ksymname,
154 unsigned long *addr)
155{
156 int ret;
157
158 *ksymname = strsep(&input_string, ":");
159 *addr = kallsyms_lookup_name(*ksymname);
160
161 /* Check for malformed request: (2), (1) and (5) */
162 if ((!input_string) ||
163 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
164 (*addr == 0))
165 return -EINVAL;;
166
167 ret = ksym_trace_get_access_type(input_string);
168
169 return ret;
170}
171
172int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
173{
174 struct trace_ksym *entry;
175 int ret = -ENOMEM;
176
177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
178 if (!entry)
179 return -ENOMEM;
180
181 hw_breakpoint_init(&entry->attr);
182
183 entry->attr.bp_type = op;
184 entry->attr.bp_addr = addr;
185 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
186
187 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
188 ksym_hbp_handler);
189
190 if (IS_ERR(entry->ksym_hbp)) {
191 ret = PTR_ERR(entry->ksym_hbp);
192 if (ret == -ENOSPC) {
193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
199 goto err;
200 }
201
202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
203
204 return 0;
205
206err:
207 kfree(entry);
208
209 return ret;
210}
211
212static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
213 size_t count, loff_t *ppos)
214{
215 struct trace_ksym *entry;
216 struct hlist_node *node;
217 struct trace_seq *s;
218 ssize_t cnt = 0;
219 int ret;
220
221 s = kmalloc(sizeof(*s), GFP_KERNEL);
222 if (!s)
223 return -ENOMEM;
224 trace_seq_init(s);
225
226 mutex_lock(&ksym_tracer_mutex);
227
228 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
229 ret = trace_seq_printf(s, "%pS:",
230 (void *)(unsigned long)entry->attr.bp_addr);
231 if (entry->attr.bp_type == HW_BREAKPOINT_R)
232 ret = trace_seq_puts(s, "r--\n");
233 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
234 ret = trace_seq_puts(s, "-w-\n");
235 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
236 ret = trace_seq_puts(s, "rw-\n");
237 WARN_ON_ONCE(!ret);
238 }
239
240 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
241
242 mutex_unlock(&ksym_tracer_mutex);
243
244 kfree(s);
245
246 return cnt;
247}
248
249static void __ksym_trace_reset(void)
250{
251 struct trace_ksym *entry;
252 struct hlist_node *node, *node1;
253
254 mutex_lock(&ksym_tracer_mutex);
255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
256 ksym_hlist) {
257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
258 hlist_del_rcu(&(entry->ksym_hlist));
259 synchronize_rcu();
260 kfree(entry);
261 }
262 mutex_unlock(&ksym_tracer_mutex);
263}
264
265static ssize_t ksym_trace_filter_write(struct file *file,
266 const char __user *buffer,
267 size_t count, loff_t *ppos)
268{
269 struct trace_ksym *entry;
270 struct hlist_node *node;
271 char *buf, *input_string, *ksymname = NULL;
272 unsigned long ksym_addr = 0;
273 int ret, op, changed = 0;
274
275 buf = kzalloc(count + 1, GFP_KERNEL);
276 if (!buf)
277 return -ENOMEM;
278
279 ret = -EFAULT;
280 if (copy_from_user(buf, buffer, count))
281 goto out;
282
283 buf[count] = '\0';
284 input_string = strstrip(buf);
285
286 /*
287 * Clear all breakpoints if:
288 * 1: echo > ksym_trace_filter
289 * 2: echo 0 > ksym_trace_filter
290 * 3: echo "*:---" > ksym_trace_filter
291 */
292 if (!input_string[0] || !strcmp(input_string, "0") ||
293 !strcmp(input_string, "*:---")) {
294 __ksym_trace_reset();
295 ret = 0;
296 goto out;
297 }
298
299 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
300 if (ret < 0)
301 goto out;
302
303 mutex_lock(&ksym_tracer_mutex);
304
305 ret = -EINVAL;
306 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
307 if (entry->attr.bp_addr == ksym_addr) {
308 /* Check for malformed request: (6) */
309 if (entry->attr.bp_type != op)
310 changed = 1;
311 else
312 goto out_unlock;
313 break;
314 }
315 }
316 if (changed) {
317 unregister_wide_hw_breakpoint(entry->ksym_hbp);
318 entry->attr.bp_type = op;
319 ret = 0;
320 if (op > 0) {
321 entry->ksym_hbp =
322 register_wide_hw_breakpoint(&entry->attr,
323 ksym_hbp_handler);
324 if (IS_ERR(entry->ksym_hbp))
325 ret = PTR_ERR(entry->ksym_hbp);
326 else
327 goto out_unlock;
328 }
329 /* Error or "symbol:---" case: drop it */
330 hlist_del_rcu(&(entry->ksym_hlist));
331 synchronize_rcu();
332 kfree(entry);
333 goto out_unlock;
334 } else {
335 /* Check for malformed request: (4) */
336 if (op)
337 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
338 }
339out_unlock:
340 mutex_unlock(&ksym_tracer_mutex);
341out:
342 kfree(buf);
343 return !ret ? count : ret;
344}
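
From user space the same transitions are driven by writing these strings to the ksym_trace_filter file created in init_ksym_trace() below. A minimal sketch, assuming debugfs is mounted at /sys/kernel/debug and using "pid_max" purely as an example symbol:

/* User-space sketch: exercise the add / retype / clear paths of the
 * filter-write handler above.  The debugfs mount point and the example
 * symbol are assumptions about the local setup. */
#include <stdio.h>

static int write_filter(const char *req)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/ksym_trace_filter", "w");

	if (!f)
		return -1;
	fputs(req, f);
	return fclose(f);
}

int main(void)
{
	write_filter("pid_max:rw-");	/* add a read-write breakpoint      */
	write_filter("pid_max:-w-");	/* change the existing entry's type */
	write_filter("pid_max:---");	/* drop that single entry           */
	write_filter("*:---");		/* or clear every entry at once     */
	return 0;
}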
345
346static const struct file_operations ksym_tracing_fops = {
347 .open = tracing_open_generic,
348 .read = ksym_trace_filter_read,
349 .write = ksym_trace_filter_write,
350};
351
352static void ksym_trace_reset(struct trace_array *tr)
353{
354 ksym_tracing_enabled = 0;
355 __ksym_trace_reset();
356}
357
358static int ksym_trace_init(struct trace_array *tr)
359{
360 int cpu, ret = 0;
361
362 for_each_online_cpu(cpu)
363 tracing_reset(tr, cpu);
364 ksym_tracing_enabled = 1;
365 ksym_trace_array = tr;
366
367 return ret;
368}
369
370static void ksym_trace_print_header(struct seq_file *m)
371{
372 seq_puts(m,
373 "# TASK-PID CPU# Symbol "
374 "Type Function\n");
375 seq_puts(m,
376 "# | | | "
377 " | |\n");
378}
379
380static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
381{
382 struct trace_entry *entry = iter->ent;
383 struct trace_seq *s = &iter->seq;
384 struct ksym_trace_entry *field;
385 char str[KSYM_SYMBOL_LEN];
386 int ret;
387
388 if (entry->type != TRACE_KSYM)
389 return TRACE_TYPE_UNHANDLED;
390
391 trace_assign_type(field, entry);
392
393 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
394 entry->pid, iter->cpu, (char *)field->addr);
395 if (!ret)
396 return TRACE_TYPE_PARTIAL_LINE;
397
398 switch (field->type) {
399 case HW_BREAKPOINT_R:
400 ret = trace_seq_printf(s, " R ");
401 break;
402 case HW_BREAKPOINT_W:
403 ret = trace_seq_printf(s, " W ");
404 break;
405 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
406 ret = trace_seq_printf(s, " RW ");
407 break;
408 default:
409 return TRACE_TYPE_PARTIAL_LINE;
410 }
411
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 sprint_symbol(str, field->ip);
416 ret = trace_seq_printf(s, "%s\n", str);
417 if (!ret)
418 return TRACE_TYPE_PARTIAL_LINE;
419
420 return TRACE_TYPE_HANDLED;
421}
422
423struct tracer ksym_tracer __read_mostly =
424{
425 .name = "ksym_tracer",
426 .init = ksym_trace_init,
427 .reset = ksym_trace_reset,
428#ifdef CONFIG_FTRACE_SELFTEST
429 .selftest = trace_selftest_startup_ksym,
430#endif
431 .print_header = ksym_trace_print_header,
432 .print_line = ksym_trace_output
433};
434
435#ifdef CONFIG_PROFILE_KSYM_TRACER
436static int ksym_profile_show(struct seq_file *m, void *v)
437{
438 struct hlist_node *node;
439 struct trace_ksym *entry;
440 int access_type = 0;
441 char fn_name[KSYM_NAME_LEN];
442
443 seq_puts(m, " Access Type ");
444 seq_puts(m, " Symbol Counter\n");
445 seq_puts(m, " ----------- ");
446 seq_puts(m, " ------ -------\n");
447
448 rcu_read_lock();
449 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
450
451 access_type = entry->attr.bp_type;
452
453 switch (access_type) {
454 case HW_BREAKPOINT_R:
455 seq_puts(m, " R ");
456 break;
457 case HW_BREAKPOINT_W:
458 seq_puts(m, " W ");
459 break;
460 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
461 seq_puts(m, " RW ");
462 break;
463 default:
464 seq_puts(m, " NA ");
465 }
466
467 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
468 seq_printf(m, " %-36s", fn_name);
469 else
470 seq_printf(m, " %-36s", "<NA>");
471 seq_printf(m, " %15llu\n",
472 (unsigned long long)atomic64_read(&entry->counter));
473 }
474 rcu_read_unlock();
475
476 return 0;
477}
478
479static int ksym_profile_open(struct inode *node, struct file *file)
480{
481 return single_open(file, ksym_profile_show, NULL);
482}
483
484static const struct file_operations ksym_profile_fops = {
485 .open = ksym_profile_open,
486 .read = seq_read,
487 .llseek = seq_lseek,
488 .release = single_release,
489};
490#endif /* CONFIG_PROFILE_KSYM_TRACER */
491
492__init static int init_ksym_trace(void)
493{
494 struct dentry *d_tracer;
495
496 d_tracer = tracing_init_dentry();
497
498 trace_create_file("ksym_trace_filter", 0644, d_tracer,
499 NULL, &ksym_tracing_fops);
500
501#ifdef CONFIG_PROFILE_KSYM_TRACER
502 trace_create_file("ksym_profile", 0444, d_tracer,
503 NULL, &ksym_profile_fops);
504#endif
505
506 return register_tracer(&ksym_tracer);
507}
508device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 20
24static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = {
1069 .funcs = &trace_wake_funcs, 1066 .funcs = &trace_wake_funcs,
1070}; 1067};
1071 1068
1072/* TRACE_SPECIAL */
1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1074 int flags, struct trace_event *event)
1075{
1076 struct special_entry *field;
1077
1078 trace_assign_type(field, iter->ent);
1079
1080 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
1081 field->arg1,
1082 field->arg2,
1083 field->arg3))
1084 return TRACE_TYPE_PARTIAL_LINE;
1085
1086 return TRACE_TYPE_HANDLED;
1087}
1088
1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1090 int flags, struct trace_event *event)
1091{
1092 struct special_entry *field;
1093 struct trace_seq *s = &iter->seq;
1094
1095 trace_assign_type(field, iter->ent);
1096
1097 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1098 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1099 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1100
1101 return TRACE_TYPE_HANDLED;
1102}
1103
1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1105 int flags, struct trace_event *event)
1106{
1107 struct special_entry *field;
1108 struct trace_seq *s = &iter->seq;
1109
1110 trace_assign_type(field, iter->ent);
1111
1112 SEQ_PUT_FIELD_RET(s, field->arg1);
1113 SEQ_PUT_FIELD_RET(s, field->arg2);
1114 SEQ_PUT_FIELD_RET(s, field->arg3);
1115
1116 return TRACE_TYPE_HANDLED;
1117}
1118
1119static struct trace_event_functions trace_special_funcs = {
1120 .trace = trace_special_print,
1121 .raw = trace_special_print,
1122 .hex = trace_special_hex,
1123 .binary = trace_special_bin,
1124};
1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1131/* TRACE_STACK */ 1069/* TRACE_STACK */
1132 1070
1133static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1071static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1161 1099
1162static struct trace_event_functions trace_stack_funcs = { 1100static struct trace_event_functions trace_stack_funcs = {
1163 .trace = trace_stack_print, 1101 .trace = trace_stack_print,
1164 .raw = trace_special_print,
1165 .hex = trace_special_hex,
1166 .binary = trace_special_bin,
1167}; 1102};
1168 1103
1169static struct trace_event trace_stack_event = { 1104static struct trace_event trace_stack_event = {
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1194 1129
1195static struct trace_event_functions trace_user_stack_funcs = { 1130static struct trace_event_functions trace_user_stack_funcs = {
1196 .trace = trace_user_stack_print, 1131 .trace = trace_user_stack_print,
1197 .raw = trace_special_print,
1198 .hex = trace_special_hex,
1199 .binary = trace_special_bin,
1200}; 1132};
1201 1133
1202static struct trace_event trace_user_stack_event = { 1134static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = {
1314 &trace_fn_event, 1246 &trace_fn_event,
1315 &trace_ctx_event, 1247 &trace_ctx_event,
1316 &trace_wake_event, 1248 &trace_wake_event,
1317 &trace_special_event,
1318 &trace_stack_event, 1249 &trace_stack_event,
1319 &trace_user_stack_event, 1250 &trace_user_stack_event,
1320 &trace_bprint_event, 1251 &trace_bprint_event,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..4086eae6e81b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
46 struct trace_array_cpu *data; 46 struct trace_array_cpu *data;
47 unsigned long flags; 47 unsigned long flags;
48 long disabled; 48 long disabled;
49 int resched;
50 int cpu; 49 int cpu;
51 int pc; 50 int pc;
52 51
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
54 return; 53 return;
55 54
56 pc = preempt_count(); 55 pc = preempt_count();
57 resched = ftrace_preempt_disable(); 56 preempt_disable_notrace();
58 57
59 cpu = raw_smp_processor_id(); 58 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 59 if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
74 out: 73 out:
75 atomic_dec(&data->disabled); 74 atomic_dec(&data->disabled);
76 out_enable: 75 out_enable:
77 ftrace_preempt_enable(resched); 76 preempt_enable_notrace();
78} 77}
79 78
80static struct ftrace_ops trace_ops __read_mostly = 79static struct ftrace_ops trace_ops __read_mostly =
@@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
383#ifdef CONFIG_FTRACE_SELFTEST 382#ifdef CONFIG_FTRACE_SELFTEST
384 .selftest = trace_selftest_startup_wakeup, 383 .selftest = trace_selftest_startup_wakeup,
385#endif 384#endif
385 .use_max_tr = 1,
386}; 386};
387 387
388static struct tracer wakeup_rt_tracer __read_mostly = 388static struct tracer wakeup_rt_tracer __read_mostly =
@@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
397#ifdef CONFIG_FTRACE_SELFTEST 397#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 398 .selftest = trace_selftest_startup_wakeup,
399#endif 399#endif
400 .use_max_tr = 1,
400}; 401};
401 402
402__init static int init_wakeup_tracer(void) 403__init static int init_wakeup_tracer(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_WAKE: 13 case TRACE_WAKE:
14 case TRACE_STACK: 14 case TRACE_STACK:
15 case TRACE_PRINT: 15 case TRACE_PRINT:
16 case TRACE_SPECIAL:
17 case TRACE_BRANCH: 16 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
20 case TRACE_KSYM:
21 return 1; 19 return 1;
22 } 20 }
23 return 0; 21 return 0;
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
691} 689}
692#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 690#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
693 691
694#ifdef CONFIG_SYSPROF_TRACER
695int
696trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
697{
698 unsigned long count;
699 int ret;
700
701 /* start the tracing */
702 ret = tracer_init(trace, tr);
703 if (ret) {
704 warn_failed_init_tracer(trace, ret);
705 return ret;
706 }
707
708 /* Sleep for a 1/10 of a second */
709 msleep(100);
710 /* stop the tracing. */
711 tracing_stop();
712 /* check the trace buffer */
713 ret = trace_test_buffer(tr, &count);
714 trace->reset(tr);
715 tracing_start();
716
717 if (!ret && !count) {
718 printk(KERN_CONT ".. no entries found ..");
719 ret = -1;
720 }
721
722 return ret;
723}
724#endif /* CONFIG_SYSPROF_TRACER */
725
726#ifdef CONFIG_BRANCH_TRACER 692#ifdef CONFIG_BRANCH_TRACER
727int 693int
728trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) 694trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 721}
756#endif /* CONFIG_BRANCH_TRACER */ 722#endif /* CONFIG_BRANCH_TRACER */
757 723
758#ifdef CONFIG_KSYM_TRACER
759static int ksym_selftest_dummy;
760
761int
762trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
763{
764 unsigned long count;
765 int ret;
766
767 /* start the tracing */
768 ret = tracer_init(trace, tr);
769 if (ret) {
770 warn_failed_init_tracer(trace, ret);
771 return ret;
772 }
773
774 ksym_selftest_dummy = 0;
775 /* Register the read-write tracing request */
776
777 ret = process_new_ksym_entry("ksym_selftest_dummy",
778 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
779 (unsigned long)(&ksym_selftest_dummy));
780
781 if (ret < 0) {
782 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
783 goto ret_path;
784 }
785 /* Perform a read and a write operation over the dummy variable to
786 * trigger the tracer
787 */
788 if (ksym_selftest_dummy == 0)
789 ksym_selftest_dummy++;
790
791 /* stop the tracing. */
792 tracing_stop();
793 /* check the trace buffer */
794 ret = trace_test_buffer(tr, &count);
795 trace->reset(tr);
796 tracing_start();
797
798 /* read & write operations - one each is performed on the dummy variable
799 * triggering two entries in the trace buffer
800 */
801 if (!ret && count != 2) {
802 printk(KERN_CONT "Ksym tracer startup test failed");
803 ret = -1;
804 }
805
806ret_path:
807 return ret;
808}
809#endif /* CONFIG_KSYM_TRACER */
810
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..a6b7e0e0f3eb 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
@@ -249,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i)
249{ 249{
250 unsigned long addr = stack_dump_trace[i]; 250 unsigned long addr = stack_dump_trace[i];
251 251
252 return seq_printf(m, "%pF\n", (void *)addr); 252 return seq_printf(m, "%pS\n", (void *)addr);
253} 253}
254 254
255static void print_disabled(struct seq_file *m) 255static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
26static struct list_head * 29static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call) 30syscall_get_enter_fields(struct ftrace_event_call *call)
28{ 31{
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34static struct list_head * 37static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call) 38syscall_get_exit_fields(struct ftrace_event_call *call)
36{ 39{
37 struct syscall_metadata *entry = call->data; 40 return &syscall_exit_fields;
38
39 return &entry->exit_fields;
40} 41}
41 42
42struct trace_event_functions enter_syscall_print_funcs = { 43struct trace_event_functions enter_syscall_print_funcs = {
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
97};
98
99static int
100trace_kernel(struct pt_regs *regs, struct trace_array *tr,
101 struct trace_array_cpu *data)
102{
103 struct backtrace_info info;
104 unsigned long bp;
105 char *stack;
106
107 info.tr = tr;
108 info.data = data;
109 info.pos = 1;
110
111 __trace_special(info.tr, info.data, 1, regs->ip, 0);
112
113 stack = ((char *)regs + sizeof(struct pt_regs));
114#ifdef CONFIG_FRAME_POINTER
115 bp = regs->bp;
116#else
117 bp = 0;
118#endif
119
120 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
121
122 return info.pos;
123}
124
125static void timer_notify(struct pt_regs *regs, int cpu)
126{
127 struct trace_array_cpu *data;
128 struct stack_frame frame;
129 struct trace_array *tr;
130 const void __user *fp;
131 int is_user;
132 int i;
133
134 if (!regs)
135 return;
136
137 tr = sysprof_trace;
138 data = tr->data[cpu];
139 is_user = user_mode(regs);
140
141 if (!current || current->pid == 0)
142 return;
143
144 if (is_user && current->state != TASK_RUNNING)
145 return;
146
147 __trace_special(tr, data, 0, 0, current->pid);
148
149 if (!is_user)
150 i = trace_kernel(regs, tr, data);
151 else
152 i = 0;
153
154 /*
155 * Trace user stack if we are not a kernel thread
156 */
157 if (current->mm && i < sample_max_depth) {
158 regs = (struct pt_regs *)current->thread.sp0 - 1;
159
160 fp = (void __user *)regs->bp;
161
162 __trace_special(tr, data, 2, regs->ip, 0);
163
164 while (i < sample_max_depth) {
165 frame.next_fp = NULL;
166 frame.return_address = 0;
167 if (!copy_stack_frame(fp, &frame))
168 break;
169 if ((unsigned long)fp < regs->sp)
170 break;
171
172 __trace_special(tr, data, 2, frame.return_address,
173 (unsigned long)fp);
174 fp = frame.next_fp;
175
176 i++;
177 }
178
179 }
180
181 /*
182 * Special trace entry if we overflow the max depth:
183 */
184 if (i == sample_max_depth)
185 __trace_special(tr, data, -1, -1, -1);
186
187 __trace_special(tr, data, 3, current->pid, i);
188}
189
190static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
191{
192 /* trace here */
193 timer_notify(get_irq_regs(), smp_processor_id());
194
195 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
196
197 return HRTIMER_RESTART;
198}
199
200static void start_stack_timer(void *unused)
201{
202 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
203
204 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 hrtimer->function = stack_trace_timer_fn;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
208 HRTIMER_MODE_REL_PINNED);
209}
210
211static void start_stack_timers(void)
212{
213 on_each_cpu(start_stack_timer, NULL, 1);
214}
215
216static void stop_stack_timer(int cpu)
217{
218 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
219
220 hrtimer_cancel(hrtimer);
221}
222
223static void stop_stack_timers(void)
224{
225 int cpu;
226
227 for_each_online_cpu(cpu)
228 stop_stack_timer(cpu);
229}
230
231static void stop_stack_trace(struct trace_array *tr)
232{
233 mutex_lock(&sample_timer_lock);
234 stop_stack_timers();
235 tracer_enabled = 0;
236 mutex_unlock(&sample_timer_lock);
237}
238
239static int stack_trace_init(struct trace_array *tr)
240{
241 sysprof_trace = tr;
242
243 tracing_start_cmdline_record();
244
245 mutex_lock(&sample_timer_lock);
246 start_stack_timers();
247 tracer_enabled = 1;
248 mutex_unlock(&sample_timer_lock);
249 return 0;
250}
251
252static void stack_trace_reset(struct trace_array *tr)
253{
254 tracing_stop_cmdline_record();
255 stop_stack_trace(tr);
256}
257
258static struct tracer stack_trace __read_mostly =
259{
260 .name = "sysprof",
261 .init = stack_trace_init,
262 .reset = stack_trace_reset,
263#ifdef CONFIG_FTRACE_SELFTEST
264 .selftest = trace_selftest_startup_sysprof,
265#endif
266};
267
268__init static int init_stack_trace(void)
269{
270 return register_tracer(&stack_trace);
271}
272device_initcall(init_stack_trace);
273
274#define MAX_LONG_DIGITS 22
275
276static ssize_t
277sysprof_sample_read(struct file *filp, char __user *ubuf,
278 size_t cnt, loff_t *ppos)
279{
280 char buf[MAX_LONG_DIGITS];
281 int r;
282
283 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
284
285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
286}
287
288static ssize_t
289sysprof_sample_write(struct file *filp, const char __user *ubuf,
290 size_t cnt, loff_t *ppos)
291{
292 char buf[MAX_LONG_DIGITS];
293 unsigned long val;
294
295 if (cnt > MAX_LONG_DIGITS-1)
296 cnt = MAX_LONG_DIGITS-1;
297
298 if (copy_from_user(&buf, ubuf, cnt))
299 return -EFAULT;
300
301 buf[cnt] = 0;
302
303 val = simple_strtoul(buf, NULL, 10);
304 /*
305 * Enforce a minimum sample period of 100 usecs:
306 */
307 if (val < 100)
308 val = 100;
309
310 mutex_lock(&sample_timer_lock);
311 stop_stack_timers();
312 sample_period = val * 1000;
313 start_stack_timers();
314 mutex_unlock(&sample_timer_lock);
315
316 return cnt;
317}
318
319static const struct file_operations sysprof_sample_fops = {
320 .read = sysprof_sample_read,
321 .write = sysprof_sample_write,
322};
323
324void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
325{
326
327 trace_create_file("sysprof_sample_period", 0644,
328 d_tracer, NULL, &sysprof_sample_fops);
329}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..7f9c3c52ecc1
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,577 @@
1/*
2 * Detect hard and soft lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * this code detects hard lockups: incidents where, on a CPU,
7 * the kernel does not respond to anything except NMI.
8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well.
13 */
14
15#include <linux/mm.h>
16#include <linux/cpu.h>
17#include <linux/nmi.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/freezer.h>
21#include <linux/kthread.h>
22#include <linux/lockdep.h>
23#include <linux/notifier.h>
24#include <linux/module.h>
25#include <linux/sysctl.h>
26
27#include <asm/irq_regs.h>
28#include <linux/perf_event.h>
29
30int watchdog_enabled;
31int __read_mostly softlockup_thresh = 60;
32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif
45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog;
48
49
50/* boot commands */
51/*
52 * Should we panic when a soft-lockup or hard-lockup occurs:
53 */
54#ifdef CONFIG_HARDLOCKUP_DETECTOR
55static int hardlockup_panic;
56
57static int __init hardlockup_panic_setup(char *str)
58{
59 if (!strncmp(str, "panic", 5))
60 hardlockup_panic = 1;
61 return 1;
62}
63__setup("nmi_watchdog=", hardlockup_panic_setup);
64#endif
65
66unsigned int __read_mostly softlockup_panic =
67 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
68
69static int __init softlockup_panic_setup(char *str)
70{
71 softlockup_panic = simple_strtoul(str, NULL, 0);
72
73 return 1;
74}
75__setup("softlockup_panic=", softlockup_panic_setup);
76
77static int __init nowatchdog_setup(char *str)
78{
79 no_watchdog = 1;
80 return 1;
81}
82__setup("nowatchdog", nowatchdog_setup);
83
84/* deprecated */
85static int __init nosoftlockup_setup(char *str)
86{
87 no_watchdog = 1;
88 return 1;
89}
90__setup("nosoftlockup", nosoftlockup_setup);
91/* */
92
93
94/*
95 * Returns seconds, approximately. We don't need nanosecond
96 * resolution, and we don't need to waste time with a big divide when
97 * 2^30ns == 1.074s.
98 */
99static unsigned long get_timestamp(int this_cpu)
100{
101 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
102}
103
104static unsigned long get_sample_period(void)
105{
106 /*
107	 * convert softlockup_thresh from seconds to ns;
108	 * the divide by 5 gives the hrtimer interrupt counter
109	 * five chances to increment before the hardlockup
110	 * detector generates a warning
111 */
112 return softlockup_thresh / 5 * NSEC_PER_SEC;
113}
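
Worked numbers for the two helpers above, as a quick sanity check: the shift by 30 trades a divide by NSEC_PER_SEC for a cheap approximation (2^30 ns ≈ 1.074 s), and with the default softlockup_thresh of 60 the sample period comes out to 12 seconds, i.e. five hrtimer ticks per soft-lockup window. A stand-alone sketch of that arithmetic:

/* Stand-alone check of the timestamp / sample-period arithmetic above;
 * the value 60 is the default softlockup_thresh from this file. */
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned long long now_ns = 90ULL * NSEC_PER_SEC; /* 90 s of cpu_clock() */
	unsigned long softlockup_thresh = 60;

	/* get_timestamp(): approximate seconds via a shift by 30 */
	printf("timestamp    : %llu (approx. seconds)\n", now_ns >> 30);

	/* get_sample_period(): the threshold split into five periods */
	printf("sample period: %llu ns\n",
	       softlockup_thresh / 5 * NSEC_PER_SEC);
	return 0;
}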
114
115/* Commands for resetting the watchdog */
116static void __touch_watchdog(void)
117{
118 int this_cpu = smp_processor_id();
119
120 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
121}
122
123void touch_softlockup_watchdog(void)
124{
125 __raw_get_cpu_var(watchdog_touch_ts) = 0;
126}
127EXPORT_SYMBOL(touch_softlockup_watchdog);
128
129void touch_all_softlockup_watchdogs(void)
130{
131 int cpu;
132
133 /*
134	 * this is done locklessly;
135	 * do we care if a 0 races with a timestamp?
136	 * all it means is that the softlockup check starts one cycle later
137 */
138 for_each_online_cpu(cpu)
139 per_cpu(watchdog_touch_ts, cpu) = 0;
140}
141
142#ifdef CONFIG_HARDLOCKUP_DETECTOR
143void touch_nmi_watchdog(void)
144{
145 if (watchdog_enabled) {
146 unsigned cpu;
147
148 for_each_present_cpu(cpu) {
149 if (per_cpu(watchdog_nmi_touch, cpu) != true)
150 per_cpu(watchdog_nmi_touch, cpu) = true;
151 }
152 }
153 touch_softlockup_watchdog();
154}
155EXPORT_SYMBOL(touch_nmi_watchdog);
156
157#endif
158
159void touch_softlockup_watchdog_sync(void)
160{
161 __raw_get_cpu_var(softlockup_touch_sync) = true;
162 __raw_get_cpu_var(watchdog_touch_ts) = 0;
163}
164
165#ifdef CONFIG_HARDLOCKUP_DETECTOR
166/* watchdog detector functions */
167static int is_hardlockup(void)
168{
169 unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
170
171 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
172 return 1;
173
174 __get_cpu_var(hrtimer_interrupts_saved) = hrint;
175 return 0;
176}
177#endif
178
179static int is_softlockup(unsigned long touch_ts)
180{
181 unsigned long now = get_timestamp(smp_processor_id());
182
183 /* Warn about unreasonable delays: */
184 if (time_after(now, touch_ts + softlockup_thresh))
185 return now - touch_ts;
186
187 return 0;
188}
189
190static int
191watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
192{
193 did_panic = 1;
194
195 return NOTIFY_DONE;
196}
197
198static struct notifier_block panic_block = {
199 .notifier_call = watchdog_panic,
200};
201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE,
205 .config = PERF_COUNT_HW_CPU_CYCLES,
206 .size = sizeof(struct perf_event_attr),
207 .pinned = 1,
208 .disabled = 1,
209};
210
211/* Callback function for perf event subsystem */
212void watchdog_overflow_callback(struct perf_event *event, int nmi,
213 struct perf_sample_data *data,
214 struct pt_regs *regs)
215{
216 /* Ensure the watchdog never gets throttled */
217 event->hw.interrupts = 0;
218
219 if (__get_cpu_var(watchdog_nmi_touch) == true) {
220 __get_cpu_var(watchdog_nmi_touch) = false;
221 return;
222 }
223
224 /* check for a hardlockup
225 * This is done by making sure our timer interrupt
226 * is incrementing. The timer interrupt should have
227	 * fired multiple times before we overflowed. If it hasn't,
228	 * that is a good indication the cpu is stuck
229 */
230 if (is_hardlockup()) {
231 int this_cpu = smp_processor_id();
232
233 /* only print hardlockups once */
234 if (__get_cpu_var(hard_watchdog_warn) == true)
235 return;
236
237 if (hardlockup_panic)
238 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
239 else
240 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
241
242 __get_cpu_var(hard_watchdog_warn) = true;
243 return;
244 }
245
246 __get_cpu_var(hard_watchdog_warn) = false;
247 return;
248}
249static void watchdog_interrupt_count(void)
250{
251 __get_cpu_var(hrtimer_interrupts)++;
252}
253#else
254static inline void watchdog_interrupt_count(void) { return; }
255#endif /* CONFIG_HARDLOCKUP_DETECTOR */
256
257/* watchdog kicker functions */
258static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
259{
260 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
261 struct pt_regs *regs = get_irq_regs();
262 int duration;
263
264 /* kick the hardlockup detector */
265 watchdog_interrupt_count();
266
267 /* kick the softlockup detector */
268 wake_up_process(__get_cpu_var(softlockup_watchdog));
269
270 /* .. and repeat */
271 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
272
273 if (touch_ts == 0) {
274 if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
275 /*
276 * If the time stamp was touched atomically
277 * make sure the scheduler tick is up to date.
278 */
279 __get_cpu_var(softlockup_touch_sync) = false;
280 sched_clock_tick();
281 }
282 __touch_watchdog();
283 return HRTIMER_RESTART;
284 }
285
286 /* check for a softlockup
287 * This is done by making sure a high priority task is
288 * being scheduled. The task touches the watchdog to
289	 * indicate it is getting cpu time. If it hasn't, that is
290	 * a good indication some task is hogging the cpu
291 */
292 duration = is_softlockup(touch_ts);
293 if (unlikely(duration)) {
294 /* only warn once */
295 if (__get_cpu_var(soft_watchdog_warn) == true)
296 return HRTIMER_RESTART;
297
298 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
299 smp_processor_id(), duration,
300 current->comm, task_pid_nr(current));
301 print_modules();
302 print_irqtrace_events(current);
303 if (regs)
304 show_regs(regs);
305 else
306 dump_stack();
307
308 if (softlockup_panic)
309 panic("softlockup: hung tasks");
310 __get_cpu_var(soft_watchdog_warn) = true;
311 } else
312 __get_cpu_var(soft_watchdog_warn) = false;
313
314 return HRTIMER_RESTART;
315}
316
317
318/*
319 * The watchdog thread - touches the timestamp.
320 */
321static int watchdog(void *unused)
322{
323 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
324 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
325
326 sched_setscheduler(current, SCHED_FIFO, &param);
327
328 /* initialize timestamp */
329 __touch_watchdog();
330
331 /* kick off the timer for the hardlockup detector */
332 /* done here because hrtimer_start can only pin to smp_processor_id() */
333 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
334 HRTIMER_MODE_REL_PINNED);
335
336 set_current_state(TASK_INTERRUPTIBLE);
337 /*
338 * Run briefly once per second to reset the softlockup timestamp.
339 * If this gets delayed for more than 60 seconds then the
340 * debug-printout triggers in watchdog_timer_fn().
341 */
342 while (!kthread_should_stop()) {
343 __touch_watchdog();
344 schedule();
345
346 if (kthread_should_stop())
347 break;
348
349 set_current_state(TASK_INTERRUPTIBLE);
350 }
351 __set_current_state(TASK_RUNNING);
352
353 return 0;
354}
355
356
357#ifdef CONFIG_HARDLOCKUP_DETECTOR
358static int watchdog_nmi_enable(int cpu)
359{
360 struct perf_event_attr *wd_attr;
361 struct perf_event *event = per_cpu(watchdog_ev, cpu);
362
363 /* is it already setup and enabled? */
364 if (event && event->state > PERF_EVENT_STATE_OFF)
365 goto out;
366
367 /* it is setup but not enabled */
368 if (event != NULL)
369 goto out_enable;
370
371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period();
374 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
375 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save;
378 }
379
380 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
381 return -1;
382
383 /* success path */
384out_save:
385 per_cpu(watchdog_ev, cpu) = event;
386out_enable:
387 perf_event_enable(per_cpu(watchdog_ev, cpu));
388out:
389 return 0;
390}
391
392static void watchdog_nmi_disable(int cpu)
393{
394 struct perf_event *event = per_cpu(watchdog_ev, cpu);
395
396 if (event) {
397 perf_event_disable(event);
398 per_cpu(watchdog_ev, cpu) = NULL;
399
400 /* should be in cleanup, but blocks oprofile */
401 perf_event_release_kernel(event);
402 }
403 return;
404}
405#else
406static int watchdog_nmi_enable(int cpu) { return 0; }
407static void watchdog_nmi_disable(int cpu) { return; }
408#endif /* CONFIG_HARDLOCKUP_DETECTOR */
409
410/* prepare/enable/disable routines */
411static int watchdog_prepare_cpu(int cpu)
412{
413 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
414
415 WARN_ON(per_cpu(softlockup_watchdog, cpu));
416 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
417 hrtimer->function = watchdog_timer_fn;
418
419 return 0;
420}
421
422static int watchdog_enable(int cpu)
423{
424 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
425
426 /* enable the perf event */
427 if (watchdog_nmi_enable(cpu) != 0)
428 return -1;
429
430 /* create the watchdog thread */
431 if (!p) {
432 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
433 if (IS_ERR(p)) {
434 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
435 return -1;
436 }
437 kthread_bind(p, cpu);
438 per_cpu(watchdog_touch_ts, cpu) = 0;
439 per_cpu(softlockup_watchdog, cpu) = p;
440 wake_up_process(p);
441 }
442
443 /* if any cpu succeeds, watchdog is considered enabled for the system */
444 watchdog_enabled = 1;
445
446 return 0;
447}
448
449static void watchdog_disable(int cpu)
450{
451 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
452 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
453
454 /*
455 * cancel the timer first to stop incrementing the stats
456 * and waking up the kthread
457 */
458 hrtimer_cancel(hrtimer);
459
460 /* disable the perf event */
461 watchdog_nmi_disable(cpu);
462
463 /* stop the watchdog thread */
464 if (p) {
465 per_cpu(softlockup_watchdog, cpu) = NULL;
466 kthread_stop(p);
467 }
468}
469
470static void watchdog_enable_all_cpus(void)
471{
472 int cpu;
473 int result = 0;
474
475 for_each_online_cpu(cpu)
476 result += watchdog_enable(cpu);
477
478 if (result)
479 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
480
481}
482
483static void watchdog_disable_all_cpus(void)
484{
485 int cpu;
486
487 for_each_online_cpu(cpu)
488 watchdog_disable(cpu);
489
490 /* if all watchdogs are disabled, then they are disabled for the system */
491 watchdog_enabled = 0;
492}
493
494
495/* sysctl functions */
496#ifdef CONFIG_SYSCTL
497/*
498 * proc handler for /proc/sys/kernel/nmi_watchdog
499 */
500
501int proc_dowatchdog_enabled(struct ctl_table *table, int write,
502 void __user *buffer, size_t *length, loff_t *ppos)
503{
504 proc_dointvec(table, write, buffer, length, ppos);
505
506 if (watchdog_enabled)
507 watchdog_enable_all_cpus();
508 else
509 watchdog_disable_all_cpus();
510 return 0;
511}
512
513int proc_dowatchdog_thresh(struct ctl_table *table, int write,
514 void __user *buffer,
515 size_t *lenp, loff_t *ppos)
516{
517 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
518}
519#endif /* CONFIG_SYSCTL */
520
521
522/*
523 * Create/destroy watchdog threads as CPUs come and go:
524 */
525static int __cpuinit
526cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
527{
528 int hotcpu = (unsigned long)hcpu;
529
530 switch (action) {
531 case CPU_UP_PREPARE:
532 case CPU_UP_PREPARE_FROZEN:
533 if (watchdog_prepare_cpu(hotcpu))
534 return NOTIFY_BAD;
535 break;
536 case CPU_ONLINE:
537 case CPU_ONLINE_FROZEN:
538 if (watchdog_enable(hotcpu))
539 return NOTIFY_BAD;
540 break;
541#ifdef CONFIG_HOTPLUG_CPU
542 case CPU_UP_CANCELED:
543 case CPU_UP_CANCELED_FROZEN:
544 watchdog_disable(hotcpu);
545 break;
546 case CPU_DEAD:
547 case CPU_DEAD_FROZEN:
548 watchdog_disable(hotcpu);
549 break;
550#endif /* CONFIG_HOTPLUG_CPU */
551 }
552 return NOTIFY_OK;
553}
554
555static struct notifier_block __cpuinitdata cpu_nfb = {
556 .notifier_call = cpu_callback
557};
558
559static int __init spawn_watchdog_task(void)
560{
561 void *cpu = (void *)(long)smp_processor_id();
562 int err;
563
564 if (no_watchdog)
565 return 0;
566
567 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
568 WARN_ON(err == NOTIFY_BAD);
569
570 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
571 register_cpu_notifier(&cpu_nfb);
572
573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
574
575 return 0;
576}
577early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..f77afd939229 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,19 +1,26 @@
1/* 1/*
2 * linux/kernel/workqueue.c 2 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 3 *
4 * Generic mechanism for defining kernel helper threads for running 4 * Copyright (C) 2002 Ingo Molnar
5 * arbitrary tasks in process context.
6 * 5 *
7 * Started by Ingo Molnar, Copyright (C) 2002 6 * Derived from the taskqueue/keventd code by:
7 * David Woodhouse <dwmw2@infradead.org>
8 * Andrew Morton
9 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 * Theodore Ts'o <tytso@mit.edu>
8 * 11 *
9 * Derived from the taskqueue/keventd code by: 12 * Made to use alloc_percpu by Christoph Lameter.
10 * 13 *
11 * David Woodhouse <dwmw2@infradead.org> 14 * Copyright (C) 2010 SUSE Linux Products GmbH
12 * Andrew Morton 15 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu>
15 * 16 *
16 * Made to use alloc_percpu by Christoph Lameter. 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and
20 * one extra for works which are better served by workers which are
21 * not bound to any specific CPU.
22 *
23 * Please read Documentation/workqueue.txt for details.
17 */ 24 */
18 25
19#include <linux/module.h> 26#include <linux/module.h>
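
The rewritten header describes the shared-worker-pool model from the implementation side; the queueing interface used by callers stays the familiar one. As a refresher, a minimal and purely illustrative module that queues a work item on the shared pool and flushes it on unload (the work function and module scaffolding are examples, not part of this patch):

/* Illustrative caller-side usage of the workqueue API implemented below. */
#include <linux/module.h>
#include <linux/workqueue.h>

static void example_fn(struct work_struct *work)
{
	pr_info("example work item ran in process context\n");
}

static DECLARE_WORK(example_work, example_fn);

static int __init example_init(void)
{
	schedule_work(&example_work);	/* runs on the shared worker pool */
	return 0;
}

static void __exit example_exit(void)
{
	flush_work(&example_work);	/* wait for it to finish, if queued */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");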
@@ -33,41 +40,291 @@
33#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h>
44
36#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
37#include <trace/events/workqueue.h> 46#include <trace/events/workqueue.h>
38 47
48#include "workqueue_sched.h"
49
50enum {
51 /* global_cwq flags */
52 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
53 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
54 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
55 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
56 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
57
58 /* worker flags */
59 WORKER_STARTED = 1 << 0, /* started */
60 WORKER_DIE = 1 << 1, /* die die die */
61 WORKER_IDLE = 1 << 2, /* is idle */
62 WORKER_PREP = 1 << 3, /* preparing to run works */
63 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
64 WORKER_REBIND = 1 << 5, /* mom is home, come back */
65 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
66 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
67
68 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
69 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
70
71 /* gcwq->trustee_state */
72 TRUSTEE_START = 0, /* start */
73 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
74 TRUSTEE_BUTCHER = 2, /* butcher workers */
75 TRUSTEE_RELEASE = 3, /* release workers */
76 TRUSTEE_DONE = 4, /* trustee is done */
77
78 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
79 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
80 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
81
82 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
83 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
84
85 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
86 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
87	CREATE_COOLDOWN = HZ, /* time to breathe after fail */
88 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
89
90 /*
91	 * Rescue workers are used only in emergencies and are shared by
92	 * all cpus. Give them a nice level of -20.
93 */
94 RESCUER_NICE_LEVEL = -20,
95};
96
97/*
98 * Structure fields follow one of the following exclusion rules.
99 *
100 * I: Modifiable by initialization/destruction paths and read-only for
101 * everyone else.
102 *
103 * P: Preemption protected. Disabling preemption is enough and should
104 * only be modified and accessed from the local cpu.
105 *
106 * L: gcwq->lock protected. Access with gcwq->lock held.
107 *
108 * X: During normal operation, modification requires gcwq->lock and
109 * should be done only from local cpu. Either disabling preemption
110 * on local cpu or grabbing gcwq->lock is enough for read access.
111 * If GCWQ_DISASSOCIATED is set, it's identical to L.
112 *
113 * F: wq->flush_mutex protected.
114 *
115 * W: workqueue_lock protected.
116 */
117
118struct global_cwq;
119
39/* 120/*
40 * The per-CPU workqueue (if single thread, we always use the first 121 * The poor guys doing the actual heavy lifting. All on-duty workers
41 * possible cpu). 122 * are either serving the manager role, on idle list or on busy hash.
42 */ 123 */
43struct cpu_workqueue_struct { 124struct worker {
125 /* on idle list while idle, on busy hash table while busy */
126 union {
127 struct list_head entry; /* L: while idle */
128 struct hlist_node hentry; /* L: while busy */
129 };
44 130
45 spinlock_t lock; 131 struct work_struct *current_work; /* L: work being processed */
132 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
133 struct list_head scheduled; /* L: scheduled works */
134 struct task_struct *task; /* I: worker task */
135 struct global_cwq *gcwq; /* I: the associated gcwq */
136 /* 64 bytes boundary on 64bit, 32 on 32bit */
137 unsigned long last_active; /* L: last active timestamp */
138 unsigned int flags; /* X: flags */
139 int id; /* I: worker id */
140 struct work_struct rebind_work; /* L: rebind worker to cpu */
141};
46 142
47 struct list_head worklist; 143/*
48 wait_queue_head_t more_work; 144 * Global per-cpu workqueue. There's one and only one for each cpu
49 struct work_struct *current_work; 145 * and all works are queued and processed here regardless of their
146 * target workqueues.
147 */
148struct global_cwq {
149 spinlock_t lock; /* the gcwq lock */
150 struct list_head worklist; /* L: list of pending works */
151 unsigned int cpu; /* I: the associated cpu */
152 unsigned int flags; /* L: GCWQ_* flags */
50 153
51 struct workqueue_struct *wq; 154 int nr_workers; /* L: total number of workers */
52 struct task_struct *thread; 155 int nr_idle; /* L: currently idle ones */
53} ____cacheline_aligned; 156
157 /* workers are chained either in the idle_list or busy_hash */
158 struct list_head idle_list; /* X: list of idle workers */
159 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
160 /* L: hash of busy workers */
161
162 struct timer_list idle_timer; /* L: worker idle timeout */
163	struct timer_list mayday_timer; /* L: SOS timer for workers */
164
165 struct ida worker_ida; /* L: for worker IDs */
166
167 struct task_struct *trustee; /* L: for gcwq shutdown */
168 unsigned int trustee_state; /* L: trustee state */
169 wait_queue_head_t trustee_wait; /* trustee wait */
170 struct worker *first_idle; /* L: first idle worker */
171} ____cacheline_aligned_in_smp;
172
173/*
174 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
175 * work_struct->data are used for flags and thus cwqs need to be
176 * aligned at two's power of the number of flag bits.
177 */
178struct cpu_workqueue_struct {
179 struct global_cwq *gcwq; /* I: the associated gcwq */
180 struct workqueue_struct *wq; /* I: the owning workqueue */
181 int work_color; /* L: current color */
182 int flush_color; /* L: flushing color */
183 int nr_in_flight[WORK_NR_COLORS];
184 /* L: nr of in_flight works */
185 int nr_active; /* L: nr of active works */
186 int max_active; /* L: max active works */
187 struct list_head delayed_works; /* L: delayed works */
188};
189
190/*
191 * Structure used to wait for workqueue flush.
192 */
193struct wq_flusher {
194 struct list_head list; /* F: list of flushers */
195 int flush_color; /* F: flush color waiting for */
196 struct completion done; /* flush completion */
197};
198
199/*
200 * All cpumasks are assumed to be always set on UP and thus can't be
201 * used to determine whether there's something to be done.
202 */
203#ifdef CONFIG_SMP
204typedef cpumask_var_t mayday_mask_t;
205#define mayday_test_and_set_cpu(cpu, mask) \
206 cpumask_test_and_set_cpu((cpu), (mask))
207#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
208#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
209#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
210#define free_mayday_mask(mask) free_cpumask_var((mask))
211#else
212typedef unsigned long mayday_mask_t;
213#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
214#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
215#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
216#define alloc_mayday_mask(maskp, gfp) true
217#define free_mayday_mask(mask) do { } while (0)
218#endif
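
These wrappers let the rescuer code treat the SMP cpumask and the single UP bit uniformly. A hedged sketch of how such a mask is typically driven (the function names are illustrative; the real users of mayday_mask live later in this file, outside the hunks shown here):

/* Illustrative only: producing and consuming a mayday mask through the
 * wrappers above.  wq->mayday_mask and wq->rescuer are real fields of
 * struct workqueue_struct; the two functions are sketches. */
static void example_send_mayday(struct workqueue_struct *wq, unsigned int cpu)
{
	/* record the cpu asking for help exactly once, then kick the rescuer */
	if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
		wake_up_process(wq->rescuer->task);
}

static void example_service_maydays(struct workqueue_struct *wq)
{
	unsigned int cpu;

	for_each_mayday_cpu(cpu, wq->mayday_mask) {
		mayday_clear_cpu(cpu, wq->mayday_mask);
		/* ... rescue the works destined for this cpu here ... */
	}
}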
54 219
55/* 220/*
56 * The externally visible workqueue abstraction is an array of 221 * The externally visible workqueue abstraction is an array of
57 * per-CPU workqueues: 222 * per-CPU workqueues:
58 */ 223 */
59struct workqueue_struct { 224struct workqueue_struct {
60 struct cpu_workqueue_struct *cpu_wq; 225 unsigned int flags; /* I: WQ_* flags */
61 struct list_head list; 226 union {
62 const char *name; 227 struct cpu_workqueue_struct __percpu *pcpu;
63 int singlethread; 228 struct cpu_workqueue_struct *single;
64 int freezeable; /* Freeze threads during suspend */ 229 unsigned long v;
65 int rt; 230 } cpu_wq; /* I: cwq's */
231 struct list_head list; /* W: list of all workqueues */
232
233 struct mutex flush_mutex; /* protects wq flushing */
234 int work_color; /* F: current work color */
235 int flush_color; /* F: current flush color */
236 atomic_t nr_cwqs_to_flush; /* flush in progress */
237 struct wq_flusher *first_flusher; /* F: first flusher */
238 struct list_head flusher_queue; /* F: flush waiters */
239 struct list_head flusher_overflow; /* F: flush overflow list */
240
241 mayday_mask_t mayday_mask; /* cpus requesting rescue */
242 struct worker *rescuer; /* I: rescue worker */
243
244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
66#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
67 struct lockdep_map lockdep_map; 247 struct lockdep_map lockdep_map;
68#endif 248#endif
69}; 249};
70 250
251struct workqueue_struct *system_wq __read_mostly;
252struct workqueue_struct *system_long_wq __read_mostly;
253struct workqueue_struct *system_nrt_wq __read_mostly;
254struct workqueue_struct *system_unbound_wq __read_mostly;
255EXPORT_SYMBOL_GPL(system_wq);
256EXPORT_SYMBOL_GPL(system_long_wq);
257EXPORT_SYMBOL_GPL(system_nrt_wq);
258EXPORT_SYMBOL_GPL(system_unbound_wq);
259
260#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
263
264static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
265 unsigned int sw)
266{
267 if (cpu < nr_cpu_ids) {
268 if (sw & 1) {
269 cpu = cpumask_next(cpu, mask);
270 if (cpu < nr_cpu_ids)
271 return cpu;
272 }
273 if (sw & 2)
274 return WORK_CPU_UNBOUND;
275 }
276 return WORK_CPU_NONE;
277}
278
279static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
280 struct workqueue_struct *wq)
281{
282 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
283}
284
285/*
286 * CPU iterators
287 *
288 * An extra gcwq is defined for an invalid cpu number
289 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
290 * specific CPU. The following iterators are similar to
291 * for_each_*_cpu() iterators but also consider the unbound gcwq.
292 *
293 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
294 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
295 * for_each_cwq_cpu() : possible CPUs for bound workqueues,
296 * WORK_CPU_UNBOUND for unbound workqueues
297 */
298#define for_each_gcwq_cpu(cpu) \
299 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
300 (cpu) < WORK_CPU_NONE; \
301 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
302
303#define for_each_online_gcwq_cpu(cpu) \
304 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
305 (cpu) < WORK_CPU_NONE; \
306 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
307
308#define for_each_cwq_cpu(cpu, wq) \
309 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
310 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
312
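
A hedged sketch of how the iterators above are meant to be used: walk every global cwq, including the unbound one, taking gcwq->lock as described by the exclusion rules earlier in this file. The function itself is illustrative, not part of the patch:

/* Illustrative traversal over all global cwqs, including the unbound one;
 * the body is a placeholder. */
static void example_walk_gcwqs(void)
{
	unsigned int cpu;

	for_each_gcwq_cpu(cpu) {
		struct global_cwq *gcwq = get_gcwq(cpu);

		spin_lock_irq(&gcwq->lock);
		/* ... inspect gcwq->worklist, gcwq->nr_workers, ... */
		spin_unlock_irq(&gcwq->lock);
	}
}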
313#ifdef CONFIG_LOCKDEP
314/**
315 * in_workqueue_context() - in context of specified workqueue?
316 * @wq: the workqueue of interest
317 *
318 * Checks lockdep state to see if the current task is executing from
319 * within a workqueue item. This function exists only if lockdep is
320 * enabled.
321 */
322int in_workqueue_context(struct workqueue_struct *wq)
323{
324 return lock_is_held(&wq->lockdep_map);
325}
326#endif
327
71#ifdef CONFIG_DEBUG_OBJECTS_WORK 328#ifdef CONFIG_DEBUG_OBJECTS_WORK
72 329
73static struct debug_obj_descr work_debug_descr; 330static struct debug_obj_descr work_debug_descr;
@@ -107,7 +364,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
107 * statically initialized. We just make sure that it 364 * statically initialized. We just make sure that it
108 * is tracked in the object tracker. 365 * is tracked in the object tracker.
109 */ 366 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { 367 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr); 368 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr); 369 debug_object_activate(work, &work_debug_descr);
113 return 0; 370 return 0;
@@ -181,94 +438,582 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
181/* Serializes the accesses to the list of workqueues. */ 438/* Serializes the accesses to the list of workqueues. */
182static DEFINE_SPINLOCK(workqueue_lock); 439static DEFINE_SPINLOCK(workqueue_lock);
183static LIST_HEAD(workqueues); 440static LIST_HEAD(workqueues);
441static bool workqueue_freezing; /* W: have wqs started freezing? */
184 442
185static int singlethread_cpu __read_mostly;
186static const struct cpumask *cpu_singlethread_map __read_mostly;
187/* 443/*
188 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 444 * The almighty global cpu workqueues. nr_running is the only field
189 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 445 * which is expected to be used frequently by other cpus via
190 * which comes in between can't use for_each_online_cpu(). We could 446 * try_to_wake_up(). Put it in a separate cacheline.
191 * use cpu_possible_map, the cpumask below is more a documentation
192 * than optimization.
193 */ 447 */
194static cpumask_var_t cpu_populated_map __read_mostly; 448static DEFINE_PER_CPU(struct global_cwq, global_cwq);
449static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
450
451/*
452 * Global cpu workqueue and nr_running counter for unbound gcwq. The
453 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
454 * workers have WORKER_UNBOUND set.
455 */
456static struct global_cwq unbound_global_cwq;
457static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
458
459static int worker_thread(void *__worker);
460
461static struct global_cwq *get_gcwq(unsigned int cpu)
462{
463 if (cpu != WORK_CPU_UNBOUND)
464 return &per_cpu(global_cwq, cpu);
465 else
466 return &unbound_global_cwq;
467}
195 468
196/* If it's single threaded, it isn't in the list of workqueues. */ 469static atomic_t *get_gcwq_nr_running(unsigned int cpu)
197static inline int is_wq_single_threaded(struct workqueue_struct *wq)
198{ 470{
199 return wq->singlethread; 471 if (cpu != WORK_CPU_UNBOUND)
472 return &per_cpu(gcwq_nr_running, cpu);
473 else
474 return &unbound_gcwq_nr_running;
475}
476
477static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
478 struct workqueue_struct *wq)
479{
480 if (!(wq->flags & WQ_UNBOUND)) {
481 if (likely(cpu < nr_cpu_ids)) {
482#ifdef CONFIG_SMP
483 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
484#else
485 return wq->cpu_wq.single;
486#endif
487 }
488 } else if (likely(cpu == WORK_CPU_UNBOUND))
489 return wq->cpu_wq.single;
490 return NULL;
200} 491}
201 492
202static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) 493static unsigned int work_color_to_flags(int color)
203{ 494{
204 return is_wq_single_threaded(wq) 495 return color << WORK_STRUCT_COLOR_SHIFT;
205 ? cpu_singlethread_map : cpu_populated_map;
206} 496}
207 497
208static 498static int get_work_color(struct work_struct *work)
209struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
210{ 499{
211 if (unlikely(is_wq_single_threaded(wq))) 500 return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
212 cpu = singlethread_cpu; 501 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
213 return per_cpu_ptr(wq->cpu_wq, cpu); 502}
503
504static int work_next_color(int color)
505{
506 return (color + 1) % WORK_NR_COLORS;
214} 507}
215 508
216/* 509/*
217 * Set the workqueue on which a work item is to be run 510 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
218 * - Must *only* be called if the pending flag is set 511 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
512 * cleared and the work data contains the cpu number it was last on.
513 *
514 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
515 * cwq, cpu or clear work->data. These functions should only be
516 * called while the work is owned - ie. while the PENDING bit is set.
517 *
518 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
519 * corresponding to a work. gcwq is available once the work has been
520 * queued anywhere after initialization. cwq is available only from
521 * queueing until execution starts.
219 */ 522 */
220static inline void set_wq_data(struct work_struct *work, 523static inline void set_work_data(struct work_struct *work, unsigned long data,
221 struct cpu_workqueue_struct *cwq) 524 unsigned long flags)
222{ 525{
223 unsigned long new;
224
225 BUG_ON(!work_pending(work)); 526 BUG_ON(!work_pending(work));
527 atomic_long_set(&work->data, data | flags | work_static(work));
528}
226 529
227 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); 530static void set_work_cwq(struct work_struct *work,
228 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 531 struct cpu_workqueue_struct *cwq,
229 atomic_long_set(&work->data, new); 532 unsigned long extra_flags)
533{
534 set_work_data(work, (unsigned long)cwq,
535 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
536}
537
538static void set_work_cpu(struct work_struct *work, unsigned int cpu)
539{
540 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
541}
542
543static void clear_work_data(struct work_struct *work)
544{
545 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
546}
547
548static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
549{
550 unsigned long data = atomic_long_read(&work->data);
551
552 if (data & WORK_STRUCT_CWQ)
553 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
554 else
555 return NULL;
556}
557
558static struct global_cwq *get_work_gcwq(struct work_struct *work)
559{
560 unsigned long data = atomic_long_read(&work->data);
561 unsigned int cpu;
562
563 if (data & WORK_STRUCT_CWQ)
564 return ((struct cpu_workqueue_struct *)
565 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
566
567 cpu = data >> WORK_STRUCT_FLAG_BITS;
568 if (cpu == WORK_CPU_NONE)
569 return NULL;
570
571 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
572 return get_gcwq(cpu);
573}
574
575/*
576 * Policy functions. These define the policies on how the global
577 * worker pool is managed. Unless noted otherwise, these functions
578 * assume that they're being called with gcwq->lock held.
579 */
580
581static bool __need_more_worker(struct global_cwq *gcwq)
582{
583 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
584 gcwq->flags & GCWQ_HIGHPRI_PENDING;
230} 585}
231 586
232/* 587/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. 588 * Need to wake up a worker? Called from anything but currently
589 * running workers.
234 */ 590 */
235static inline void clear_wq_data(struct work_struct *work) 591static bool need_more_worker(struct global_cwq *gcwq)
236{ 592{
237 unsigned long flags = *work_data_bits(work) & 593 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240} 594}
241 595
242static inline 596/* Can I start working? Called from busy but !running workers. */
243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 597static bool may_start_working(struct global_cwq *gcwq)
244{ 598{
245 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 599 return gcwq->nr_idle;
246} 600}
247 601
602/* Do I need to keep working? Called from currently running workers. */
603static bool keep_working(struct global_cwq *gcwq)
604{
605 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
606
607 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
608}
609
610/* Do we need a new worker? Called from manager. */
611static bool need_to_create_worker(struct global_cwq *gcwq)
612{
613 return need_more_worker(gcwq) && !may_start_working(gcwq);
614}
615
616/* Do I need to be the manager? */
617static bool need_to_manage_workers(struct global_cwq *gcwq)
618{
619 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
620}
621
622/* Do we have too many workers and should some go away? */
623static bool too_many_workers(struct global_cwq *gcwq)
624{
625 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
626 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
627 int nr_busy = gcwq->nr_workers - nr_idle;
628
629 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
630}
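To see when the heuristic above fires, here is a minimal userspace model of the same predicate, assuming MAX_IDLE_WORKERS_RATIO is 4 (beyond the two idle workers that are always kept, roughly a quarter of the busy count may sit idle). The ratio is an illustrative assumption, not an authoritative value.

/*
 * Userspace model of too_many_workers(): always tolerate 2 idle workers,
 * and beyond that allow at most nr_busy / RATIO idle ones.
 * MAX_IDLE_WORKERS_RATIO == 4 is assumed for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_IDLE_WORKERS_RATIO	4

static bool too_many_workers(int nr_workers, int nr_idle)
{
	int nr_busy = nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

int main(void)
{
	/* 10 workers, 3 idle: (3 - 2) * 4 = 4 >= 7 busy? no -> keep them */
	printf("%d\n", too_many_workers(10, 3));	/* prints 0 */
	/* 10 workers, 5 idle: (5 - 2) * 4 = 12 >= 5 busy? yes -> trim */
	printf("%d\n", too_many_workers(10, 5));	/* prints 1 */
	return 0;
}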
631
632/*
633 * Wake up functions.
634 */
635
636/* Return the first idle worker. Safe with preemption disabled */
637static struct worker *first_worker(struct global_cwq *gcwq)
638{
639 if (unlikely(list_empty(&gcwq->idle_list)))
640 return NULL;
641
642 return list_first_entry(&gcwq->idle_list, struct worker, entry);
643}
644
645/**
646 * wake_up_worker - wake up an idle worker
647 * @gcwq: gcwq to wake worker for
648 *
649 * Wake up the first idle worker of @gcwq.
650 *
651 * CONTEXT:
652 * spin_lock_irq(gcwq->lock).
653 */
654static void wake_up_worker(struct global_cwq *gcwq)
655{
656 struct worker *worker = first_worker(gcwq);
657
658 if (likely(worker))
659 wake_up_process(worker->task);
660}
661
662/**
663 * wq_worker_waking_up - a worker is waking up
664 * @task: task waking up
665 * @cpu: CPU @task is waking up to
666 *
667 * This function is called during try_to_wake_up() when a worker is
668 * being awoken.
669 *
670 * CONTEXT:
671 * spin_lock_irq(rq->lock)
672 */
673void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
674{
675 struct worker *worker = kthread_data(task);
676
677 if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
678 atomic_inc(get_gcwq_nr_running(cpu));
679}
680
681/**
682 * wq_worker_sleeping - a worker is going to sleep
683 * @task: task going to sleep
684 * @cpu: CPU in question, must be the current CPU number
685 *
686 * This function is called during schedule() when a busy worker is
687 * going to sleep. A worker on the same cpu can be woken up by
688 * returning a pointer to its task.
689 *
690 * CONTEXT:
691 * spin_lock_irq(rq->lock)
692 *
693 * RETURNS:
694 * Worker task on @cpu to wake up, %NULL if none.
695 */
696struct task_struct *wq_worker_sleeping(struct task_struct *task,
697 unsigned int cpu)
698{
699 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
700 struct global_cwq *gcwq = get_gcwq(cpu);
701 atomic_t *nr_running = get_gcwq_nr_running(cpu);
702
703 if (unlikely(worker->flags & WORKER_NOT_RUNNING))
704 return NULL;
705
706 /* this can only happen on the local cpu */
707 BUG_ON(cpu != raw_smp_processor_id());
708
709 /*
710 * The counterpart of the following dec_and_test, implied mb,
711 * worklist not empty test sequence is in insert_work().
712 * Please read comment there.
713 *
714 * NOT_RUNNING is clear. This means that the trustee is not in
715 * charge and we're running on the local cpu w/ rq lock held
716 * and preemption disabled, which in turn means that no one else
717 * could be manipulating idle_list, so dereferencing idle_list
718 * without gcwq lock is safe.
719 */
720 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
721 to_wakeup = first_worker(gcwq);
722 return to_wakeup ? to_wakeup->task : NULL;
723}
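The wake-up decision above boils down to a dec-and-test on nr_running plus a worklist check. The sketch below models just that counting pattern with C11 atomics in userspace; it says nothing about the real locking, memory-barrier pairing or scheduler hooks involved.

/*
 * Userspace model of the wq_worker_sleeping() counting pattern:
 * atomic_fetch_sub() returns the previous value, so "== 1" plays the
 * role of atomic_dec_and_test() -- the caller is the last running
 * worker -- and a non-empty worklist means an idle worker must be woken.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int nr_running;

static bool sleeping_worker_must_wake_peer(int worklist_len)
{
	return atomic_fetch_sub(&nr_running, 1) == 1 && worklist_len > 0;
}

int main(void)
{
	atomic_store(&nr_running, 2);
	printf("%d\n", sleeping_worker_must_wake_peer(3)); /* 0: another worker still runs */
	printf("%d\n", sleeping_worker_must_wake_peer(3)); /* 1: last runner, work pending */
	return 0;
}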
724
725/**
726 * worker_set_flags - set worker flags and adjust nr_running accordingly
727 * @worker: self
728 * @flags: flags to set
729 * @wakeup: wakeup an idle worker if necessary
730 *
731 * Set @flags in @worker->flags and adjust nr_running accordingly. If
732 * nr_running becomes zero and @wakeup is %true, an idle worker is
733 * woken up.
734 *
735 * CONTEXT:
736 * spin_lock_irq(gcwq->lock)
737 */
738static inline void worker_set_flags(struct worker *worker, unsigned int flags,
739 bool wakeup)
740{
741 struct global_cwq *gcwq = worker->gcwq;
742
743 WARN_ON_ONCE(worker->task != current);
744
745 /*
746 * If transitioning into NOT_RUNNING, adjust nr_running and
747 * wake up an idle worker as necessary if requested by
748 * @wakeup.
749 */
750 if ((flags & WORKER_NOT_RUNNING) &&
751 !(worker->flags & WORKER_NOT_RUNNING)) {
752 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
753
754 if (wakeup) {
755 if (atomic_dec_and_test(nr_running) &&
756 !list_empty(&gcwq->worklist))
757 wake_up_worker(gcwq);
758 } else
759 atomic_dec(nr_running);
760 }
761
762 worker->flags |= flags;
763}
764
765/**
766 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
767 * @worker: self
768 * @flags: flags to clear
769 *
770 * Clear @flags in @worker->flags and adjust nr_running accordingly.
771 *
772 * CONTEXT:
773 * spin_lock_irq(gcwq->lock)
774 */
775static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
776{
777 struct global_cwq *gcwq = worker->gcwq;
778 unsigned int oflags = worker->flags;
779
780 WARN_ON_ONCE(worker->task != current);
781
782 worker->flags &= ~flags;
783
784 /* if transitioning out of NOT_RUNNING, increment nr_running */
785 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
786 if (!(worker->flags & WORKER_NOT_RUNNING))
787 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
788}
789
790/**
791 * busy_worker_head - return the busy hash head for a work
792 * @gcwq: gcwq of interest
793 * @work: work to be hashed
794 *
795 * Return hash head of @gcwq for @work.
796 *
797 * CONTEXT:
798 * spin_lock_irq(gcwq->lock).
799 *
800 * RETURNS:
801 * Pointer to the hash head.
802 */
803static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
804 struct work_struct *work)
805{
806 const int base_shift = ilog2(sizeof(struct work_struct));
807 unsigned long v = (unsigned long)work;
808
809 /* simple shift and fold hash, do we need something better? */
810 v >>= base_shift;
811 v += v >> BUSY_WORKER_HASH_ORDER;
812 v &= BUSY_WORKER_HASH_MASK;
813
814 return &gcwq->busy_hash[v];
815}
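For reference, the shift-and-fold hash can be exercised on its own. The sketch below assumes a 64-bucket table (hash order 6) and stands in for ilog2(sizeof(struct work_struct)) with a plain constant; both are illustrative assumptions rather than kernel values.

/*
 * Standalone version of the busy-worker hash above: shift away the low
 * address bits that barely vary between work_structs, fold the upper
 * bits in once and mask down to the table size.
 */
#include <stdio.h>

#define BUSY_WORKER_HASH_ORDER	6
#define BUSY_WORKER_HASH_SIZE	(1 << BUSY_WORKER_HASH_ORDER)
#define BUSY_WORKER_HASH_MASK	(BUSY_WORKER_HASH_SIZE - 1)

struct work_struct { unsigned long data; void *prev, *next; void (*func)(void); };

static unsigned int busy_worker_hash(const struct work_struct *work)
{
	const int base_shift = 5;	/* assumed ilog2(sizeof(*work)) */
	unsigned long v = (unsigned long)work;

	v >>= base_shift;
	v += v >> BUSY_WORKER_HASH_ORDER;
	return v & BUSY_WORKER_HASH_MASK;
}

int main(void)
{
	struct work_struct works[4];

	for (int i = 0; i < 4; i++)
		printf("work %d -> bucket %u\n", i, busy_worker_hash(&works[i]));
	return 0;
}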
816
817/**
818 * __find_worker_executing_work - find worker which is executing a work
819 * @gcwq: gcwq of interest
820 * @bwh: hash head as returned by busy_worker_head()
821 * @work: work to find worker for
822 *
823 * Find a worker which is executing @work on @gcwq. @bwh should be
824 * the hash head obtained by calling busy_worker_head() with the same
825 * work.
826 *
827 * CONTEXT:
828 * spin_lock_irq(gcwq->lock).
829 *
830 * RETURNS:
831 * Pointer to worker which is executing @work if found, NULL
832 * otherwise.
833 */
834static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
835 struct hlist_head *bwh,
836 struct work_struct *work)
837{
838 struct worker *worker;
839 struct hlist_node *tmp;
840
841 hlist_for_each_entry(worker, tmp, bwh, hentry)
842 if (worker->current_work == work)
843 return worker;
844 return NULL;
845}
846
847/**
848 * find_worker_executing_work - find worker which is executing a work
849 * @gcwq: gcwq of interest
850 * @work: work to find worker for
851 *
852 * Find a worker which is executing @work on @gcwq. This function is
853 * identical to __find_worker_executing_work() except that this
854 * function calculates @bwh itself.
855 *
856 * CONTEXT:
857 * spin_lock_irq(gcwq->lock).
858 *
859 * RETURNS:
860 * Pointer to worker which is executing @work if found, NULL
861 * otherwise.
862 */
863static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
864 struct work_struct *work)
865{
866 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
867 work);
868}
869
870/**
871 * gcwq_determine_ins_pos - find insertion position
872 * @gcwq: gcwq of interest
873 * @cwq: cwq a work is being queued for
874 *
875 * A work for @cwq is about to be queued on @gcwq, determine insertion
876 * position for the work. If @cwq is for HIGHPRI wq, the work is
877 * queued at the head of the queue but in FIFO order with respect to
878 * other HIGHPRI works; otherwise, at the end of the queue. This
879 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
880 * there are HIGHPRI works pending.
881 *
882 * CONTEXT:
883 * spin_lock_irq(gcwq->lock).
884 *
885 * RETURNS:
886 * Pointer to insertion position.
887 */
888static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
889 struct cpu_workqueue_struct *cwq)
890{
891 struct work_struct *twork;
892
893 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
894 return &gcwq->worklist;
895
896 list_for_each_entry(twork, &gcwq->worklist, entry) {
897 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
898
899 if (!(tcwq->wq->flags & WQ_HIGHPRI))
900 break;
901 }
902
903 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
904 return &twork->entry;
905}
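The head-but-FIFO rule above is easy to see on a plain array: a new high-priority element goes ahead of every normal element but behind the high-priority elements already queued. The following is a simplified userspace model of only that ordering rule, not of the gcwq worklist itself.

/*
 * Userspace model of the HIGHPRI insertion rule: high-priority works
 * jump ahead of normal ones but keep FIFO order among themselves.
 */
#include <stdio.h>
#include <string.h>

#define HI	1
#define LO	0

struct item { int prio; char name[8]; };

/* insert where gcwq_determine_ins_pos() would: tail for normal work,
 * right after the last queued HIGHPRI work for HIGHPRI work */
static void enqueue(struct item *q, int *len, int prio, const char *name)
{
	int pos = *len;

	if (prio == HI) {
		pos = 0;
		while (pos < *len && q[pos].prio == HI)
			pos++;
	}
	memmove(&q[pos + 1], &q[pos], (*len - pos) * sizeof(*q));
	q[pos].prio = prio;
	snprintf(q[pos].name, sizeof(q[pos].name), "%s", name);
	(*len)++;
}

int main(void)
{
	struct item q[8];
	int len = 0;

	enqueue(q, &len, LO, "n1");
	enqueue(q, &len, LO, "n2");
	enqueue(q, &len, HI, "h1");
	enqueue(q, &len, HI, "h2");

	for (int i = 0; i < len; i++)	/* prints: h1 h2 n1 n2 */
		printf("%s ", q[i].name);
	printf("\n");
	return 0;
}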
906
907/**
908 * insert_work - insert a work into gcwq
909 * @cwq: cwq @work belongs to
910 * @work: work to insert
911 * @head: insertion point
912 * @extra_flags: extra WORK_STRUCT_* flags to set
913 *
914 * Insert @work which belongs to @cwq into @gcwq after @head.
915 * @extra_flags is or'd to work_struct flags.
916 *
917 * CONTEXT:
918 * spin_lock_irq(gcwq->lock).
919 */
248static void insert_work(struct cpu_workqueue_struct *cwq, 920static void insert_work(struct cpu_workqueue_struct *cwq,
249 struct work_struct *work, struct list_head *head) 921 struct work_struct *work, struct list_head *head,
922 unsigned int extra_flags)
250{ 923{
251 trace_workqueue_insertion(cwq->thread, work); 924 struct global_cwq *gcwq = cwq->gcwq;
925
926 /* we own @work, set data and link */
927 set_work_cwq(work, cwq, extra_flags);
252 928
253 set_wq_data(work, cwq);
254 /* 929 /*
255 * Ensure that we get the right work->data if we see the 930 * Ensure that we get the right work->data if we see the
256 * result of list_add() below, see try_to_grab_pending(). 931 * result of list_add() below, see try_to_grab_pending().
257 */ 932 */
258 smp_wmb(); 933 smp_wmb();
934
259 list_add_tail(&work->entry, head); 935 list_add_tail(&work->entry, head);
260 wake_up(&cwq->more_work); 936
937 /*
938 * Ensure either worker_sched_deactivated() sees the above
939 * list_add_tail() or we see zero nr_running to avoid workers
940 * lying around lazily while there are works to be processed.
941 */
942 smp_mb();
943
944 if (__need_more_worker(gcwq))
945 wake_up_worker(gcwq);
261} 946}
262 947
263static void __queue_work(struct cpu_workqueue_struct *cwq, 948static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
264 struct work_struct *work) 949 struct work_struct *work)
265{ 950{
951 struct global_cwq *gcwq;
952 struct cpu_workqueue_struct *cwq;
953 struct list_head *worklist;
954 unsigned int work_flags;
266 unsigned long flags; 955 unsigned long flags;
267 956
268 debug_work_activate(work); 957 debug_work_activate(work);
269 spin_lock_irqsave(&cwq->lock, flags); 958
270 insert_work(cwq, work, &cwq->worklist); 959 if (WARN_ON_ONCE(wq->flags & WQ_DYING))
271 spin_unlock_irqrestore(&cwq->lock, flags); 960 return;
961
962 /* determine gcwq to use */
963 if (!(wq->flags & WQ_UNBOUND)) {
964 struct global_cwq *last_gcwq;
965
966 if (unlikely(cpu == WORK_CPU_UNBOUND))
967 cpu = raw_smp_processor_id();
968
969 /*
970 * It's multi cpu. If @wq is non-reentrant and @work
971 * was previously on a different cpu, it might still
972 * be running there, in which case the work needs to
973 * be queued on that cpu to guarantee non-reentrance.
974 */
975 gcwq = get_gcwq(cpu);
976 if (wq->flags & WQ_NON_REENTRANT &&
977 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
978 struct worker *worker;
979
980 spin_lock_irqsave(&last_gcwq->lock, flags);
981
982 worker = find_worker_executing_work(last_gcwq, work);
983
984 if (worker && worker->current_cwq->wq == wq)
985 gcwq = last_gcwq;
986 else {
987 /* meh... not running there, queue here */
988 spin_unlock_irqrestore(&last_gcwq->lock, flags);
989 spin_lock_irqsave(&gcwq->lock, flags);
990 }
991 } else
992 spin_lock_irqsave(&gcwq->lock, flags);
993 } else {
994 gcwq = get_gcwq(WORK_CPU_UNBOUND);
995 spin_lock_irqsave(&gcwq->lock, flags);
996 }
997
998 /* gcwq determined, get cwq and queue */
999 cwq = get_cwq(gcwq->cpu, wq);
1000
1001 BUG_ON(!list_empty(&work->entry));
1002
1003 cwq->nr_in_flight[cwq->work_color]++;
1004 work_flags = work_color_to_flags(cwq->work_color);
1005
1006 if (likely(cwq->nr_active < cwq->max_active)) {
1007 cwq->nr_active++;
1008 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1009 } else {
1010 work_flags |= WORK_STRUCT_DELAYED;
1011 worklist = &cwq->delayed_works;
1012 }
1013
1014 insert_work(cwq, work, worklist, work_flags);
1015
1016 spin_unlock_irqrestore(&gcwq->lock, flags);
272} 1017}
273 1018
274/** 1019/**
@@ -308,9 +1053,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
308{ 1053{
309 int ret = 0; 1054 int ret = 0;
310 1055
311 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1056 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
312 BUG_ON(!list_empty(&work->entry)); 1057 __queue_work(cpu, wq, work);
313 __queue_work(wq_per_cpu(wq, cpu), work);
314 ret = 1; 1058 ret = 1;
315 } 1059 }
316 return ret; 1060 return ret;
@@ -320,10 +1064,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
320static void delayed_work_timer_fn(unsigned long __data) 1064static void delayed_work_timer_fn(unsigned long __data)
321{ 1065{
322 struct delayed_work *dwork = (struct delayed_work *)__data; 1066 struct delayed_work *dwork = (struct delayed_work *)__data;
323 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 1067 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
324 struct workqueue_struct *wq = cwq->wq;
325 1068
326 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); 1069 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
327} 1070}
328 1071
329/** 1072/**
@@ -360,14 +1103,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
360 struct timer_list *timer = &dwork->timer; 1103 struct timer_list *timer = &dwork->timer;
361 struct work_struct *work = &dwork->work; 1104 struct work_struct *work = &dwork->work;
362 1105
363 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 1106 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1107 unsigned int lcpu;
1108
364 BUG_ON(timer_pending(timer)); 1109 BUG_ON(timer_pending(timer));
365 BUG_ON(!list_empty(&work->entry)); 1110 BUG_ON(!list_empty(&work->entry));
366 1111
367 timer_stats_timer_set_start_info(&dwork->timer); 1112 timer_stats_timer_set_start_info(&dwork->timer);
368 1113
369 /* This stores cwq for the moment, for the timer_fn */ 1114 /*
370 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 1115 * This stores cwq for the moment, for the timer_fn.
1116 * Note that the work's gcwq is preserved to allow
1117 * reentrance detection for delayed works.
1118 */
1119 if (!(wq->flags & WQ_UNBOUND)) {
1120 struct global_cwq *gcwq = get_work_gcwq(work);
1121
1122 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1123 lcpu = gcwq->cpu;
1124 else
1125 lcpu = raw_smp_processor_id();
1126 } else
1127 lcpu = WORK_CPU_UNBOUND;
1128
1129 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1130
371 timer->expires = jiffies + delay; 1131 timer->expires = jiffies + delay;
372 timer->data = (unsigned long)dwork; 1132 timer->data = (unsigned long)dwork;
373 timer->function = delayed_work_timer_fn; 1133 timer->function = delayed_work_timer_fn;
@@ -382,80 +1142,888 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
382} 1142}
383EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1143EXPORT_SYMBOL_GPL(queue_delayed_work_on);
384 1144
385static void run_workqueue(struct cpu_workqueue_struct *cwq) 1145/**
1146 * worker_enter_idle - enter idle state
1147 * @worker: worker which is entering idle state
1148 *
1149 * @worker is entering idle state. Update stats and idle timer if
1150 * necessary.
1151 *
1152 * LOCKING:
1153 * spin_lock_irq(gcwq->lock).
1154 */
1155static void worker_enter_idle(struct worker *worker)
386{ 1156{
387 spin_lock_irq(&cwq->lock); 1157 struct global_cwq *gcwq = worker->gcwq;
388 while (!list_empty(&cwq->worklist)) { 1158
389 struct work_struct *work = list_entry(cwq->worklist.next, 1159 BUG_ON(worker->flags & WORKER_IDLE);
390 struct work_struct, entry); 1160 BUG_ON(!list_empty(&worker->entry) &&
391 work_func_t f = work->func; 1161 (worker->hentry.next || worker->hentry.pprev));
392#ifdef CONFIG_LOCKDEP 1162
1163 /* can't use worker_set_flags(), also called from start_worker() */
1164 worker->flags |= WORKER_IDLE;
1165 gcwq->nr_idle++;
1166 worker->last_active = jiffies;
1167
1168 /* idle_list is LIFO */
1169 list_add(&worker->entry, &gcwq->idle_list);
1170
1171 if (likely(!(worker->flags & WORKER_ROGUE))) {
1172 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1173 mod_timer(&gcwq->idle_timer,
1174 jiffies + IDLE_WORKER_TIMEOUT);
1175 } else
1176 wake_up_all(&gcwq->trustee_wait);
1177
1178 /* sanity check nr_running */
1179 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1180 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1181}
1182
1183/**
1184 * worker_leave_idle - leave idle state
1185 * @worker: worker which is leaving idle state
1186 *
1187 * @worker is leaving idle state. Update stats.
1188 *
1189 * LOCKING:
1190 * spin_lock_irq(gcwq->lock).
1191 */
1192static void worker_leave_idle(struct worker *worker)
1193{
1194 struct global_cwq *gcwq = worker->gcwq;
1195
1196 BUG_ON(!(worker->flags & WORKER_IDLE));
1197 worker_clr_flags(worker, WORKER_IDLE);
1198 gcwq->nr_idle--;
1199 list_del_init(&worker->entry);
1200}
1201
1202/**
1203 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1204 * @worker: self
1205 *
1206 * Works which are scheduled while the cpu is online must at least be
1207 * scheduled to a worker which is bound to the cpu so that if they are
1208 * flushed from cpu callbacks while cpu is going down, they are
1209 * guaranteed to execute on the cpu.
1210 *
1211 * This function is to be used by rogue workers and rescuers to bind
1212 * themselves to the target cpu and may race with cpu going down or
1213 * coming online. kthread_bind() can't be used because it may put the
1214 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1215 * verbatim as it's best effort and blocking and gcwq may be
1216 * [dis]associated in the meantime.
1217 *
1218 * This function tries set_cpus_allowed_ptr() and locks gcwq and verifies
1219 * the binding against GCWQ_DISASSOCIATED which is set during
1220 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1221 * idle state or fetches works without dropping lock, it can guarantee
1222 * the scheduling requirement described in the first paragraph.
1223 *
1224 * CONTEXT:
1225 * Might sleep. Called without any lock but returns with gcwq->lock
1226 * held.
1227 *
1228 * RETURNS:
1229 * %true if the associated gcwq is online (@worker is successfully
1230 * bound), %false if offline.
1231 */
1232static bool worker_maybe_bind_and_lock(struct worker *worker)
1233__acquires(&gcwq->lock)
1234{
1235 struct global_cwq *gcwq = worker->gcwq;
1236 struct task_struct *task = worker->task;
1237
1238 while (true) {
393 /* 1239 /*
394 * It is permissible to free the struct work_struct 1240 * The following call may fail, succeed or succeed
395 * from inside the function that is called from it, 1241 * without actually migrating the task to the cpu if
396 * this we need to take into account for lockdep too. 1242 * it races with cpu hotunplug operation. Verify
397 * To avoid bogus "held lock freed" warnings as well 1243 * against GCWQ_DISASSOCIATED.
398 * as problems when looking into work->lockdep_map,
399 * make a copy and use that here.
400 */ 1244 */
401 struct lockdep_map lockdep_map = work->lockdep_map; 1245 if (!(gcwq->flags & GCWQ_DISASSOCIATED))
402#endif 1246 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
403 trace_workqueue_execution(cwq->thread, work); 1247
404 debug_work_deactivate(work); 1248 spin_lock_irq(&gcwq->lock);
405 cwq->current_work = work; 1249 if (gcwq->flags & GCWQ_DISASSOCIATED)
406 list_del_init(cwq->worklist.next); 1250 return false;
407 spin_unlock_irq(&cwq->lock); 1251 if (task_cpu(task) == gcwq->cpu &&
408 1252 cpumask_equal(&current->cpus_allowed,
409 BUG_ON(get_wq_data(work) != cwq); 1253 get_cpu_mask(gcwq->cpu)))
410 work_clear_pending(work); 1254 return true;
411 lock_map_acquire(&cwq->wq->lockdep_map); 1255 spin_unlock_irq(&gcwq->lock);
412 lock_map_acquire(&lockdep_map); 1256
413 f(work); 1257 /* CPU has come up in between, retry migration */
414 lock_map_release(&lockdep_map); 1258 cpu_relax();
415 lock_map_release(&cwq->wq->lockdep_map); 1259 }
416 1260}
417 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1261
418 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 1262/*
419 "%s/0x%08x/%d\n", 1263 * Function for worker->rebind_work used to rebind rogue busy workers
420 current->comm, preempt_count(), 1264 * to the associated cpu which is coming back online. This is
421 task_pid_nr(current)); 1265 * scheduled by cpu up but can race with other cpu hotplug operations
422 printk(KERN_ERR " last function: "); 1266 * and may be executed twice without intervening cpu down.
423 print_symbol("%s\n", (unsigned long)f); 1267 */
424 debug_show_held_locks(current); 1268static void worker_rebind_fn(struct work_struct *work)
425 dump_stack(); 1269{
1270 struct worker *worker = container_of(work, struct worker, rebind_work);
1271 struct global_cwq *gcwq = worker->gcwq;
1272
1273 if (worker_maybe_bind_and_lock(worker))
1274 worker_clr_flags(worker, WORKER_REBIND);
1275
1276 spin_unlock_irq(&gcwq->lock);
1277}
1278
1279static struct worker *alloc_worker(void)
1280{
1281 struct worker *worker;
1282
1283 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1284 if (worker) {
1285 INIT_LIST_HEAD(&worker->entry);
1286 INIT_LIST_HEAD(&worker->scheduled);
1287 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1288 /* on creation a worker is in !idle && prep state */
1289 worker->flags = WORKER_PREP;
1290 }
1291 return worker;
1292}
1293
1294/**
1295 * create_worker - create a new workqueue worker
1296 * @gcwq: gcwq the new worker will belong to
1297 * @bind: whether to set affinity to @cpu or not
1298 *
1299 * Create a new worker which is bound to @gcwq. The returned worker
1300 * can be started by calling start_worker() or destroyed using
1301 * destroy_worker().
1302 *
1303 * CONTEXT:
1304 * Might sleep. Does GFP_KERNEL allocations.
1305 *
1306 * RETURNS:
1307 * Pointer to the newly created worker.
1308 */
1309static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1310{
1311 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1312 struct worker *worker = NULL;
1313 int id = -1;
1314
1315 spin_lock_irq(&gcwq->lock);
1316 while (ida_get_new(&gcwq->worker_ida, &id)) {
1317 spin_unlock_irq(&gcwq->lock);
1318 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1319 goto fail;
1320 spin_lock_irq(&gcwq->lock);
1321 }
1322 spin_unlock_irq(&gcwq->lock);
1323
1324 worker = alloc_worker();
1325 if (!worker)
1326 goto fail;
1327
1328 worker->gcwq = gcwq;
1329 worker->id = id;
1330
1331 if (!on_unbound_cpu)
1332 worker->task = kthread_create(worker_thread, worker,
1333 "kworker/%u:%d", gcwq->cpu, id);
1334 else
1335 worker->task = kthread_create(worker_thread, worker,
1336 "kworker/u:%d", id);
1337 if (IS_ERR(worker->task))
1338 goto fail;
1339
1340 /*
1341 * A rogue worker will become a regular one if CPU comes
1342 * online later on. Make sure every worker has
1343 * PF_THREAD_BOUND set.
1344 */
1345 if (bind && !on_unbound_cpu)
1346 kthread_bind(worker->task, gcwq->cpu);
1347 else {
1348 worker->task->flags |= PF_THREAD_BOUND;
1349 if (on_unbound_cpu)
1350 worker->flags |= WORKER_UNBOUND;
1351 }
1352
1353 return worker;
1354fail:
1355 if (id >= 0) {
1356 spin_lock_irq(&gcwq->lock);
1357 ida_remove(&gcwq->worker_ida, id);
1358 spin_unlock_irq(&gcwq->lock);
1359 }
1360 kfree(worker);
1361 return NULL;
1362}
1363
1364/**
1365 * start_worker - start a newly created worker
1366 * @worker: worker to start
1367 *
1368 * Make the gcwq aware of @worker and start it.
1369 *
1370 * CONTEXT:
1371 * spin_lock_irq(gcwq->lock).
1372 */
1373static void start_worker(struct worker *worker)
1374{
1375 worker->flags |= WORKER_STARTED;
1376 worker->gcwq->nr_workers++;
1377 worker_enter_idle(worker);
1378 wake_up_process(worker->task);
1379}
1380
1381/**
1382 * destroy_worker - destroy a workqueue worker
1383 * @worker: worker to be destroyed
1384 *
1385 * Destroy @worker and adjust @gcwq stats accordingly.
1386 *
1387 * CONTEXT:
1388 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1389 */
1390static void destroy_worker(struct worker *worker)
1391{
1392 struct global_cwq *gcwq = worker->gcwq;
1393 int id = worker->id;
1394
1395 /* sanity check frenzy */
1396 BUG_ON(worker->current_work);
1397 BUG_ON(!list_empty(&worker->scheduled));
1398
1399 if (worker->flags & WORKER_STARTED)
1400 gcwq->nr_workers--;
1401 if (worker->flags & WORKER_IDLE)
1402 gcwq->nr_idle--;
1403
1404 list_del_init(&worker->entry);
1405 worker->flags |= WORKER_DIE;
1406
1407 spin_unlock_irq(&gcwq->lock);
1408
1409 kthread_stop(worker->task);
1410 kfree(worker);
1411
1412 spin_lock_irq(&gcwq->lock);
1413 ida_remove(&gcwq->worker_ida, id);
1414}
1415
1416static void idle_worker_timeout(unsigned long __gcwq)
1417{
1418 struct global_cwq *gcwq = (void *)__gcwq;
1419
1420 spin_lock_irq(&gcwq->lock);
1421
1422 if (too_many_workers(gcwq)) {
1423 struct worker *worker;
1424 unsigned long expires;
1425
1426 /* idle_list is kept in LIFO order, check the last one */
1427 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1428 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1429
1430 if (time_before(jiffies, expires))
1431 mod_timer(&gcwq->idle_timer, expires);
1432 else {
1433 /* it's been idle for too long, wake up manager */
1434 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1435 wake_up_worker(gcwq);
426 } 1436 }
1437 }
427 1438
428 spin_lock_irq(&cwq->lock); 1439 spin_unlock_irq(&gcwq->lock);
429 cwq->current_work = NULL; 1440}
1441
1442static bool send_mayday(struct work_struct *work)
1443{
1444 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1445 struct workqueue_struct *wq = cwq->wq;
1446 unsigned int cpu;
1447
1448 if (!(wq->flags & WQ_RESCUER))
1449 return false;
1450
1451 /* mayday mayday mayday */
1452 cpu = cwq->gcwq->cpu;
1453 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1454 if (cpu == WORK_CPU_UNBOUND)
1455 cpu = 0;
1456 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1457 wake_up_process(wq->rescuer->task);
1458 return true;
1459}
1460
1461static void gcwq_mayday_timeout(unsigned long __gcwq)
1462{
1463 struct global_cwq *gcwq = (void *)__gcwq;
1464 struct work_struct *work;
1465
1466 spin_lock_irq(&gcwq->lock);
1467
1468 if (need_to_create_worker(gcwq)) {
1469 /*
1470 * We've been trying to create a new worker but
1471 * haven't been successful. We might be hitting an
1472 * allocation deadlock. Send distress signals to
1473 * rescuers.
1474 */
1475 list_for_each_entry(work, &gcwq->worklist, entry)
1476 send_mayday(work);
430 } 1477 }
431 spin_unlock_irq(&cwq->lock); 1478
1479 spin_unlock_irq(&gcwq->lock);
1480
1481 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
432} 1482}
433 1483
434static int worker_thread(void *__cwq) 1484/**
1485 * maybe_create_worker - create a new worker if necessary
1486 * @gcwq: gcwq to create a new worker for
1487 *
1488 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1489 * have at least one idle worker on return from this function. If
1490 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1491 * sent to all rescuers with works scheduled on @gcwq to resolve
1492 * possible allocation deadlock.
1493 *
1494 * On return, need_to_create_worker() is guaranteed to be false and
1495 * may_start_working() true.
1496 *
1497 * LOCKING:
1498 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1499 * multiple times. Does GFP_KERNEL allocations. Called only from
1500 * manager.
1501 *
1502 * RETURNS:
1503 * false if no action was taken and gcwq->lock stayed locked, true
1504 * otherwise.
1505 */
1506static bool maybe_create_worker(struct global_cwq *gcwq)
1507__releases(&gcwq->lock)
1508__acquires(&gcwq->lock)
435{ 1509{
436 struct cpu_workqueue_struct *cwq = __cwq; 1510 if (!need_to_create_worker(gcwq))
437 DEFINE_WAIT(wait); 1511 return false;
1512restart:
1513 spin_unlock_irq(&gcwq->lock);
1514
1515 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1516 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1517
1518 while (true) {
1519 struct worker *worker;
1520
1521 worker = create_worker(gcwq, true);
1522 if (worker) {
1523 del_timer_sync(&gcwq->mayday_timer);
1524 spin_lock_irq(&gcwq->lock);
1525 start_worker(worker);
1526 BUG_ON(need_to_create_worker(gcwq));
1527 return true;
1528 }
1529
1530 if (!need_to_create_worker(gcwq))
1531 break;
438 1532
439 if (cwq->wq->freezeable) 1533 __set_current_state(TASK_INTERRUPTIBLE);
440 set_freezable(); 1534 schedule_timeout(CREATE_COOLDOWN);
441 1535
442 for (;;) { 1536 if (!need_to_create_worker(gcwq))
443 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 1537 break;
444 if (!freezing(current) && 1538 }
445 !kthread_should_stop() && 1539
446 list_empty(&cwq->worklist)) 1540 del_timer_sync(&gcwq->mayday_timer);
447 schedule(); 1541 spin_lock_irq(&gcwq->lock);
448 finish_wait(&cwq->more_work, &wait); 1542 if (need_to_create_worker(gcwq))
1543 goto restart;
1544 return true;
1545}
1546
1547/**
1548 * maybe_destroy_worker - destroy workers which have been idle for a while
1549 * @gcwq: gcwq to destroy workers for
1550 *
1551 * Destroy @gcwq workers which have been idle for longer than
1552 * IDLE_WORKER_TIMEOUT.
1553 *
1554 * LOCKING:
1555 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1556 * multiple times. Called only from manager.
1557 *
1558 * RETURNS:
1559 * false if no action was taken and gcwq->lock stayed locked, true
1560 * otherwise.
1561 */
1562static bool maybe_destroy_workers(struct global_cwq *gcwq)
1563{
1564 bool ret = false;
449 1565
450 try_to_freeze(); 1566 while (too_many_workers(gcwq)) {
1567 struct worker *worker;
1568 unsigned long expires;
451 1569
452 if (kthread_should_stop()) 1570 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1571 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1572
1573 if (time_before(jiffies, expires)) {
1574 mod_timer(&gcwq->idle_timer, expires);
453 break; 1575 break;
1576 }
454 1577
455 run_workqueue(cwq); 1578 destroy_worker(worker);
1579 ret = true;
456 } 1580 }
457 1581
458 return 0; 1582 return ret;
1583}
1584
1585/**
1586 * manage_workers - manage worker pool
1587 * @worker: self
1588 *
1589 * Assume the manager role and manage gcwq worker pool @worker belongs
1590 * to. At any given time, there can be only zero or one manager per
1591 * gcwq. The exclusion is handled automatically by this function.
1592 *
1593 * The caller can safely start processing works on false return. On
1594 * true return, it's guaranteed that need_to_create_worker() is false
1595 * and may_start_working() is true.
1596 *
1597 * CONTEXT:
1598 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1599 * multiple times. Does GFP_KERNEL allocations.
1600 *
1601 * RETURNS:
1602 * false if no action was taken and gcwq->lock stayed locked, true if
1603 * some action was taken.
1604 */
1605static bool manage_workers(struct worker *worker)
1606{
1607 struct global_cwq *gcwq = worker->gcwq;
1608 bool ret = false;
1609
1610 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1611 return ret;
1612
1613 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1614 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1615
1616 /*
1617 * Destroy and then create so that may_start_working() is true
1618 * on return.
1619 */
1620 ret |= maybe_destroy_workers(gcwq);
1621 ret |= maybe_create_worker(gcwq);
1622
1623 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1624
1625 /*
1626 * The trustee might be waiting to take over the manager
1627 * position, tell it we're done.
1628 */
1629 if (unlikely(gcwq->trustee))
1630 wake_up_all(&gcwq->trustee_wait);
1631
1632 return ret;
1633}
1634
1635/**
1636 * move_linked_works - move linked works to a list
1637 * @work: start of series of works to be scheduled
1638 * @head: target list to append @work to
1639 * @nextp: out parameter for nested worklist walking
1640 *
1641 * Schedule linked works starting from @work to @head. Work series to
1642 * be scheduled starts at @work and includes any consecutive work with
1643 * WORK_STRUCT_LINKED set in its predecessor.
1644 *
1645 * If @nextp is not NULL, it's updated to point to the next work of
1646 * the last scheduled work. This allows move_linked_works() to be
1647 * nested inside outer list_for_each_entry_safe().
1648 *
1649 * CONTEXT:
1650 * spin_lock_irq(gcwq->lock).
1651 */
1652static void move_linked_works(struct work_struct *work, struct list_head *head,
1653 struct work_struct **nextp)
1654{
1655 struct work_struct *n;
1656
1657 /*
1658 * Linked worklist will always end before the end of the list,
1659 * use NULL for list head.
1660 */
1661 list_for_each_entry_safe_from(work, n, NULL, entry) {
1662 list_move_tail(&work->entry, head);
1663 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1664 break;
1665 }
1666
1667 /*
1668 * If we're already inside safe list traversal and have moved
1669 * multiple works to the scheduled queue, the next position
1670 * needs to be updated.
1671 */
1672 if (nextp)
1673 *nextp = n;
1674}
1675
1676static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1677{
1678 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1679 struct work_struct, entry);
1680 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1681
1682 move_linked_works(work, pos, NULL);
1683 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1684 cwq->nr_active++;
1685}
1686
1687/**
1688 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1689 * @cwq: cwq of interest
1690 * @color: color of work which left the queue
1691 * @delayed: for a delayed work
1692 *
1693 * A work either has completed or is removed from pending queue,
1694 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1695 *
1696 * CONTEXT:
1697 * spin_lock_irq(gcwq->lock).
1698 */
1699static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1700 bool delayed)
1701{
1702 /* ignore uncolored works */
1703 if (color == WORK_NO_COLOR)
1704 return;
1705
1706 cwq->nr_in_flight[color]--;
1707
1708 if (!delayed) {
1709 cwq->nr_active--;
1710 if (!list_empty(&cwq->delayed_works)) {
1711 /* one down, submit a delayed one */
1712 if (cwq->nr_active < cwq->max_active)
1713 cwq_activate_first_delayed(cwq);
1714 }
1715 }
1716
1717 /* is flush in progress and are we at the flushing tip? */
1718 if (likely(cwq->flush_color != color))
1719 return;
1720
1721 /* are there still in-flight works? */
1722 if (cwq->nr_in_flight[color])
1723 return;
1724
1725 /* this cwq is done, clear flush_color */
1726 cwq->flush_color = -1;
1727
1728 /*
1729 * If this was the last cwq, wake up the first flusher. It
1730 * will handle the rest.
1731 */
1732 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1733 complete(&cwq->wq->first_flusher->done);
1734}
1735
1736/**
1737 * process_one_work - process single work
1738 * @worker: self
1739 * @work: work to process
1740 *
1741 * Process @work. This function contains all the logic necessary to
1742 * process a single work including synchronization against and
1743 * interaction with other workers on the same cpu, queueing and
1744 * flushing. As long as context requirement is met, any worker can
1745 * call this function to process a work.
1746 *
1747 * CONTEXT:
1748 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1749 */
1750static void process_one_work(struct worker *worker, struct work_struct *work)
1751__releases(&gcwq->lock)
1752__acquires(&gcwq->lock)
1753{
1754 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1755 struct global_cwq *gcwq = cwq->gcwq;
1756 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1757 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1758 work_func_t f = work->func;
1759 int work_color;
1760 struct worker *collision;
1761#ifdef CONFIG_LOCKDEP
1762 /*
1763 * It is permissible to free the struct work_struct from
1764 * inside the function that is called from it, this we need to
1765 * take into account for lockdep too. To avoid bogus "held
1766 * lock freed" warnings as well as problems when looking into
1767 * work->lockdep_map, make a copy and use that here.
1768 */
1769 struct lockdep_map lockdep_map = work->lockdep_map;
1770#endif
1771 /*
1772 * A single work shouldn't be executed concurrently by
1773 * multiple workers on a single cpu. Check whether anyone is
1774 * already processing the work. If so, defer the work to the
1775 * currently executing one.
1776 */
1777 collision = __find_worker_executing_work(gcwq, bwh, work);
1778 if (unlikely(collision)) {
1779 move_linked_works(work, &collision->scheduled, NULL);
1780 return;
1781 }
1782
1783 /* claim and process */
1784 debug_work_deactivate(work);
1785 hlist_add_head(&worker->hentry, bwh);
1786 worker->current_work = work;
1787 worker->current_cwq = cwq;
1788 work_color = get_work_color(work);
1789
1790 /* record the current cpu number in the work data and dequeue */
1791 set_work_cpu(work, gcwq->cpu);
1792 list_del_init(&work->entry);
1793
1794 /*
1795 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1796 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1797 */
1798 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1799 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1800 struct work_struct, entry);
1801
1802 if (!list_empty(&gcwq->worklist) &&
1803 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1804 wake_up_worker(gcwq);
1805 else
1806 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1807 }
1808
1809 /*
1810 * CPU intensive works don't participate in concurrency
1811 * management. They're the scheduler's responsibility.
1812 */
1813 if (unlikely(cpu_intensive))
1814 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1815
1816 spin_unlock_irq(&gcwq->lock);
1817
1818 work_clear_pending(work);
1819 lock_map_acquire(&cwq->wq->lockdep_map);
1820 lock_map_acquire(&lockdep_map);
1821 trace_workqueue_execute_start(work);
1822 f(work);
1823 /*
1824 * While we must be careful to not use "work" after this, the trace
1825 * point will only record its address.
1826 */
1827 trace_workqueue_execute_end(work);
1828 lock_map_release(&lockdep_map);
1829 lock_map_release(&cwq->wq->lockdep_map);
1830
1831 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1832 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1833 "%s/0x%08x/%d\n",
1834 current->comm, preempt_count(), task_pid_nr(current));
1835 printk(KERN_ERR " last function: ");
1836 print_symbol("%s\n", (unsigned long)f);
1837 debug_show_held_locks(current);
1838 dump_stack();
1839 }
1840
1841 spin_lock_irq(&gcwq->lock);
1842
1843 /* clear cpu intensive status */
1844 if (unlikely(cpu_intensive))
1845 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1846
1847 /* we're done with it, release */
1848 hlist_del_init(&worker->hentry);
1849 worker->current_work = NULL;
1850 worker->current_cwq = NULL;
1851 cwq_dec_nr_in_flight(cwq, work_color, false);
1852}
1853
1854/**
1855 * process_scheduled_works - process scheduled works
1856 * @worker: self
1857 *
1858 * Process all scheduled works. Please note that the scheduled list
1859 * may change while processing a work, so this function repeatedly
1860 * fetches a work from the top and executes it.
1861 *
1862 * CONTEXT:
1863 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1864 * multiple times.
1865 */
1866static void process_scheduled_works(struct worker *worker)
1867{
1868 while (!list_empty(&worker->scheduled)) {
1869 struct work_struct *work = list_first_entry(&worker->scheduled,
1870 struct work_struct, entry);
1871 process_one_work(worker, work);
1872 }
1873}
1874
1875/**
1876 * worker_thread - the worker thread function
1877 * @__worker: self
1878 *
1879 * The gcwq worker thread function. There's a single dynamic pool of
1880 * these per cpu. These workers process all works regardless of
1881 * their specific target workqueue. The only exception is works which
1882 * belong to workqueues with a rescuer which will be explained in
1883 * rescuer_thread().
1884 */
1885static int worker_thread(void *__worker)
1886{
1887 struct worker *worker = __worker;
1888 struct global_cwq *gcwq = worker->gcwq;
1889
1890 /* tell the scheduler that this is a workqueue worker */
1891 worker->task->flags |= PF_WQ_WORKER;
1892woke_up:
1893 spin_lock_irq(&gcwq->lock);
1894
1895 /* DIE can be set only while we're idle, checking here is enough */
1896 if (worker->flags & WORKER_DIE) {
1897 spin_unlock_irq(&gcwq->lock);
1898 worker->task->flags &= ~PF_WQ_WORKER;
1899 return 0;
1900 }
1901
1902 worker_leave_idle(worker);
1903recheck:
1904 /* no more worker necessary? */
1905 if (!need_more_worker(gcwq))
1906 goto sleep;
1907
1908 /* do we need to manage? */
1909 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1910 goto recheck;
1911
1912 /*
1913 * ->scheduled list can only be filled while a worker is
1914 * preparing to process a work or actually processing it.
1915 * Make sure nobody diddled with it while I was sleeping.
1916 */
1917 BUG_ON(!list_empty(&worker->scheduled));
1918
1919 /*
1920 * When control reaches this point, we're guaranteed to have
1921 * at least one idle worker or that someone else has already
1922 * assumed the manager role.
1923 */
1924 worker_clr_flags(worker, WORKER_PREP);
1925
1926 do {
1927 struct work_struct *work =
1928 list_first_entry(&gcwq->worklist,
1929 struct work_struct, entry);
1930
1931 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1932 /* optimization path, not strictly necessary */
1933 process_one_work(worker, work);
1934 if (unlikely(!list_empty(&worker->scheduled)))
1935 process_scheduled_works(worker);
1936 } else {
1937 move_linked_works(work, &worker->scheduled, NULL);
1938 process_scheduled_works(worker);
1939 }
1940 } while (keep_working(gcwq));
1941
1942 worker_set_flags(worker, WORKER_PREP, false);
1943sleep:
1944 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1945 goto recheck;
1946
1947 /*
1948 * gcwq->lock is held and there's no work to process and no
1949 * need to manage, sleep. Workers are woken up only while
1950 * holding gcwq->lock or from local cpu, so setting the
1951 * current state before releasing gcwq->lock is enough to
1952 * prevent losing any event.
1953 */
1954 worker_enter_idle(worker);
1955 __set_current_state(TASK_INTERRUPTIBLE);
1956 spin_unlock_irq(&gcwq->lock);
1957 schedule();
1958 goto woke_up;
1959}
1960
1961/**
1962 * rescuer_thread - the rescuer thread function
1963 * @__wq: the associated workqueue
1964 *
1965 * Workqueue rescuer thread function. There's one rescuer for each
1966 * workqueue which has WQ_RESCUER set.
1967 *
1968 * Regular work processing on a gcwq may block trying to create a new
1969 * worker, which uses a GFP_KERNEL allocation and thus has a slight chance of
1970 * developing into a deadlock if some works currently on the same queue
1971 * need to be processed to satisfy the GFP_KERNEL allocation. This is
1972 * the problem the rescuer solves.
1973 *
1974 * When such a condition is possible, the gcwq summons rescuers of all
1975 * workqueues which have works queued on the gcwq and lets them process
1976 * those works so that forward progress can be guaranteed.
1977 *
1978 * This should happen rarely.
1979 */
1980static int rescuer_thread(void *__wq)
1981{
1982 struct workqueue_struct *wq = __wq;
1983 struct worker *rescuer = wq->rescuer;
1984 struct list_head *scheduled = &rescuer->scheduled;
1985 bool is_unbound = wq->flags & WQ_UNBOUND;
1986 unsigned int cpu;
1987
1988 set_user_nice(current, RESCUER_NICE_LEVEL);
1989repeat:
1990 set_current_state(TASK_INTERRUPTIBLE);
1991
1992 if (kthread_should_stop())
1993 return 0;
1994
1995 /*
1996 * See whether any cpu is asking for help. Unbound
1997 * workqueues use cpu 0 in mayday_mask for WORK_CPU_UNBOUND.
1998 */
1999 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2000 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2001 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2002 struct global_cwq *gcwq = cwq->gcwq;
2003 struct work_struct *work, *n;
2004
2005 __set_current_state(TASK_RUNNING);
2006 mayday_clear_cpu(cpu, wq->mayday_mask);
2007
2008 /* migrate to the target cpu if possible */
2009 rescuer->gcwq = gcwq;
2010 worker_maybe_bind_and_lock(rescuer);
2011
2012 /*
2013 * Slurp in all works issued via this workqueue and
2014 * process'em.
2015 */
2016 BUG_ON(!list_empty(&rescuer->scheduled));
2017 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2018 if (get_work_cwq(work) == cwq)
2019 move_linked_works(work, scheduled, &n);
2020
2021 process_scheduled_works(rescuer);
2022 spin_unlock_irq(&gcwq->lock);
2023 }
2024
2025 schedule();
2026 goto repeat;
459} 2027}
460 2028
461struct wq_barrier { 2029struct wq_barrier {
@@ -469,44 +2037,137 @@ static void wq_barrier_func(struct work_struct *work)
469 complete(&barr->done); 2037 complete(&barr->done);
470} 2038}
471 2039
2040/**
2041 * insert_wq_barrier - insert a barrier work
2042 * @cwq: cwq to insert barrier into
2043 * @barr: wq_barrier to insert
2044 * @target: target work to attach @barr to
2045 * @worker: worker currently executing @target, NULL if @target is not executing
2046 *
2047 * @barr is linked to @target such that @barr is completed only after
2048 * @target finishes execution. Please note that the ordering
2049 * guarantee is observed only with respect to @target and on the local
2050 * cpu.
2051 *
2052 * Currently, a queued barrier can't be canceled. This is because
2053 * try_to_grab_pending() can't determine whether the work to be
2054 * grabbed is at the head of the queue and thus can't clear LINKED
2055 * flag of the previous work while there must be a valid next work
2056 * after a work with LINKED flag set.
2057 *
2058 * Note that when @worker is non-NULL, @target may be modified
2059 * underneath us, so we can't reliably determine cwq from @target.
2060 *
2061 * CONTEXT:
2062 * spin_lock_irq(gcwq->lock).
2063 */
472static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2064static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
473 struct wq_barrier *barr, struct list_head *head) 2065 struct wq_barrier *barr,
2066 struct work_struct *target, struct worker *worker)
474{ 2067{
2068 struct list_head *head;
2069 unsigned int linked = 0;
2070
475 /* 2071 /*
476 * debugobject calls are safe here even with cwq->lock locked 2072 * debugobject calls are safe here even with gcwq->lock locked
477 * as we know for sure that this will not trigger any of the 2073 * as we know for sure that this will not trigger any of the
478 * checks and call back into the fixup functions where we 2074 * checks and call back into the fixup functions where we
479 * might deadlock. 2075 * might deadlock.
480 */ 2076 */
481 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2077 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
482 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 2078 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
483
484 init_completion(&barr->done); 2079 init_completion(&barr->done);
485 2080
2081 /*
2082 * If @target is currently being executed, schedule the
2083 * barrier to the worker; otherwise, put it after @target.
2084 */
2085 if (worker)
2086 head = worker->scheduled.next;
2087 else {
2088 unsigned long *bits = work_data_bits(target);
2089
2090 head = target->entry.next;
2091 /* there can already be other linked works, inherit and set */
2092 linked = *bits & WORK_STRUCT_LINKED;
2093 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2094 }
2095
486 debug_work_activate(&barr->work); 2096 debug_work_activate(&barr->work);
487 insert_work(cwq, &barr->work, head); 2097 insert_work(cwq, &barr->work, head,
2098 work_color_to_flags(WORK_NO_COLOR) | linked);
488} 2099}
489 2100
490static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 2101/**
2102 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2103 * @wq: workqueue being flushed
2104 * @flush_color: new flush color, < 0 for no-op
2105 * @work_color: new work color, < 0 for no-op
2106 *
2107 * Prepare cwqs for workqueue flushing.
2108 *
2109 * If @flush_color is non-negative, flush_color on all cwqs should be
2110 * -1. If no cwq has in-flight commands at the specified color, all
2111 * cwq->flush_color's stay at -1 and %false is returned. If any cwq
2112 * has in-flight commands, its cwq->flush_color is set to
2113 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2114 * wakeup logic is armed and %true is returned.
2115 *
2116 * The caller should have initialized @wq->first_flusher prior to
2117 * calling this function with non-negative @flush_color. If
2118 * @flush_color is negative, no flush color update is done and %false
2119 * is returned.
2120 *
2121 * If @work_color is non-negative, all cwqs should have the same
2122 * work_color which is previous to @work_color and all will be
2123 * advanced to @work_color.
2124 *
2125 * CONTEXT:
2126 * mutex_lock(wq->flush_mutex).
2127 *
2128 * RETURNS:
2129 * %true if @flush_color >= 0 and there's something to flush. %false
2130 * otherwise.
2131 */
2132static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2133 int flush_color, int work_color)
491{ 2134{
492 int active = 0; 2135 bool wait = false;
493 struct wq_barrier barr; 2136 unsigned int cpu;
494
495 WARN_ON(cwq->thread == current);
496 2137
497 spin_lock_irq(&cwq->lock); 2138 if (flush_color >= 0) {
498 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 2139 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
499 insert_wq_barrier(cwq, &barr, &cwq->worklist); 2140 atomic_set(&wq->nr_cwqs_to_flush, 1);
500 active = 1;
501 } 2141 }
502 spin_unlock_irq(&cwq->lock);
503 2142
504 if (active) { 2143 for_each_cwq_cpu(cpu, wq) {
505 wait_for_completion(&barr.done); 2144 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
506 destroy_work_on_stack(&barr.work); 2145 struct global_cwq *gcwq = cwq->gcwq;
2146
2147 spin_lock_irq(&gcwq->lock);
2148
2149 if (flush_color >= 0) {
2150 BUG_ON(cwq->flush_color != -1);
2151
2152 if (cwq->nr_in_flight[flush_color]) {
2153 cwq->flush_color = flush_color;
2154 atomic_inc(&wq->nr_cwqs_to_flush);
2155 wait = true;
2156 }
2157 }
2158
2159 if (work_color >= 0) {
2160 BUG_ON(work_color != work_next_color(cwq->work_color));
2161 cwq->work_color = work_color;
2162 }
2163
2164 spin_unlock_irq(&gcwq->lock);
507 } 2165 }
508 2166
509 return active; 2167 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2168 complete(&wq->first_flusher->done);
2169
2170 return wait;
510} 2171}
511 2172
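The color machinery documented above is easiest to see with small numbers: each flush_workqueue() call claims the current work color, work queued afterwards gets the next color, and the ring wraps after a fixed number of colors with one value reserved for barriers (WORK_NO_COLOR). A toy user-space model (the constant is an assumption for illustration, not taken from the header):

    #include <assert.h>
    #include <stdio.h>

    #define NR_COLORS       15              /* assumed size of the color ring */

    static int next_color(int color)
    {
            return (color + 1) % NR_COLORS;
    }

    int main(void)
    {
            int flush_color = 0;    /* oldest color still being flushed */
            int work_color = 0;     /* color stamped on newly queued work */
            int flushers = 0;

            /* each new flusher claims work_color and advances it */
            while (next_color(work_color) != flush_color) {
                    work_color = next_color(work_color);
                    flushers++;
            }

            /* one more flusher would have to wait on flusher_overflow */
            printf("%d flushes can be in flight before overflow\n", flushers);
            assert(flushers == NR_COLORS - 1);
            return 0;
    }

This is the "color space is full" branch in flush_workqueue() below: once next_color(work_color) would collide with flush_color, new flushers park on wq->flusher_overflow until a color is retired.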
512/** 2173/**
@@ -518,20 +2179,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
518 * 2179 *
519 * We sleep until all works which were queued on entry have been handled, 2180 * We sleep until all works which were queued on entry have been handled,
520 * but we are not livelocked by new incoming ones. 2181 * but we are not livelocked by new incoming ones.
521 *
522 * This function used to run the workqueues itself. Now we just wait for the
523 * helper threads to do it.
524 */ 2182 */
525void flush_workqueue(struct workqueue_struct *wq) 2183void flush_workqueue(struct workqueue_struct *wq)
526{ 2184{
527 const struct cpumask *cpu_map = wq_cpu_map(wq); 2185 struct wq_flusher this_flusher = {
528 int cpu; 2186 .list = LIST_HEAD_INIT(this_flusher.list),
2187 .flush_color = -1,
2188 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2189 };
2190 int next_color;
529 2191
530 might_sleep();
531 lock_map_acquire(&wq->lockdep_map); 2192 lock_map_acquire(&wq->lockdep_map);
532 lock_map_release(&wq->lockdep_map); 2193 lock_map_release(&wq->lockdep_map);
533 for_each_cpu(cpu, cpu_map) 2194
534 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 2195 mutex_lock(&wq->flush_mutex);
2196
2197 /*
2198 * Start-to-wait phase
2199 */
2200 next_color = work_next_color(wq->work_color);
2201
2202 if (next_color != wq->flush_color) {
2203 /*
2204 * Color space is not full. The current work_color
2205 * becomes our flush_color and work_color is advanced
2206 * by one.
2207 */
2208 BUG_ON(!list_empty(&wq->flusher_overflow));
2209 this_flusher.flush_color = wq->work_color;
2210 wq->work_color = next_color;
2211
2212 if (!wq->first_flusher) {
2213 /* no flush in progress, become the first flusher */
2214 BUG_ON(wq->flush_color != this_flusher.flush_color);
2215
2216 wq->first_flusher = &this_flusher;
2217
2218 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2219 wq->work_color)) {
2220 /* nothing to flush, done */
2221 wq->flush_color = next_color;
2222 wq->first_flusher = NULL;
2223 goto out_unlock;
2224 }
2225 } else {
2226 /* wait in queue */
2227 BUG_ON(wq->flush_color == this_flusher.flush_color);
2228 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2229 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2230 }
2231 } else {
2232 /*
2233 * Oops, color space is full, wait on overflow queue.
2234 * The next flush completion will assign us
2235 * flush_color and transfer to flusher_queue.
2236 */
2237 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2238 }
2239
2240 mutex_unlock(&wq->flush_mutex);
2241
2242 wait_for_completion(&this_flusher.done);
2243
2244 /*
2245 * Wake-up-and-cascade phase
2246 *
2247 * First flushers are responsible for cascading flushes and
2248 * handling overflow. Non-first flushers can simply return.
2249 */
2250 if (wq->first_flusher != &this_flusher)
2251 return;
2252
2253 mutex_lock(&wq->flush_mutex);
2254
2255 /* we might have raced, check again with mutex held */
2256 if (wq->first_flusher != &this_flusher)
2257 goto out_unlock;
2258
2259 wq->first_flusher = NULL;
2260
2261 BUG_ON(!list_empty(&this_flusher.list));
2262 BUG_ON(wq->flush_color != this_flusher.flush_color);
2263
2264 while (true) {
2265 struct wq_flusher *next, *tmp;
2266
2267 /* complete all the flushers sharing the current flush color */
2268 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2269 if (next->flush_color != wq->flush_color)
2270 break;
2271 list_del_init(&next->list);
2272 complete(&next->done);
2273 }
2274
2275 BUG_ON(!list_empty(&wq->flusher_overflow) &&
2276 wq->flush_color != work_next_color(wq->work_color));
2277
2278 /* this flush_color is finished, advance by one */
2279 wq->flush_color = work_next_color(wq->flush_color);
2280
2281 /* one color has been freed, handle overflow queue */
2282 if (!list_empty(&wq->flusher_overflow)) {
2283 /*
2284 * Assign the same color to all overflowed
2285 * flushers, advance work_color and append to
2286 * flusher_queue. This is the start-to-wait
2287 * phase for these overflowed flushers.
2288 */
2289 list_for_each_entry(tmp, &wq->flusher_overflow, list)
2290 tmp->flush_color = wq->work_color;
2291
2292 wq->work_color = work_next_color(wq->work_color);
2293
2294 list_splice_tail_init(&wq->flusher_overflow,
2295 &wq->flusher_queue);
2296 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2297 }
2298
2299 if (list_empty(&wq->flusher_queue)) {
2300 BUG_ON(wq->flush_color != wq->work_color);
2301 break;
2302 }
2303
2304 /*
2305 * Need to flush more colors. Make the next flusher
2306 * the new first flusher and arm cwqs.
2307 */
2308 BUG_ON(wq->flush_color == wq->work_color);
2309 BUG_ON(wq->flush_color != next->flush_color);
2310
2311 list_del_init(&next->list);
2312 wq->first_flusher = next;
2313
2314 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2315 break;
2316
2317 /*
2318 * Meh... this color is already done, clear first
2319 * flusher and repeat cascading.
2320 */
2321 wq->first_flusher = NULL;
2322 }
2323
2324out_unlock:
2325 mutex_unlock(&wq->flush_mutex);
535} 2326}
536EXPORT_SYMBOL_GPL(flush_workqueue); 2327EXPORT_SYMBOL_GPL(flush_workqueue);
537 2328
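For callers the contract is unchanged: flush_workqueue() returns only after every work item queued before the call has finished, while items queued afterwards may still be pending. A hedged driver-style sketch (names are hypothetical):

    #include <linux/workqueue.h>

    static struct workqueue_struct *my_wq;  /* created elsewhere */

    static void my_work_fn(struct work_struct *work)
    {
            /* deferred processing */
    }
    static DECLARE_WORK(my_work, my_work_fn);

    static void my_quiesce(void)
    {
            queue_work(my_wq, &my_work);

            /*
             * Sleeps until my_work, and everything else queued on
             * my_wq before this point, has completed.
             */
            flush_workqueue(my_wq);
    }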
@@ -547,43 +2338,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
547 */ 2338 */
548int flush_work(struct work_struct *work) 2339int flush_work(struct work_struct *work)
549{ 2340{
2341 struct worker *worker = NULL;
2342 struct global_cwq *gcwq;
550 struct cpu_workqueue_struct *cwq; 2343 struct cpu_workqueue_struct *cwq;
551 struct list_head *prev;
552 struct wq_barrier barr; 2344 struct wq_barrier barr;
553 2345
554 might_sleep(); 2346 might_sleep();
555 cwq = get_wq_data(work); 2347 gcwq = get_work_gcwq(work);
556 if (!cwq) 2348 if (!gcwq)
557 return 0; 2349 return 0;
558 2350
559 lock_map_acquire(&cwq->wq->lockdep_map); 2351 spin_lock_irq(&gcwq->lock);
560 lock_map_release(&cwq->wq->lockdep_map);
561
562 prev = NULL;
563 spin_lock_irq(&cwq->lock);
564 if (!list_empty(&work->entry)) { 2352 if (!list_empty(&work->entry)) {
565 /* 2353 /*
566 * See the comment near try_to_grab_pending()->smp_rmb(). 2354 * See the comment near try_to_grab_pending()->smp_rmb().
567 * If it was re-queued under us we are not going to wait. 2355 * If it was re-queued to a different gcwq under us, we
2356 * are not going to wait.
568 */ 2357 */
569 smp_rmb(); 2358 smp_rmb();
570 if (unlikely(cwq != get_wq_data(work))) 2359 cwq = get_work_cwq(work);
571 goto out; 2360 if (unlikely(!cwq || gcwq != cwq->gcwq))
572 prev = &work->entry; 2361 goto already_gone;
573 } else { 2362 } else {
574 if (cwq->current_work != work) 2363 worker = find_worker_executing_work(gcwq, work);
575 goto out; 2364 if (!worker)
576 prev = &cwq->worklist; 2365 goto already_gone;
2366 cwq = worker->current_cwq;
577 } 2367 }
578 insert_wq_barrier(cwq, &barr, prev->next); 2368
579out: 2369 insert_wq_barrier(cwq, &barr, work, worker);
580 spin_unlock_irq(&cwq->lock); 2370 spin_unlock_irq(&gcwq->lock);
581 if (!prev) 2371
582 return 0; 2372 lock_map_acquire(&cwq->wq->lockdep_map);
2373 lock_map_release(&cwq->wq->lockdep_map);
583 2374
584 wait_for_completion(&barr.done); 2375 wait_for_completion(&barr.done);
585 destroy_work_on_stack(&barr.work); 2376 destroy_work_on_stack(&barr.work);
586 return 1; 2377 return 1;
2378already_gone:
2379 spin_unlock_irq(&gcwq->lock);
2380 return 0;
587} 2381}
588EXPORT_SYMBOL_GPL(flush_work); 2382EXPORT_SYMBOL_GPL(flush_work);
589 2383
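flush_work() keeps its old return convention: 1 when it actually had to insert a barrier and wait, 0 when the work item was neither queued nor executing. A hedged usage sketch (hypothetical names):

    #include <linux/kernel.h>
    #include <linux/workqueue.h>

    static void my_refresh_fn(struct work_struct *work)
    {
            /* rebuild some cached state */
    }
    static DECLARE_WORK(my_refresh, my_refresh_fn);

    static void wait_for_current_refresh(void)
    {
            /*
             * Unlike cancel_work_sync(), this only waits for the
             * currently queued/running instance; a re-queue that
             * happens after the barrier is inserted is not waited for.
             */
            if (!flush_work(&my_refresh))
                    pr_debug("refresh work was already idle\n");
    }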
@@ -593,54 +2387,56 @@ EXPORT_SYMBOL_GPL(flush_work);
593 */ 2387 */
594static int try_to_grab_pending(struct work_struct *work) 2388static int try_to_grab_pending(struct work_struct *work)
595{ 2389{
596 struct cpu_workqueue_struct *cwq; 2390 struct global_cwq *gcwq;
597 int ret = -1; 2391 int ret = -1;
598 2392
599 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 2393 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
600 return 0; 2394 return 0;
601 2395
602 /* 2396 /*
603 * The queueing is in progress, or it is already queued. Try to 2397 * The queueing is in progress, or it is already queued. Try to
604 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 2398 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
605 */ 2399 */
606 2400 gcwq = get_work_gcwq(work);
607 cwq = get_wq_data(work); 2401 if (!gcwq)
608 if (!cwq)
609 return ret; 2402 return ret;
610 2403
611 spin_lock_irq(&cwq->lock); 2404 spin_lock_irq(&gcwq->lock);
612 if (!list_empty(&work->entry)) { 2405 if (!list_empty(&work->entry)) {
613 /* 2406 /*
614 * This work is queued, but perhaps we locked the wrong cwq. 2407 * This work is queued, but perhaps we locked the wrong gcwq.
615 * In that case we must see the new value after rmb(), see 2408 * In that case we must see the new value after rmb(), see
616 * insert_work()->wmb(). 2409 * insert_work()->wmb().
617 */ 2410 */
618 smp_rmb(); 2411 smp_rmb();
619 if (cwq == get_wq_data(work)) { 2412 if (gcwq == get_work_gcwq(work)) {
620 debug_work_deactivate(work); 2413 debug_work_deactivate(work);
621 list_del_init(&work->entry); 2414 list_del_init(&work->entry);
2415 cwq_dec_nr_in_flight(get_work_cwq(work),
2416 get_work_color(work),
2417 *work_data_bits(work) & WORK_STRUCT_DELAYED);
622 ret = 1; 2418 ret = 1;
623 } 2419 }
624 } 2420 }
625 spin_unlock_irq(&cwq->lock); 2421 spin_unlock_irq(&gcwq->lock);
626 2422
627 return ret; 2423 return ret;
628} 2424}
629 2425
630static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, 2426static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
631 struct work_struct *work)
632{ 2427{
633 struct wq_barrier barr; 2428 struct wq_barrier barr;
634 int running = 0; 2429 struct worker *worker;
635 2430
636 spin_lock_irq(&cwq->lock); 2431 spin_lock_irq(&gcwq->lock);
637 if (unlikely(cwq->current_work == work)) {
638 insert_wq_barrier(cwq, &barr, cwq->worklist.next);
639 running = 1;
640 }
641 spin_unlock_irq(&cwq->lock);
642 2432
643 if (unlikely(running)) { 2433 worker = find_worker_executing_work(gcwq, work);
2434 if (unlikely(worker))
2435 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2436
2437 spin_unlock_irq(&gcwq->lock);
2438
2439 if (unlikely(worker)) {
644 wait_for_completion(&barr.done); 2440 wait_for_completion(&barr.done);
645 destroy_work_on_stack(&barr.work); 2441 destroy_work_on_stack(&barr.work);
646 } 2442 }
@@ -648,9 +2444,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
648 2444
649static void wait_on_work(struct work_struct *work) 2445static void wait_on_work(struct work_struct *work)
650{ 2446{
651 struct cpu_workqueue_struct *cwq;
652 struct workqueue_struct *wq;
653 const struct cpumask *cpu_map;
654 int cpu; 2447 int cpu;
655 2448
656 might_sleep(); 2449 might_sleep();
@@ -658,15 +2451,8 @@ static void wait_on_work(struct work_struct *work)
658 lock_map_acquire(&work->lockdep_map); 2451 lock_map_acquire(&work->lockdep_map);
659 lock_map_release(&work->lockdep_map); 2452 lock_map_release(&work->lockdep_map);
660 2453
661 cwq = get_wq_data(work); 2454 for_each_gcwq_cpu(cpu)
662 if (!cwq) 2455 wait_on_cpu_work(get_gcwq(cpu), work);
663 return;
664
665 wq = cwq->wq;
666 cpu_map = wq_cpu_map(wq);
667
668 for_each_cpu(cpu, cpu_map)
669 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
670} 2456}
671 2457
672static int __cancel_work_timer(struct work_struct *work, 2458static int __cancel_work_timer(struct work_struct *work,
@@ -681,7 +2467,7 @@ static int __cancel_work_timer(struct work_struct *work,
681 wait_on_work(work); 2467 wait_on_work(work);
682 } while (unlikely(ret < 0)); 2468 } while (unlikely(ret < 0));
683 2469
684 clear_wq_data(work); 2470 clear_work_data(work);
685 return ret; 2471 return ret;
686} 2472}
687 2473
@@ -727,8 +2513,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork)
727} 2513}
728EXPORT_SYMBOL(cancel_delayed_work_sync); 2514EXPORT_SYMBOL(cancel_delayed_work_sync);
729 2515
730static struct workqueue_struct *keventd_wq __read_mostly;
731
732/** 2516/**
733 * schedule_work - put work task in global workqueue 2517 * schedule_work - put work task in global workqueue
734 * @work: job to be done 2518 * @work: job to be done
@@ -742,7 +2526,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
742 */ 2526 */
743int schedule_work(struct work_struct *work) 2527int schedule_work(struct work_struct *work)
744{ 2528{
745 return queue_work(keventd_wq, work); 2529 return queue_work(system_wq, work);
746} 2530}
747EXPORT_SYMBOL(schedule_work); 2531EXPORT_SYMBOL(schedule_work);
748 2532
@@ -755,7 +2539,7 @@ EXPORT_SYMBOL(schedule_work);
755 */ 2539 */
756int schedule_work_on(int cpu, struct work_struct *work) 2540int schedule_work_on(int cpu, struct work_struct *work)
757{ 2541{
758 return queue_work_on(cpu, keventd_wq, work); 2542 return queue_work_on(cpu, system_wq, work);
759} 2543}
760EXPORT_SYMBOL(schedule_work_on); 2544EXPORT_SYMBOL(schedule_work_on);
761 2545
@@ -770,7 +2554,7 @@ EXPORT_SYMBOL(schedule_work_on);
770int schedule_delayed_work(struct delayed_work *dwork, 2554int schedule_delayed_work(struct delayed_work *dwork,
771 unsigned long delay) 2555 unsigned long delay)
772{ 2556{
773 return queue_delayed_work(keventd_wq, dwork, delay); 2557 return queue_delayed_work(system_wq, dwork, delay);
774} 2558}
775EXPORT_SYMBOL(schedule_delayed_work); 2559EXPORT_SYMBOL(schedule_delayed_work);
776 2560
@@ -783,9 +2567,8 @@ EXPORT_SYMBOL(schedule_delayed_work);
783void flush_delayed_work(struct delayed_work *dwork) 2567void flush_delayed_work(struct delayed_work *dwork)
784{ 2568{
785 if (del_timer_sync(&dwork->timer)) { 2569 if (del_timer_sync(&dwork->timer)) {
786 struct cpu_workqueue_struct *cwq; 2570 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); 2571 &dwork->work);
788 __queue_work(cwq, &dwork->work);
789 put_cpu(); 2572 put_cpu();
790 } 2573 }
791 flush_work(&dwork->work); 2574 flush_work(&dwork->work);
@@ -804,7 +2587,7 @@ EXPORT_SYMBOL(flush_delayed_work);
804int schedule_delayed_work_on(int cpu, 2587int schedule_delayed_work_on(int cpu,
805 struct delayed_work *dwork, unsigned long delay) 2588 struct delayed_work *dwork, unsigned long delay)
806{ 2589{
807 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 2590 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
808} 2591}
809EXPORT_SYMBOL(schedule_delayed_work_on); 2592EXPORT_SYMBOL(schedule_delayed_work_on);
810 2593
@@ -820,8 +2603,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
820int schedule_on_each_cpu(work_func_t func) 2603int schedule_on_each_cpu(work_func_t func)
821{ 2604{
822 int cpu; 2605 int cpu;
823 int orig = -1; 2606 struct work_struct __percpu *works;
824 struct work_struct *works;
825 2607
826 works = alloc_percpu(struct work_struct); 2608 works = alloc_percpu(struct work_struct);
827 if (!works) 2609 if (!works)
@@ -829,23 +2611,12 @@ int schedule_on_each_cpu(work_func_t func)
829 2611
830 get_online_cpus(); 2612 get_online_cpus();
831 2613
832 /*
833 * When running in keventd don't schedule a work item on
834 * itself. Can just call directly because the work queue is
835 * already bound. This also is faster.
836 */
837 if (current_is_keventd())
838 orig = raw_smp_processor_id();
839
840 for_each_online_cpu(cpu) { 2614 for_each_online_cpu(cpu) {
841 struct work_struct *work = per_cpu_ptr(works, cpu); 2615 struct work_struct *work = per_cpu_ptr(works, cpu);
842 2616
843 INIT_WORK(work, func); 2617 INIT_WORK(work, func);
844 if (cpu != orig) 2618 schedule_work_on(cpu, work);
845 schedule_work_on(cpu, work);
846 } 2619 }
847 if (orig >= 0)
848 func(per_cpu_ptr(works, orig));
849 2620
850 for_each_online_cpu(cpu) 2621 for_each_online_cpu(cpu)
851 flush_work(per_cpu_ptr(works, cpu)); 2622 flush_work(per_cpu_ptr(works, cpu));
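With the keventd special case removed, schedule_on_each_cpu() now simply queues one work item per online CPU and flushes them all before returning, regardless of which context the caller runs in. A hedged usage sketch:

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <linux/workqueue.h>

    /* runs once on every online CPU, in process context */
    static void my_percpu_resync(struct work_struct *dummy)
    {
            pr_debug("resyncing per-cpu state on cpu %d\n",
                     smp_processor_id());
    }

    static int my_resync_all_cpus(void)
    {
            /* 0 on success, -ENOMEM if the per-cpu works can't be allocated */
            return schedule_on_each_cpu(my_percpu_resync);
    }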
@@ -881,7 +2652,7 @@ int schedule_on_each_cpu(work_func_t func)
881 */ 2652 */
882void flush_scheduled_work(void) 2653void flush_scheduled_work(void)
883{ 2654{
884 flush_workqueue(keventd_wq); 2655 flush_workqueue(system_wq);
885} 2656}
886EXPORT_SYMBOL(flush_scheduled_work); 2657EXPORT_SYMBOL(flush_scheduled_work);
887 2658
@@ -913,170 +2684,169 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
913 2684
914int keventd_up(void) 2685int keventd_up(void)
915{ 2686{
916 return keventd_wq != NULL; 2687 return system_wq != NULL;
917} 2688}
918 2689
919int current_is_keventd(void) 2690static int alloc_cwqs(struct workqueue_struct *wq)
920{ 2691{
921 struct cpu_workqueue_struct *cwq; 2692 /*
922 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 2693 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
923 int ret = 0; 2694 * Make sure that the alignment isn't lower than that of
924 2695 * unsigned long long.
925 BUG_ON(!keventd_wq); 2696 */
2697 const size_t size = sizeof(struct cpu_workqueue_struct);
2698 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2699 __alignof__(unsigned long long));
2700#ifdef CONFIG_SMP
2701 bool percpu = !(wq->flags & WQ_UNBOUND);
2702#else
2703 bool percpu = false;
2704#endif
926 2705
927 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); 2706 if (percpu)
928 if (current == cwq->thread) 2707 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
929 ret = 1; 2708 else {
2709 void *ptr;
930 2710
931 return ret; 2711 /*
2712 * Allocate enough room to align cwq and put an extra
2713 * pointer at the end pointing back to the originally
2714 * allocated pointer which will be used for free.
2715 */
2716 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2717 if (ptr) {
2718 wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2719 *(void **)(wq->cpu_wq.single + 1) = ptr;
2720 }
2721 }
932 2722
2723 /* just in case, make sure it's actually aligned */
2724 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2725 return wq->cpu_wq.v ? 0 : -ENOMEM;
933} 2726}
934 2727
935static struct cpu_workqueue_struct * 2728static void free_cwqs(struct workqueue_struct *wq)
936init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
937{ 2729{
938 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 2730#ifdef CONFIG_SMP
939 2731 bool percpu = !(wq->flags & WQ_UNBOUND);
940 cwq->wq = wq; 2732#else
941 spin_lock_init(&cwq->lock); 2733 bool percpu = false;
942 INIT_LIST_HEAD(&cwq->worklist); 2734#endif
943 init_waitqueue_head(&cwq->more_work);
944 2735
945 return cwq; 2736 if (percpu)
2737 free_percpu(wq->cpu_wq.pcpu);
2738 else if (wq->cpu_wq.single) {
2739 /* the pointer to free is stored right after the cwq */
2740 kfree(*(void **)(wq->cpu_wq.single + 1));
2741 }
946} 2742}
947 2743
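alloc_cwqs() open-codes an aligned allocation for the unbound/!SMP case: over-allocate, round the pointer up to the required alignment, and stash the original pointer immediately behind the aligned object so free_cwqs() can retrieve it. The same trick in isolation (plain user-space C, power-of-two alignment assumed):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    /* Return SIZE bytes aligned to ALIGN (a power of two); the original
     * allocation pointer is stored right after the aligned block. */
    static void *alloc_aligned(size_t size, size_t align)
    {
            void *ptr = calloc(1, size + align + sizeof(void *));
            uintptr_t aligned;

            if (!ptr)
                    return NULL;
            aligned = ((uintptr_t)ptr + align - 1) & ~(uintptr_t)(align - 1);
            memcpy((void *)(aligned + size), &ptr, sizeof(ptr));
            return (void *)aligned;
    }

    static void free_aligned(void *aligned, size_t size)
    {
            void *orig;

            memcpy(&orig, (char *)aligned + size, sizeof(orig));
            free(orig);
    }

The kernel version does the rounding with PTR_ALIGN() and keeps the back pointer at cpu_wq.single + 1, which is the same offset as aligned + size here.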
948static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2744static int wq_clamp_max_active(int max_active, unsigned int flags,
2745 const char *name)
949{ 2746{
950 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 2747 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
951 struct workqueue_struct *wq = cwq->wq;
952 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
953 struct task_struct *p;
954
955 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
956 /*
957 * Nobody can add the work_struct to this cwq,
958 * if (caller is __create_workqueue)
959 * nobody should see this wq
960 * else // caller is CPU_UP_PREPARE
961 * cpu is not on cpu_online_map
962 * so we can abort safely.
963 */
964 if (IS_ERR(p))
965 return PTR_ERR(p);
966 if (cwq->wq->rt)
967 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
968 cwq->thread = p;
969 2748
970 trace_workqueue_creation(cwq->thread, cpu); 2749 if (max_active < 1 || max_active > lim)
2750 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
2751 "is out of range, clamping between %d and %d\n",
2752 max_active, name, 1, lim);
971 2753
972 return 0; 2754 return clamp_val(max_active, 1, lim);
973} 2755}
974 2756
975static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 2757struct workqueue_struct *__alloc_workqueue_key(const char *name,
2758 unsigned int flags,
2759 int max_active,
2760 struct lock_class_key *key,
2761 const char *lock_name)
976{ 2762{
977 struct task_struct *p = cwq->thread; 2763 struct workqueue_struct *wq;
2764 unsigned int cpu;
978 2765
979 if (p != NULL) { 2766 /*
980 if (cpu >= 0) 2767 * Unbound workqueues aren't concurrency managed and should be
981 kthread_bind(p, cpu); 2768 * dispatched to workers immediately.
982 wake_up_process(p); 2769 */
983 } 2770 if (flags & WQ_UNBOUND)
984} 2771 flags |= WQ_HIGHPRI;
985 2772
986struct workqueue_struct *__create_workqueue_key(const char *name, 2773 max_active = max_active ?: WQ_DFL_ACTIVE;
987 int singlethread, 2774 max_active = wq_clamp_max_active(max_active, flags, name);
988 int freezeable,
989 int rt,
990 struct lock_class_key *key,
991 const char *lock_name)
992{
993 struct workqueue_struct *wq;
994 struct cpu_workqueue_struct *cwq;
995 int err = 0, cpu;
996 2775
997 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 2776 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
998 if (!wq) 2777 if (!wq)
999 return NULL; 2778 goto err;
1000 2779
1001 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 2780 wq->flags = flags;
1002 if (!wq->cpu_wq) { 2781 wq->saved_max_active = max_active;
1003 kfree(wq); 2782 mutex_init(&wq->flush_mutex);
1004 return NULL; 2783 atomic_set(&wq->nr_cwqs_to_flush, 0);
1005 } 2784 INIT_LIST_HEAD(&wq->flusher_queue);
2785 INIT_LIST_HEAD(&wq->flusher_overflow);
1006 2786
1007 wq->name = name; 2787 wq->name = name;
1008 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2788 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
1009 wq->singlethread = singlethread;
1010 wq->freezeable = freezeable;
1011 wq->rt = rt;
1012 INIT_LIST_HEAD(&wq->list); 2789 INIT_LIST_HEAD(&wq->list);
1013 2790
1014 if (singlethread) { 2791 if (alloc_cwqs(wq) < 0)
1015 cwq = init_cpu_workqueue(wq, singlethread_cpu); 2792 goto err;
1016 err = create_workqueue_thread(cwq, singlethread_cpu); 2793
1017 start_workqueue_thread(cwq, -1); 2794 for_each_cwq_cpu(cpu, wq) {
1018 } else { 2795 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1019 cpu_maps_update_begin(); 2796 struct global_cwq *gcwq = get_gcwq(cpu);
1020 /* 2797
1021 * We must place this wq on list even if the code below fails. 2798 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
1022 * cpu_down(cpu) can remove cpu from cpu_populated_map before 2799 cwq->gcwq = gcwq;
1023 * destroy_workqueue() takes the lock, in that case we leak 2800 cwq->wq = wq;
1024 * cwq[cpu]->thread. 2801 cwq->flush_color = -1;
1025 */ 2802 cwq->max_active = max_active;
1026 spin_lock(&workqueue_lock); 2803 INIT_LIST_HEAD(&cwq->delayed_works);
1027 list_add(&wq->list, &workqueues);
1028 spin_unlock(&workqueue_lock);
1029 /*
1030 * We must initialize cwqs for each possible cpu even if we
1031 * are going to call destroy_workqueue() finally. Otherwise
1032 * cpu_up() can hit the uninitialized cwq once we drop the
1033 * lock.
1034 */
1035 for_each_possible_cpu(cpu) {
1036 cwq = init_cpu_workqueue(wq, cpu);
1037 if (err || !cpu_online(cpu))
1038 continue;
1039 err = create_workqueue_thread(cwq, cpu);
1040 start_workqueue_thread(cwq, cpu);
1041 }
1042 cpu_maps_update_done();
1043 } 2804 }
1044 2805
1045 if (err) { 2806 if (flags & WQ_RESCUER) {
1046 destroy_workqueue(wq); 2807 struct worker *rescuer;
1047 wq = NULL; 2808
2809 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2810 goto err;
2811
2812 wq->rescuer = rescuer = alloc_worker();
2813 if (!rescuer)
2814 goto err;
2815
2816 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2817 if (IS_ERR(rescuer->task))
2818 goto err;
2819
2820 rescuer->task->flags |= PF_THREAD_BOUND;
2821 wake_up_process(rescuer->task);
1048 } 2822 }
1049 return wq;
1050}
1051EXPORT_SYMBOL_GPL(__create_workqueue_key);
1052 2823
1053static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1054{
1055 /* 2824 /*
1056 * Our caller is either destroy_workqueue() or CPU_POST_DEAD, 2825 * workqueue_lock protects global freeze state and workqueues
1057 * cpu_add_remove_lock protects cwq->thread. 2826 * list. Grab it, set max_active accordingly and add the new
2827 * workqueue to workqueues list.
1058 */ 2828 */
1059 if (cwq->thread == NULL) 2829 spin_lock(&workqueue_lock);
1060 return;
1061 2830
1062 lock_map_acquire(&cwq->wq->lockdep_map); 2831 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
1063 lock_map_release(&cwq->wq->lockdep_map); 2832 for_each_cwq_cpu(cpu, wq)
2833 get_cwq(cpu, wq)->max_active = 0;
1064 2834
1065 flush_cpu_workqueue(cwq); 2835 list_add(&wq->list, &workqueues);
1066 /* 2836
1067 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, 2837 spin_unlock(&workqueue_lock);
1068 * a concurrent flush_workqueue() can insert a barrier after us. 2838
1069 * However, in that case run_workqueue() won't return and check 2839 return wq;
1070 * kthread_should_stop() until it flushes all work_struct's. 2840err:
1071 * When ->worklist becomes empty it is safe to exit because no 2841 if (wq) {
1072 * more work_structs can be queued on this cwq: flush_workqueue 2842 free_cwqs(wq);
1073 * checks list_empty(), and a "normal" queue_work() can't use 2843 free_mayday_mask(wq->mayday_mask);
1074 * a dead CPU. 2844 kfree(wq->rescuer);
1075 */ 2845 kfree(wq);
1076 trace_workqueue_destruction(cwq->thread); 2846 }
1077 kthread_stop(cwq->thread); 2847 return NULL;
1078 cwq->thread = NULL;
1079} 2848}
2849EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
1080 2850
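__alloc_workqueue_key() is normally reached through the alloc_workqueue() wrapper. A hedged example of the new interface, creating a freezeable workqueue with a rescuer and the default concurrency limit (max_active = 0 selects WQ_DFL_ACTIVE; names are hypothetical):

    #include <linux/errno.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *my_io_wq;

    static int my_driver_init(void)
    {
            /*
             * WQ_RESCUER guarantees forward progress under memory
             * pressure; WQ_FREEZEABLE makes the queue participate in
             * suspend/resume freezing.
             */
            my_io_wq = alloc_workqueue("my_io",
                                       WQ_RESCUER | WQ_FREEZEABLE, 0);
            if (!my_io_wq)
                    return -ENOMEM;
            return 0;
    }

    static void my_driver_exit(void)
    {
            destroy_workqueue(my_io_wq);
    }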
1081/** 2851/**
1082 * destroy_workqueue - safely terminate a workqueue 2852 * destroy_workqueue - safely terminate a workqueue
@@ -1086,72 +2856,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
1086 */ 2856 */
1087void destroy_workqueue(struct workqueue_struct *wq) 2857void destroy_workqueue(struct workqueue_struct *wq)
1088{ 2858{
1089 const struct cpumask *cpu_map = wq_cpu_map(wq); 2859 unsigned int cpu;
1090 int cpu; 2860
2861 wq->flags |= WQ_DYING;
2862 flush_workqueue(wq);
1091 2863
1092 cpu_maps_update_begin(); 2864 /*
2865 * wq list is used to freeze wq, remove from list after
2866 * flushing is complete in case freeze races us.
2867 */
1093 spin_lock(&workqueue_lock); 2868 spin_lock(&workqueue_lock);
1094 list_del(&wq->list); 2869 list_del(&wq->list);
1095 spin_unlock(&workqueue_lock); 2870 spin_unlock(&workqueue_lock);
1096 2871
1097 for_each_cpu(cpu, cpu_map) 2872 /* sanity check */
1098 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 2873 for_each_cwq_cpu(cpu, wq) {
1099 cpu_maps_update_done(); 2874 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2875 int i;
2876
2877 for (i = 0; i < WORK_NR_COLORS; i++)
2878 BUG_ON(cwq->nr_in_flight[i]);
2879 BUG_ON(cwq->nr_active);
2880 BUG_ON(!list_empty(&cwq->delayed_works));
2881 }
1100 2882
1101 free_percpu(wq->cpu_wq); 2883 if (wq->flags & WQ_RESCUER) {
2884 kthread_stop(wq->rescuer->task);
2885 free_mayday_mask(wq->mayday_mask);
2886 kfree(wq->rescuer);
2887 }
2888
2889 free_cwqs(wq);
1102 kfree(wq); 2890 kfree(wq);
1103} 2891}
1104EXPORT_SYMBOL_GPL(destroy_workqueue); 2892EXPORT_SYMBOL_GPL(destroy_workqueue);
1105 2893
2894/**
2895 * workqueue_set_max_active - adjust max_active of a workqueue
2896 * @wq: target workqueue
2897 * @max_active: new max_active value.
2898 *
2899 * Set max_active of @wq to @max_active.
2900 *
2901 * CONTEXT:
2902 * Don't call from IRQ context.
2903 */
2904void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2905{
2906 unsigned int cpu;
2907
2908 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
2909
2910 spin_lock(&workqueue_lock);
2911
2912 wq->saved_max_active = max_active;
2913
2914 for_each_cwq_cpu(cpu, wq) {
2915 struct global_cwq *gcwq = get_gcwq(cpu);
2916
2917 spin_lock_irq(&gcwq->lock);
2918
2919 if (!(wq->flags & WQ_FREEZEABLE) ||
2920 !(gcwq->flags & GCWQ_FREEZING))
2921 get_cwq(gcwq->cpu, wq)->max_active = max_active;
2922
2923 spin_unlock_irq(&gcwq->lock);
2924 }
2925
2926 spin_unlock(&workqueue_lock);
2927}
2928EXPORT_SYMBOL_GPL(workqueue_set_max_active);
2929
2930/**
2931 * workqueue_congested - test whether a workqueue is congested
2932 * @cpu: CPU in question
2933 * @wq: target workqueue
2934 *
2935 * Test whether @wq's cpu workqueue for @cpu is congested. There is
2936 * no synchronization around this function and the test result is
2937 * unreliable and only useful as advisory hints or for debugging.
2938 *
2939 * RETURNS:
2940 * %true if congested, %false otherwise.
2941 */
2942bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
2943{
2944 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2945
2946 return !list_empty(&cwq->delayed_works);
2947}
2948EXPORT_SYMBOL_GPL(workqueue_congested);
2949
2950/**
2951 * work_cpu - return the last known associated cpu for @work
2952 * @work: the work of interest
2953 *
2954 * RETURNS:
2955 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
2956 */
2957unsigned int work_cpu(struct work_struct *work)
2958{
2959 struct global_cwq *gcwq = get_work_gcwq(work);
2960
2961 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
2962}
2963EXPORT_SYMBOL_GPL(work_cpu);
2964
2965/**
2966 * work_busy - test whether a work is currently pending or running
2967 * @work: the work to be tested
2968 *
2969 * Test whether @work is currently pending or running. There is no
2970 * synchronization around this function and the test result is
2971 * unreliable and only useful as advisory hints or for debugging.
2972 * Especially for reentrant wqs, the pending state might hide the
2973 * running state.
2974 *
2975 * RETURNS:
2976 * OR'd bitmask of WORK_BUSY_* bits.
2977 */
2978unsigned int work_busy(struct work_struct *work)
2979{
2980 struct global_cwq *gcwq = get_work_gcwq(work);
2981 unsigned long flags;
2982 unsigned int ret = 0;
2983
2984 if (!gcwq)
2985 return false;
2986
2987 spin_lock_irqsave(&gcwq->lock, flags);
2988
2989 if (work_pending(work))
2990 ret |= WORK_BUSY_PENDING;
2991 if (find_worker_executing_work(gcwq, work))
2992 ret |= WORK_BUSY_RUNNING;
2993
2994 spin_unlock_irqrestore(&gcwq->lock, flags);
2995
2996 return ret;
2997}
2998EXPORT_SYMBOL_GPL(work_busy);
2999
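Both workqueue_congested() and work_busy() are advisory only; nothing keeps the state from changing the moment the lock is dropped, so they suit hints and debugging rather than decisions that must be exact. A hedged debugging helper:

    #include <linux/kernel.h>
    #include <linux/workqueue.h>

    static void my_dump_work_state(struct work_struct *work)
    {
            unsigned int busy = work_busy(work);

            /* snapshot only; may already be stale when it is printed */
            pr_debug("work %p:%s%s\n", work,
                     busy & WORK_BUSY_PENDING ? " pending" : "",
                     busy & WORK_BUSY_RUNNING ? " running" : "");
    }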
3000/*
3001 * CPU hotplug.
3002 *
3003 * There are two challenges in supporting CPU hotplug. Firstly, there
3004 * are a lot of assumptions on strong associations among work, cwq and
3005 * gcwq which make migrating pending and scheduled works very
3006 * difficult to implement without impacting hot paths. Secondly,
3007 * gcwqs serve a mix of short, long and very long running works, making
3008 * blocked draining impractical.
3009 *
3010 * This is solved by allowing a gcwq to be detached from CPU, running
3011 * it with unbound (rogue) workers and allowing it to be reattached
3012 * later if the cpu comes back online. A separate thread is created
3013 * to govern a gcwq in such state and is called the trustee of the
3014 * gcwq.
3015 *
3016 * Trustee states and their descriptions.
3017 *
3018 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3019 * new trustee is started with this state.
3020 *
3021 * IN_CHARGE Once started, trustee will enter this state after
3022 * assuming the manager role and making all existing
3023 * workers rogue. DOWN_PREPARE waits for trustee to
3024 * enter this state. After reaching IN_CHARGE, trustee
3025 * tries to execute the pending worklist until it's empty
3026 * and the state is set to BUTCHER, or the state is set
3027 * to RELEASE.
3028 *
3029 * BUTCHER Command state which is set by the cpu callback after
3030 * the cpu has gone down. Once this state is set, the trustee
3031 * knows that there will be no new works on the worklist
3032 * and once the worklist is empty it can proceed to
3033 * killing idle workers.
3034 *
3035 * RELEASE Command state which is set by the cpu callback if the
3036 * cpu down has been canceled or it has come online
3037 * again. After recognizing this state, trustee stops
3038 * trying to drain or butcher and clears ROGUE, rebinds
3039 * all remaining workers back to the cpu and releases
3040 * manager role.
3041 *
3042 * DONE Trustee will enter this state after BUTCHER or RELEASE
3043 * is complete.
3044 *
3045 * trustee CPU draining
3046 * took over down complete
3047 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3048 * | | ^
3049 * | CPU is back online v return workers |
3050 * ----------------> RELEASE --------------
3051 */
3052
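Read as code, the diagram is a small state machine driven from two sides: the hotplug callback issues commands (START, BUTCHER, RELEASE) and the trustee thread reports progress (IN_CHARGE, DONE). An illustrative model of the legal transitions (not the kernel's actual representation, which is just an int plus a wait queue):

    enum trustee_state { T_START, T_IN_CHARGE, T_BUTCHER, T_RELEASE, T_DONE };

    static int trustee_transition_ok(enum trustee_state from,
                                     enum trustee_state to)
    {
            switch (from) {
            case T_START:           /* trustee has claimed the gcwq */
                    return to == T_IN_CHARGE;
            case T_IN_CHARGE:       /* CPU went down, or the down was canceled */
                    return to == T_BUTCHER || to == T_RELEASE;
            case T_BUTCHER:         /* drained; the CPU may still come back */
                    return to == T_DONE || to == T_RELEASE;
            case T_RELEASE:         /* workers rebound, manager role dropped */
                    return to == T_DONE;
            default:
                    return 0;
            }
    }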
3053/**
3054 * trustee_wait_event_timeout - timed event wait for trustee
3055 * @cond: condition to wait for
3056 * @timeout: timeout in jiffies
3057 *
3058 * wait_event_timeout() for trustee to use. Handles locking and
3059 * checks for RELEASE request.
3060 *
3061 * CONTEXT:
3062 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3063 * multiple times. To be used by trustee.
3064 *
3065 * RETURNS:
3066 * Positive indicating left time if @cond is satisfied, 0 if timed
3067 * out, -1 if canceled.
3068 */
3069#define trustee_wait_event_timeout(cond, timeout) ({ \
3070 long __ret = (timeout); \
3071 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3072 __ret) { \
3073 spin_unlock_irq(&gcwq->lock); \
3074 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3075 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3076 __ret); \
3077 spin_lock_irq(&gcwq->lock); \
3078 } \
3079 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3080})
3081
3082/**
3083 * trustee_wait_event - event wait for trustee
3084 * @cond: condition to wait for
3085 *
3086 * wait_event() for trustee to use. Automatically handles locking and
3087 * checks for RELEASE request.
3088 *
3089 * CONTEXT:
3090 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3091 * multiple times. To be used by trustee.
3092 *
3093 * RETURNS:
3094 * 0 if @cond is satisfied, -1 if canceled.
3095 */
3096#define trustee_wait_event(cond) ({ \
3097 long __ret1; \
3098 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3099 __ret1 < 0 ? -1 : 0; \
3100})
3101
3102static int __cpuinit trustee_thread(void *__gcwq)
3103{
3104 struct global_cwq *gcwq = __gcwq;
3105 struct worker *worker;
3106 struct work_struct *work;
3107 struct hlist_node *pos;
3108 long rc;
3109 int i;
3110
3111 BUG_ON(gcwq->cpu != smp_processor_id());
3112
3113 spin_lock_irq(&gcwq->lock);
3114 /*
3115 * Claim the manager position and make all workers rogue.
3116 * Trustee must be bound to the target cpu and can't be
3117 * cancelled.
3118 */
3119 BUG_ON(gcwq->cpu != smp_processor_id());
3120 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3121 BUG_ON(rc < 0);
3122
3123 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3124
3125 list_for_each_entry(worker, &gcwq->idle_list, entry)
3126 worker->flags |= WORKER_ROGUE;
3127
3128 for_each_busy_worker(worker, i, pos, gcwq)
3129 worker->flags |= WORKER_ROGUE;
3130
3131 /*
3132 * Call schedule() so that we cross rq->lock and thus can
3133 * guarantee sched callbacks see the rogue flag. This is
3134 * necessary as scheduler callbacks may be invoked from other
3135 * cpus.
3136 */
3137 spin_unlock_irq(&gcwq->lock);
3138 schedule();
3139 spin_lock_irq(&gcwq->lock);
3140
3141 /*
3142 * Sched callbacks are disabled now. Zap nr_running. After
3143 * this, nr_running stays zero and need_more_worker() and
3144 * keep_working() are always true as long as the worklist is
3145 * not empty.
3146 */
3147 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3148
3149 spin_unlock_irq(&gcwq->lock);
3150 del_timer_sync(&gcwq->idle_timer);
3151 spin_lock_irq(&gcwq->lock);
3152
3153 /*
3154 * We're now in charge. Notify and proceed to drain. We need
3155 * to keep the gcwq running during the whole CPU down
3156 * procedure as other cpu hotunplug callbacks may need to
3157 * flush currently running tasks.
3158 */
3159 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3160 wake_up_all(&gcwq->trustee_wait);
3161
3162 /*
3163 * The original cpu is in the process of dying and may go away
3164 * anytime now. When that happens, we and all workers would
3165 * be migrated to other cpus. Try draining any left work. We
3166 * want to get it over with ASAP - spam rescuers, wake up as
3167 * many idlers as necessary and create new ones till the
3168 * worklist is empty. Note that if the gcwq is frozen, there
3169 * may be frozen works in freezeable cwqs. Don't declare
3170 * completion while frozen.
3171 */
3172 while (gcwq->nr_workers != gcwq->nr_idle ||
3173 gcwq->flags & GCWQ_FREEZING ||
3174 gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3175 int nr_works = 0;
3176
3177 list_for_each_entry(work, &gcwq->worklist, entry) {
3178 send_mayday(work);
3179 nr_works++;
3180 }
3181
3182 list_for_each_entry(worker, &gcwq->idle_list, entry) {
3183 if (!nr_works--)
3184 break;
3185 wake_up_process(worker->task);
3186 }
3187
3188 if (need_to_create_worker(gcwq)) {
3189 spin_unlock_irq(&gcwq->lock);
3190 worker = create_worker(gcwq, false);
3191 spin_lock_irq(&gcwq->lock);
3192 if (worker) {
3193 worker->flags |= WORKER_ROGUE;
3194 start_worker(worker);
3195 }
3196 }
3197
3198 /* give a breather */
3199 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3200 break;
3201 }
3202
3203 /*
3204 * Either all works have been scheduled and cpu is down, or
3205 * cpu down has already been canceled. Wait for and butcher
3206 * all workers till we're canceled.
3207 */
3208 do {
3209 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3210 while (!list_empty(&gcwq->idle_list))
3211 destroy_worker(list_first_entry(&gcwq->idle_list,
3212 struct worker, entry));
3213 } while (gcwq->nr_workers && rc >= 0);
3214
3215 /*
3216 * At this point, either draining has completed and no worker
3217 * is left, or cpu down has been canceled or the cpu is being
3218 * brought back up. There shouldn't be any idle one left.
3219 * Tell the remaining busy ones to rebind once it finishes the
3220 * currently scheduled works by scheduling the rebind_work.
3221 */
3222 WARN_ON(!list_empty(&gcwq->idle_list));
3223
3224 for_each_busy_worker(worker, i, pos, gcwq) {
3225 struct work_struct *rebind_work = &worker->rebind_work;
3226
3227 /*
3228 * Rebind_work may race with future cpu hotplug
3229 * operations. Use a separate flag to mark that
3230 * rebinding is scheduled.
3231 */
3232 worker->flags |= WORKER_REBIND;
3233 worker->flags &= ~WORKER_ROGUE;
3234
3235 /* queue rebind_work, wq doesn't matter, use the default one */
3236 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3237 work_data_bits(rebind_work)))
3238 continue;
3239
3240 debug_work_activate(rebind_work);
3241 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3242 worker->scheduled.next,
3243 work_color_to_flags(WORK_NO_COLOR));
3244 }
3245
3246 /* relinquish manager role */
3247 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3248
3249 /* notify completion */
3250 gcwq->trustee = NULL;
3251 gcwq->trustee_state = TRUSTEE_DONE;
3252 wake_up_all(&gcwq->trustee_wait);
3253 spin_unlock_irq(&gcwq->lock);
3254 return 0;
3255}
3256
3257/**
3258 * wait_trustee_state - wait for trustee to enter the specified state
3259 * @gcwq: gcwq the trustee of interest belongs to
3260 * @state: target state to wait for
3261 *
3262 * Wait for the trustee to reach @state. DONE is already matched.
3263 *
3264 * CONTEXT:
3265 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3266 * multiple times. To be used by cpu_callback.
3267 */
3268static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3269__releases(&gcwq->lock)
3270__acquires(&gcwq->lock)
3271{
3272 if (!(gcwq->trustee_state == state ||
3273 gcwq->trustee_state == TRUSTEE_DONE)) {
3274 spin_unlock_irq(&gcwq->lock);
3275 __wait_event(gcwq->trustee_wait,
3276 gcwq->trustee_state == state ||
3277 gcwq->trustee_state == TRUSTEE_DONE);
3278 spin_lock_irq(&gcwq->lock);
3279 }
3280}
3281
1106static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 3282static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1107 unsigned long action, 3283 unsigned long action,
1108 void *hcpu) 3284 void *hcpu)
1109{ 3285{
1110 unsigned int cpu = (unsigned long)hcpu; 3286 unsigned int cpu = (unsigned long)hcpu;
1111 struct cpu_workqueue_struct *cwq; 3287 struct global_cwq *gcwq = get_gcwq(cpu);
1112 struct workqueue_struct *wq; 3288 struct task_struct *new_trustee = NULL;
1113 int err = 0; 3289 struct worker *uninitialized_var(new_worker);
3290 unsigned long flags;
1114 3291
1115 action &= ~CPU_TASKS_FROZEN; 3292 action &= ~CPU_TASKS_FROZEN;
1116 3293
1117 switch (action) { 3294 switch (action) {
3295 case CPU_DOWN_PREPARE:
3296 new_trustee = kthread_create(trustee_thread, gcwq,
3297 "workqueue_trustee/%d\n", cpu);
3298 if (IS_ERR(new_trustee))
3299 return notifier_from_errno(PTR_ERR(new_trustee));
3300 kthread_bind(new_trustee, cpu);
3301 /* fall through */
1118 case CPU_UP_PREPARE: 3302 case CPU_UP_PREPARE:
1119 cpumask_set_cpu(cpu, cpu_populated_map); 3303 BUG_ON(gcwq->first_idle);
1120 } 3304 new_worker = create_worker(gcwq, false);
1121undo: 3305 if (!new_worker) {
1122 list_for_each_entry(wq, &workqueues, list) { 3306 if (new_trustee)
1123 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 3307 kthread_stop(new_trustee);
1124 3308 return NOTIFY_BAD;
1125 switch (action) {
1126 case CPU_UP_PREPARE:
1127 err = create_workqueue_thread(cwq, cpu);
1128 if (!err)
1129 break;
1130 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1131 wq->name, cpu);
1132 action = CPU_UP_CANCELED;
1133 err = -ENOMEM;
1134 goto undo;
1135
1136 case CPU_ONLINE:
1137 start_workqueue_thread(cwq, cpu);
1138 break;
1139
1140 case CPU_UP_CANCELED:
1141 start_workqueue_thread(cwq, -1);
1142 case CPU_POST_DEAD:
1143 cleanup_workqueue_thread(cwq);
1144 break;
1145 } 3309 }
1146 } 3310 }
1147 3311
3312 /* some are called w/ irq disabled, don't disturb irq status */
3313 spin_lock_irqsave(&gcwq->lock, flags);
3314
1148 switch (action) { 3315 switch (action) {
1149 case CPU_UP_CANCELED: 3316 case CPU_DOWN_PREPARE:
3317 /* initialize trustee and tell it to acquire the gcwq */
3318 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3319 gcwq->trustee = new_trustee;
3320 gcwq->trustee_state = TRUSTEE_START;
3321 wake_up_process(gcwq->trustee);
3322 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3323 /* fall through */
3324 case CPU_UP_PREPARE:
3325 BUG_ON(gcwq->first_idle);
3326 gcwq->first_idle = new_worker;
3327 break;
3328
3329 case CPU_DYING:
3330 /*
3331 * Before this, the trustee and all workers except for
3332 * the ones which are still executing works from
3333 * before the last CPU down must be on the cpu. After
3334 * this, they'll all be diasporas.
3335 */
3336 gcwq->flags |= GCWQ_DISASSOCIATED;
3337 break;
3338
1150 case CPU_POST_DEAD: 3339 case CPU_POST_DEAD:
1151 cpumask_clear_cpu(cpu, cpu_populated_map); 3340 gcwq->trustee_state = TRUSTEE_BUTCHER;
3341 /* fall through */
3342 case CPU_UP_CANCELED:
3343 destroy_worker(gcwq->first_idle);
3344 gcwq->first_idle = NULL;
3345 break;
3346
3347 case CPU_DOWN_FAILED:
3348 case CPU_ONLINE:
3349 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3350 if (gcwq->trustee_state != TRUSTEE_DONE) {
3351 gcwq->trustee_state = TRUSTEE_RELEASE;
3352 wake_up_process(gcwq->trustee);
3353 wait_trustee_state(gcwq, TRUSTEE_DONE);
3354 }
3355
3356 /*
3357 * Trustee is done and there might be no worker left.
3358 * Put the first_idle in and request a real manager to
3359 * take a look.
3360 */
3361 spin_unlock_irq(&gcwq->lock);
3362 kthread_bind(gcwq->first_idle->task, cpu);
3363 spin_lock_irq(&gcwq->lock);
3364 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3365 start_worker(gcwq->first_idle);
3366 gcwq->first_idle = NULL;
3367 break;
1152 } 3368 }
1153 3369
1154 return notifier_from_errno(err); 3370 spin_unlock_irqrestore(&gcwq->lock, flags);
3371
3372 return notifier_from_errno(0);
1155} 3373}
1156 3374
1157#ifdef CONFIG_SMP 3375#ifdef CONFIG_SMP
@@ -1201,14 +3419,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1201EXPORT_SYMBOL_GPL(work_on_cpu); 3419EXPORT_SYMBOL_GPL(work_on_cpu);
1202#endif /* CONFIG_SMP */ 3420#endif /* CONFIG_SMP */
1203 3421
1204void __init init_workqueues(void) 3422#ifdef CONFIG_FREEZER
3423
3424/**
3425 * freeze_workqueues_begin - begin freezing workqueues
3426 *
3427 * Start freezing workqueues. After this function returns, all
3428 * freezeable workqueues will queue new works to their frozen_works
3429 * list instead of gcwq->worklist.
3430 *
3431 * CONTEXT:
3432 * Grabs and releases workqueue_lock and gcwq->lock's.
3433 */
3434void freeze_workqueues_begin(void)
3435{
3436 unsigned int cpu;
3437
3438 spin_lock(&workqueue_lock);
3439
3440 BUG_ON(workqueue_freezing);
3441 workqueue_freezing = true;
3442
3443 for_each_gcwq_cpu(cpu) {
3444 struct global_cwq *gcwq = get_gcwq(cpu);
3445 struct workqueue_struct *wq;
3446
3447 spin_lock_irq(&gcwq->lock);
3448
3449 BUG_ON(gcwq->flags & GCWQ_FREEZING);
3450 gcwq->flags |= GCWQ_FREEZING;
3451
3452 list_for_each_entry(wq, &workqueues, list) {
3453 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3454
3455 if (cwq && wq->flags & WQ_FREEZEABLE)
3456 cwq->max_active = 0;
3457 }
3458
3459 spin_unlock_irq(&gcwq->lock);
3460 }
3461
3462 spin_unlock(&workqueue_lock);
3463}
3464
3465/**
3466 * freeze_workqueues_busy - are freezeable workqueues still busy?
3467 *
3468 * Check whether freezing is complete. This function must be called
3469 * between freeze_workqueues_begin() and thaw_workqueues().
3470 *
3471 * CONTEXT:
3472 * Grabs and releases workqueue_lock.
3473 *
3474 * RETURNS:
3475 * %true if some freezeable workqueues are still busy. %false if
3476 * freezing is complete.
3477 */
3478bool freeze_workqueues_busy(void)
3479{
3480 unsigned int cpu;
3481 bool busy = false;
3482
3483 spin_lock(&workqueue_lock);
3484
3485 BUG_ON(!workqueue_freezing);
3486
3487 for_each_gcwq_cpu(cpu) {
3488 struct workqueue_struct *wq;
3489 /*
3490 * nr_active is monotonically decreasing. It's safe
3491 * to peek without lock.
3492 */
3493 list_for_each_entry(wq, &workqueues, list) {
3494 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3495
3496 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3497 continue;
3498
3499 BUG_ON(cwq->nr_active < 0);
3500 if (cwq->nr_active) {
3501 busy = true;
3502 goto out_unlock;
3503 }
3504 }
3505 }
3506out_unlock:
3507 spin_unlock(&workqueue_lock);
3508 return busy;
3509}
3510
3511/**
3512 * thaw_workqueues - thaw workqueues
3513 *
3514 * Thaw workqueues. Normal queueing is restored and all collected
3515 * frozen works are transferred to their respective gcwq worklists.
3516 *
3517 * CONTEXT:
3518 * Grabs and releases workqueue_lock and gcwq->lock's.
3519 */
3520void thaw_workqueues(void)
1205{ 3521{
1206 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); 3522 unsigned int cpu;
3523
3524 spin_lock(&workqueue_lock);
3525
3526 if (!workqueue_freezing)
3527 goto out_unlock;
1207 3528
1208 cpumask_copy(cpu_populated_map, cpu_online_mask); 3529 for_each_gcwq_cpu(cpu) {
1209 singlethread_cpu = cpumask_first(cpu_possible_mask); 3530 struct global_cwq *gcwq = get_gcwq(cpu);
1210 cpu_singlethread_map = cpumask_of(singlethread_cpu); 3531 struct workqueue_struct *wq;
1211 hotcpu_notifier(workqueue_cpu_callback, 0); 3532
1212 keventd_wq = create_workqueue("events"); 3533 spin_lock_irq(&gcwq->lock);
1213 BUG_ON(!keventd_wq); 3534
3535 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3536 gcwq->flags &= ~GCWQ_FREEZING;
3537
3538 list_for_each_entry(wq, &workqueues, list) {
3539 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3540
3541 if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3542 continue;
3543
3544 /* restore max_active and repopulate worklist */
3545 cwq->max_active = wq->saved_max_active;
3546
3547 while (!list_empty(&cwq->delayed_works) &&
3548 cwq->nr_active < cwq->max_active)
3549 cwq_activate_first_delayed(cwq);
3550 }
3551
3552 wake_up_worker(gcwq);
3553
3554 spin_unlock_irq(&gcwq->lock);
3555 }
3556
3557 workqueue_freezing = false;
3558out_unlock:
3559 spin_unlock(&workqueue_lock);
3560}
3561#endif /* CONFIG_FREEZER */
3562
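The three freezer hooks form a begin/poll/thaw protocol that the suspend path is expected to drive (kernel/power/process.c is the intended caller); a hedged sketch of that sequence with a simplified retry loop:

    #include <linux/delay.h>
    #include <linux/errno.h>
    #include <linux/workqueue.h>

    static int my_freeze_workqueues(void)
    {
            int retries = 100;

            freeze_workqueues_begin();

            /* wait for freezeable cwqs to finish their active works */
            while (freeze_workqueues_busy()) {
                    if (!--retries) {
                            thaw_workqueues();      /* give up, restore queueing */
                            return -EBUSY;
                    }
                    msleep(10);
            }

            /* frozen; the resume path must call thaw_workqueues() later */
            return 0;
    }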
3563static int __init init_workqueues(void)
3564{
3565 unsigned int cpu;
3566 int i;
3567
3568 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3569
3570 /* initialize gcwqs */
3571 for_each_gcwq_cpu(cpu) {
3572 struct global_cwq *gcwq = get_gcwq(cpu);
3573
3574 spin_lock_init(&gcwq->lock);
3575 INIT_LIST_HEAD(&gcwq->worklist);
3576 gcwq->cpu = cpu;
3577 gcwq->flags |= GCWQ_DISASSOCIATED;
3578
3579 INIT_LIST_HEAD(&gcwq->idle_list);
3580 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3581 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3582
3583 init_timer_deferrable(&gcwq->idle_timer);
3584 gcwq->idle_timer.function = idle_worker_timeout;
3585 gcwq->idle_timer.data = (unsigned long)gcwq;
3586
3587 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3588 (unsigned long)gcwq);
3589
3590 ida_init(&gcwq->worker_ida);
3591
3592 gcwq->trustee_state = TRUSTEE_DONE;
3593 init_waitqueue_head(&gcwq->trustee_wait);
3594 }
3595
3596 /* create the initial worker */
3597 for_each_online_gcwq_cpu(cpu) {
3598 struct global_cwq *gcwq = get_gcwq(cpu);
3599 struct worker *worker;
3600
3601 if (cpu != WORK_CPU_UNBOUND)
3602 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3603 worker = create_worker(gcwq, true);
3604 BUG_ON(!worker);
3605 spin_lock_irq(&gcwq->lock);
3606 start_worker(worker);
3607 spin_unlock_irq(&gcwq->lock);
3608 }
3609
3610 system_wq = alloc_workqueue("events", 0, 0);
3611 system_long_wq = alloc_workqueue("events_long", 0, 0);
3612 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3613 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3614 WQ_UNBOUND_MAX_ACTIVE);
3615 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
3616 return 0;
1214} 3617}
3618early_initcall(init_workqueues);
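init_workqueues() now brings up a family of system-wide queues instead of a single keventd queue: schedule_work() and friends map onto system_wq, while the others exist for work with different latency or locality needs. A hedged example of choosing one explicitly:

    #include <linux/workqueue.h>

    static void my_slow_scan(struct work_struct *work)
    {
            /* something that may legitimately run for a long time */
    }
    static DECLARE_WORK(my_scan_work, my_slow_scan);

    static void kick_background_scan(void)
    {
            /*
             * Long-running items go to system_long_wq so they do not
             * delay the short-lived work that schedule_work() users
             * expect to complete quickly on system_wq.
             */
            queue_work(system_long_wq, &my_scan_work);
    }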
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);