author     Daniel Vetter <daniel.vetter@ffwll.ch>   2013-03-19 04:47:30 -0400
committer  Daniel Vetter <daniel.vetter@ffwll.ch>   2013-03-19 04:47:30 -0400
commit     0d4a42f6bd298e826620585e766a154ab460617a (patch)
tree       406d8f7778691d858dbe3e48e4bbb10e99c0a58a /kernel
parent     d62b4892f3d9f7dd2002e5309be10719d6805b0f (diff)
parent     a937536b868b8369b98967929045f1df54234323 (diff)
Merge tag 'v3.9-rc3' into drm-intel-next-queued
Backmerge so that I can merge Imre Deak's coalesced sg entries fixes, which
depend upon the new for_each_sg_page introduced in

    commit a321e91b6d73ed011ffceed384c40d2785cf723b
    Author: Imre Deak <imre.deak@intel.com>
    Date:   Wed Feb 27 17:02:56 2013 -0800

        lib/scatterlist: add simple page iterator

The merge itself is just two trivial conflicts:

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
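For orientation, a minimal sketch (not part of this merge) of how the page iterator referenced above is typically used to walk coalesced scatterlist entries page by page. The walk_sg_pages() wrapper is hypothetical, and the accessor for the current page is an assumption: in this era the iterator exposed the page directly, while later kernels provide sg_page_iter_page().

#include <linux/scatterlist.h>

/* Hypothetical helper: visit every page of a (possibly coalesced) sg table. */
static void walk_sg_pages(struct sg_table *st)
{
        struct sg_page_iter piter;

        for_each_sg_page(st->sgl, &piter, st->nents, 0) {
                /* assumed accessor, see lead-in note */
                struct page *page = sg_page_iter_page(&piter);

                /* ... operate on each page, even inside coalesced entries ... */
                (void)page;
        }
}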
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 44
-rw-r--r--  kernel/acct.c | 8
-rw-r--r--  kernel/async.c | 167
-rw-r--r--  kernel/cgroup.c | 325
-rw-r--r--  kernel/compat.c | 74
-rw-r--r--  kernel/context_tracking.c | 114
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 884
-rw-r--r--  kernel/debug/debug_core.c | 1
-rw-r--r--  kernel/debug/debug_core.h | 2
-rw-r--r--  kernel/debug/gdbstub.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 20
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 25
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 135
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 4
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/events/core.c | 23
-rw-r--r--  kernel/events/hw_breakpoint.c | 2
-rw-r--r--  kernel/events/uprobes.c | 466
-rw-r--r--  kernel/exit.c | 16
-rw-r--r--  kernel/fork.c | 20
-rw-r--r--  kernel/futex.c | 51
-rw-r--r--  kernel/futex_compat.c | 21
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 38
-rw-r--r--  kernel/irq/chip.c | 30
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq/spurious.c | 7
-rw-r--r--  kernel/irq_work.c | 150
-rw-r--r--  kernel/kexec.c | 78
-rw-r--r--  kernel/kfifo.c | 609
-rw-r--r--  kernel/kmod.c | 9
-rw-r--r--  kernel/kprobes.c | 66
-rw-r--r--  kernel/lockdep.c | 32
-rw-r--r--  kernel/module.c | 142
-rw-r--r--  kernel/mutex.c | 1
-rw-r--r--  kernel/nsproxy.c | 5
-rw-r--r--  kernel/panic.c | 34
-rw-r--r--  kernel/pid.c | 5
-rw-r--r--  kernel/posix-cpu-timers.c | 51
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/autosleep.c | 2
-rw-r--r--  kernel/power/main.c | 29
-rw-r--r--  kernel/power/process.c | 4
-rw-r--r--  kernel/power/qos.c | 9
-rw-r--r--  kernel/power/suspend.c | 69
-rw-r--r--  kernel/power/suspend_test.c | 11
-rw-r--r--  kernel/printk.c | 36
-rw-r--r--  kernel/profile.c | 24
-rw-r--r--  kernel/ptrace.c | 6
-rw-r--r--  kernel/rcu.h | 7
-rw-r--r--  kernel/rcupdate.c | 60
-rw-r--r--  kernel/rcutiny.c | 8
-rw-r--r--  kernel/rcutiny_plugin.h | 56
-rw-r--r--  kernel/rcutorture.c | 66
-rw-r--r--  kernel/rcutree.c | 260
-rw-r--r--  kernel/rcutree.h | 11
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 1
-rw-r--r--  kernel/rtmutex.c | 1
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/core.c | 213
-rw-r--r--  kernel/sched/cpupri.c | 2
-rw-r--r--  kernel/sched/cputime.c | 314
-rw-r--r--  kernel/sched/debug.c | 97
-rw-r--r--  kernel/sched/fair.c | 27
-rw-r--r--  kernel/sched/rt.c | 26
-rw-r--r--  kernel/sched/sched.h | 2
-rw-r--r--  kernel/sched/stats.c | 79
-rw-r--r--  kernel/signal.c | 361
-rw-r--r--  kernel/smp.c | 183
-rw-r--r--  kernel/smpboot.c | 7
-rw-r--r--  kernel/softirq.c | 44
-rw-r--r--  kernel/srcu.c | 37
-rw-r--r--  kernel/stop_machine.c | 156
-rw-r--r--  kernel/sys.c | 311
-rw-r--r--  kernel/sysctl.c | 27
-rw-r--r--  kernel/sysctl_binary.c | 43
-rw-r--r--  kernel/time.c | 12
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/ntp.c | 48
-rw-r--r--  kernel/time/tick-broadcast.c | 38
-rw-r--r--  kernel/time/tick-sched.c | 14
-rw-r--r--  kernel/time/timekeeping.c | 71
-rw-r--r--  kernel/timeconst.bc | 108
-rw-r--r--  kernel/timeconst.pl | 378
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 55
-rw-r--r--  kernel/trace/blktrace.c | 30
-rw-r--r--  kernel/trace/ftrace.c | 158
-rw-r--r--  kernel/trace/power-traces.c | 3
-rw-r--r--  kernel/trace/ring_buffer.c | 114
-rw-r--r--  kernel/trace/trace.c | 276
-rw-r--r--  kernel/trace/trace.h | 134
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 1
-rw-r--r--  kernel/trace/trace_functions.c | 61
-rw-r--r--  kernel/trace/trace_functions_graph.c | 68
-rw-r--r--  kernel/trace/trace_output.c | 3
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_selftest.c | 21
-rw-r--r--  kernel/trace/trace_syscalls.c | 61
-rw-r--r--  kernel/trace/trace_uprobe.c | 217
-rw-r--r--  kernel/tracepoint.c | 6
-rw-r--r--  kernel/tsacct.c | 44
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user.c | 7
-rw-r--r--  kernel/user_namespace.c | 66
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 3
-rw-r--r--  kernel/watchdog.c | 11
-rw-r--r--  kernel/workqueue.c | 1534
-rw-r--r--  kernel/workqueue_internal.h | 65
-rw-r--r--  kernel/workqueue_sched.h | 9
118 files changed, 5435 insertions, 4514 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da239..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
             sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
             signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
             rcupdate.o extable.o params.o posix-timers.o \
-            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+            kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
             hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
             notifier.o ksysfs.o cred.o \
             async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
 obj-y += sched/
 obj-y += power/
 
-ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
-obj-$(CONFIG_X86) += kcmp.o
-endif
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -127,11 +125,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 
 $(obj)/time.o: $(obj)/timeconst.h
 
-quiet_cmd_timeconst = TIMEC $@
-      cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+        $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
 targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
-        $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+        $(call if_changed,bc)
 
 ifeq ($(CONFIG_MODULE_SIG),y)
 #
@@ -153,23 +159,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates
 # fail and that the kernel may be used afterwards.
 #
 ###############################################################################
-sign_key_with_hash :=
-ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
-sign_key_with_hash := -sha1
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
-sign_key_with_hash := -sha224
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
-sign_key_with_hash := -sha256
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
-sign_key_with_hash := -sha384
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
-sign_key_with_hash := -sha512
-endif
-ifeq ($(sign_key_with_hash),)
+ifndef CONFIG_MODULE_SIG_HASH
 $(error Could not determine digest type to use from kernel config)
 endif
 
@@ -182,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
        @echo "### needs to be run as root, and uses a hardware random"
        @echo "### number generator if one is available."
        @echo "###"
-       openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
-               -x509 -config x509.genkey \
+       openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+               -batch -x509 -config x509.genkey \
                -outform DER -out signing_key.x509 \
                -keyout signing_key.priv
        @echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
        if (IS_ERR(file))
                return PTR_ERR(file);
 
-       if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+       if (!S_ISREG(file_inode(file)->i_mode)) {
                filp_close(file, NULL);
                return -EACCES;
        }
@@ -566,6 +566,7 @@ out:
 void acct_collect(long exitcode, int group_dead)
 {
        struct pacct_struct *pacct = &current->signal->pacct;
+       cputime_t utime, stime;
        unsigned long vsize = 0;
 
        if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
                pacct->ac_flag |= ACORE;
        if (current->flags & PF_SIGNALED)
                pacct->ac_flag |= AXSIG;
-       pacct->ac_utime += current->utime;
-       pacct->ac_stime += current->stime;
+       task_cputime(current, &utime, &stime);
+       pacct->ac_utime += utime;
+       pacct->ac_stime += stime;
        pacct->ac_minflt += current->min_flt;
        pacct->ac_majflt += current->maj_flt;
        spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 6f34904a0b53..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,65 +57,52 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 
+#include "workqueue_internal.h"
+
 static async_cookie_t next_cookie = 1;
 
 #define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
 
-static LIST_HEAD(async_pending);
-static ASYNC_DOMAIN(async_running);
-static LIST_HEAD(async_domains);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
 static DEFINE_SPINLOCK(async_lock);
-static DEFINE_MUTEX(async_register_mutex);
 
 struct async_entry {
-       struct list_head list;
+       struct list_head domain_list;
+       struct list_head global_list;
        struct work_struct work;
        async_cookie_t cookie;
        async_func_ptr *func;
        void *data;
-       struct async_domain *running;
+       struct async_domain *domain;
 };
 
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
 
 static atomic_t entry_count;
 
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct async_domain *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
-       async_cookie_t first_running = next_cookie; /* infinity value */
-       async_cookie_t first_pending = next_cookie; /* ditto */
-       struct async_entry *entry;
+       struct async_entry *first = NULL;
+       async_cookie_t ret = ASYNC_COOKIE_MAX;
+       unsigned long flags;
 
-       /*
-        * Both running and pending lists are sorted but not disjoint.
-        * Take the first cookies from both and return the min.
-        */
-       if (!list_empty(&running->domain)) {
-               entry = list_first_entry(&running->domain, typeof(*entry), list);
-               first_running = entry->cookie;
-       }
+       spin_lock_irqsave(&async_lock, flags);
 
-       list_for_each_entry(entry, &async_pending, list) {
-               if (entry->running == running) {
-                       first_pending = entry->cookie;
-                       break;
-               }
+       if (domain) {
+               if (!list_empty(&domain->pending))
+                       first = list_first_entry(&domain->pending,
+                                       struct async_entry, domain_list);
+       } else {
+               if (!list_empty(&async_global_pending))
+                       first = list_first_entry(&async_global_pending,
+                                       struct async_entry, global_list);
        }
 
-       return min(first_running, first_pending);
-}
-
-static async_cookie_t lowest_in_progress(struct async_domain *running)
-{
-       unsigned long flags;
-       async_cookie_t ret;
+       if (first)
+               ret = first->cookie;
 
-       spin_lock_irqsave(&async_lock, flags);
-       ret = __lowest_in_progress(running);
        spin_unlock_irqrestore(&async_lock, flags);
        return ret;
 }
@@ -127,20 +114,10 @@ static void async_run_entry_fn(struct work_struct *work)
 {
        struct async_entry *entry =
                container_of(work, struct async_entry, work);
-       struct async_entry *pos;
        unsigned long flags;
        ktime_t uninitialized_var(calltime), delta, rettime;
-       struct async_domain *running = entry->running;
 
-       /* 1) move self to the running queue, make sure it stays sorted */
-       spin_lock_irqsave(&async_lock, flags);
-       list_for_each_entry_reverse(pos, &running->domain, list)
-               if (entry->cookie < pos->cookie)
-                       break;
-       list_move_tail(&entry->list, &pos->list);
-       spin_unlock_irqrestore(&async_lock, flags);
-
-       /* 2) run (and print duration) */
+       /* 1) run (and print duration) */
        if (initcall_debug && system_state == SYSTEM_BOOTING) {
                printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
                        (long long)entry->cookie,
@@ -157,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
                        (long long)ktime_to_ns(delta) >> 10);
        }
 
-       /* 3) remove self from the running queue */
+       /* 2) remove self from the pending queues */
        spin_lock_irqsave(&async_lock, flags);
-       list_del(&entry->list);
-       if (running->registered && --running->count == 0)
-               list_del_init(&running->node);
+       list_del_init(&entry->domain_list);
+       list_del_init(&entry->global_list);
 
-       /* 4) free the entry */
+       /* 3) free the entry */
        kfree(entry);
        atomic_dec(&entry_count);
 
        spin_unlock_irqrestore(&async_lock, flags);
 
-       /* 5) wake up any waiters */
+       /* 4) wake up any waiters */
        wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
 {
        struct async_entry *entry;
        unsigned long flags;
@@ -196,16 +172,22 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
                ptr(data, newcookie);
                return newcookie;
        }
+       INIT_LIST_HEAD(&entry->domain_list);
+       INIT_LIST_HEAD(&entry->global_list);
        INIT_WORK(&entry->work, async_run_entry_fn);
        entry->func = ptr;
        entry->data = data;
-       entry->running = running;
+       entry->domain = domain;
 
        spin_lock_irqsave(&async_lock, flags);
+
+       /* allocate cookie and queue */
        newcookie = entry->cookie = next_cookie++;
-       list_add_tail(&entry->list, &async_pending);
-       if (running->registered && running->count++ == 0)
-               list_add_tail(&running->node, &async_domains);
+
+       list_add_tail(&entry->domain_list, &domain->pending);
+       if (domain->registered)
+               list_add_tail(&entry->global_list, &async_global_pending);
+
        atomic_inc(&entry_count);
        spin_unlock_irqrestore(&async_lock, flags);
 
@@ -228,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
  */
 async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
 {
-       return __async_schedule(ptr, data, &async_running);
+       return __async_schedule(ptr, data, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
@@ -236,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
  * @ptr: function to execute asynchronously
  * @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
  *
 * Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
  */
 async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-                                    struct async_domain *running)
+                                    struct async_domain *domain)
 {
-       return __async_schedule(ptr, data, running);
+       return __async_schedule(ptr, data, domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule_domain);
 
@@ -258,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
  */
 void async_synchronize_full(void)
 {
-       mutex_lock(&async_register_mutex);
-       do {
-               struct async_domain *domain = NULL;
-
-               spin_lock_irq(&async_lock);
-               if (!list_empty(&async_domains))
-                       domain = list_first_entry(&async_domains, typeof(*domain), node);
-               spin_unlock_irq(&async_lock);
-
-               async_synchronize_cookie_domain(next_cookie, domain);
-       } while (!list_empty(&async_domains));
-       mutex_unlock(&async_register_mutex);
+       async_synchronize_full_domain(NULL);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
@@ -284,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
  */
 void async_unregister_domain(struct async_domain *domain)
 {
-       mutex_lock(&async_register_mutex);
        spin_lock_irq(&async_lock);
-       WARN_ON(!domain->registered || !list_empty(&domain->node) ||
-               !list_empty(&domain->domain));
+       WARN_ON(!domain->registered || !list_empty(&domain->pending));
        domain->registered = 0;
        spin_unlock_irq(&async_lock);
-       mutex_unlock(&async_register_mutex);
 }
 EXPORT_SYMBOL_GPL(async_unregister_domain);
 
 /**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @domain: running list to synchronize on
+ * @domain: the domain to synchronize
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @domain have been done.
+ * synchronization domain specified by @domain have been done.
  */
 void async_synchronize_full_domain(struct async_domain *domain)
 {
-       async_synchronize_cookie_domain(next_cookie, domain);
+       async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
 /**
  * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
  * @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by running list @running submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
  */
-void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
 {
        ktime_t uninitialized_var(starttime), delta, endtime;
 
-       if (!running)
-               return;
-
        if (initcall_debug && system_state == SYSTEM_BOOTING) {
                printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
                starttime = ktime_get();
        }
 
-       wait_event(async_done, lowest_in_progress(running) >= cookie);
+       wait_event(async_done, lowest_in_progress(domain) >= cookie);
 
        if (initcall_debug && system_state == SYSTEM_BOOTING) {
                endtime = ktime_get();
@@ -350,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
  */
 void async_synchronize_cookie(async_cookie_t cookie)
 {
-       async_synchronize_cookie_domain(cookie, &async_running);
+       async_synchronize_cookie_domain(cookie, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+       struct worker *worker = current_wq_worker();
+
+       return worker && worker->current_func == async_run_entry_fn;
+}
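The async.c rework above keeps the caller-facing domain API intact while replacing the per-domain running lists with pending lists. A minimal sketch (not from the diff) of how a caller typically uses that API; the probe functions and the domain name are hypothetical:

#include <linux/async.h>

static ASYNC_DOMAIN(my_probe_domain);  /* hypothetical synchronization domain */

/* Runs in an async worker; current_is_async() would return true in here. */
static void my_probe_one(void *data, async_cookie_t cookie)
{
        /* ... probe one device asynchronously ... */
}

static void my_probe_all(void *devices, int count)
{
        int i;

        for (i = 0; i < count; i++)
                async_schedule_domain(my_probe_one, devices, &my_probe_domain);

        /* wait only for work queued in this domain, not for global async work */
        async_synchronize_full_domain(&my_probe_domain);
}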
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
 #include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
  * account cgroups in empty hierarchies.
  */
 #define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
        int i;
-       int index;
-       unsigned long tmp = 0UL;
+       unsigned long key = 0UL;
 
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-               tmp += (unsigned long)css[i];
-       tmp = (tmp >> 16) ^ tmp;
+               key += (unsigned long)css[i];
+       key = (key >> 16) ^ key;
 
-       index = hash_long(tmp, CSS_SET_HASH_BITS);
-
-       return &css_set_table[index];
+       return key;
 }
 
 /* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
        }
 
        /* This css_set is dead. unlink it and release cgroup refcounts */
-       hlist_del(&cg->hlist);
+       hash_del(&cg->hlist);
        css_set_count--;
 
        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
                struct cgroup *cgrp = link->cgrp;
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
+
+               /*
+                * We may not be holding cgroup_mutex, and if cgrp->count is
+                * dropped to 0 the cgroup can be destroyed at any time, hence
+                * rcu_read_lock is used to keep it alive.
+                */
+               rcu_read_lock();
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
+               rcu_read_unlock();
 
                kfree(link);
        }
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set(
 {
        int i;
        struct cgroupfs_root *root = cgrp->root;
-       struct hlist_head *hhead;
-       struct hlist_node *node;
        struct css_set *cg;
+       unsigned long key;
 
        /*
         * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set(
                }
        }
 
-       hhead = css_set_hash(template);
-       hlist_for_each_entry(cg, node, hhead, hlist) {
+       key = css_set_hash(template);
+       hash_for_each_possible(css_set_table, cg, hlist, key) {
                if (!compare_css_sets(cg, oldcg, cgrp, template))
                        continue;
 
@@ -657,8 +660,8 @@ static struct css_set *find_css_set(
 
        struct list_head tmp_cg_links;
 
-       struct hlist_head *hhead;
        struct cg_cgroup_link *link;
+       unsigned long key;
 
        /* First see if we already have a cgroup group that matches
        * the desired set */
@@ -704,8 +707,8 @@ static struct css_set *find_css_set(
        css_set_count++;
 
        /* Add this cgroup group to the hash table */
-       hhead = css_set_hash(res->subsys);
-       hlist_add_head(&res->hlist, hhead);
+       key = css_set_hash(res->subsys);
+       hash_add(css_set_table, &res->hlist, key);
 
        write_unlock(&css_set_lock);
 
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
        return inode;
 }
 
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
 {
-       /* is dentry a directory ? if so, kfree() associated cgroup */
-       if (S_ISDIR(inode->i_mode)) {
-               struct cgroup *cgrp = dentry->d_fsdata;
-               struct cgroup_subsys *ss;
-               BUG_ON(!(cgroup_is_removed(cgrp)));
-               /* It's possible for external users to be holding css
-                * reference counts on a cgroup; css_put() needs to
-                * be able to access the cgroup after decrementing
-                * the reference count in order to know if it needs to
-                * queue the cgroup to be handled by the release
-                * agent */
-               synchronize_rcu();
+       struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+       struct cgroup_subsys *ss;
 
        mutex_lock(&cgroup_mutex);
        /*
        * Release the subsystem state objects.
        */
       for_each_subsys(cgrp->root, ss)
               ss->css_free(cgrp);
 
       cgrp->root->number_of_cgroups--;
       mutex_unlock(&cgroup_mutex);
 
       /*
       * Drop the active superblock reference that we took when we
       * created the cgroup
       */
       deactivate_super(cgrp->root->sb);
 
       /*
       * if we're getting rid of the cgroup, refcount should ensure
       * that there are no pidlists left.
       */
       BUG_ON(!list_empty(&cgrp->pidlists));
+
+       simple_xattrs_free(&cgrp->xattrs);
+
+       ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+       kfree(cgrp);
+}
 
-       simple_xattrs_free(&cgrp->xattrs);
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+       struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+       schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+       /* is dentry a directory ? if so, kfree() associated cgroup */
+       if (S_ISDIR(inode->i_mode)) {
+               struct cgroup *cgrp = dentry->d_fsdata;
 
-               ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-               kfree_rcu(cgrp, rcu_head);
+               BUG_ON(!(cgroup_is_removed(cgrp)));
+               call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
        } else {
                struct cfent *cfe = __d_cfe(dentry);
                struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d)
        dput(parent);
 }
 
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
        struct cfent *cfe;
 
        lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
        lockdep_assert_held(&cgroup_mutex);
 
+       /*
+        * If we're doing cleanup due to failure of cgroup_create(),
+        * the corresponding @cfe may not exist.
+        */
        list_for_each_entry(cfe, &cgrp->files, node) {
                struct dentry *d = cfe->dentry;
 
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
                list_del_init(&cfe->node);
                dput(d);
 
-               return 0;
+               break;
        }
-       return -ENOENT;
 }
 
 /**
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                }
        }
        root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
-       synchronize_rcu();
 
        return 0;
 }
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->allcg_node);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
+       INIT_WORK(&cgrp->free_work, cgroup_free_fn);
        mutex_init(&cgrp->pidlist_mutex);
        INIT_LIST_HEAD(&cgrp->event_list);
        spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                struct cgroupfs_root *existing_root;
                const struct cred *cred;
                int i;
+               struct css_set *cg;
 
                BUG_ON(sb->s_root != NULL);
 
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                /* Link the top cgroup in this hierarchy into all
                * the css_set objects */
                write_lock(&css_set_lock);
-               for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-                       struct hlist_head *hhead = &css_set_table[i];
-                       struct hlist_node *node;
-                       struct css_set *cg;
-
-                       hlist_for_each_entry(cg, node, hhead, hlist)
-                               link_css_set(&tmp_cg_links, cg, root_cgrp);
-               }
+               hash_for_each(css_set_table, i, cg, hlist)
+                       link_css_set(&tmp_cg_links, cg, root_cgrp);
                write_unlock(&css_set_lock);
 
                free_cg_links(&tmp_cg_links);
@@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
                           "cgroup_path() called without proper locking");
 
-       if (!dentry || cgrp == dummytop) {
+       if (cgrp == dummytop) {
                /*
                * Inactive subsystems have no dentry for their root
                * cgroup
@@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                        ss->attach(cgrp, &tset);
        }
 
-       synchronize_rcu();
 out:
        if (retval) {
                for_each_subsys(root, ss) {
@@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
        /*
        * step 5: success! and cleanup
        */
-       synchronize_rcu();
        retval = 0;
 out_put_css_set_refs:
        if (retval) {
@@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
 */
 static inline struct cftype *__file_cft(struct file *file)
 {
-       if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+       if (file_inode(file)->i_fop != &cgroup_file_operations)
                return ERR_PTR(-EINVAL);
        return __d_cft(file->f_dentry);
 }
@@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
                        continue;
 
-               if (is_add)
+               if (is_add) {
                        err = cgroup_add_file(cgrp, subsys, cft);
-               else
-                       err = cgroup_rm_file(cgrp, cft);
-               if (err) {
-                       pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
-                                  is_add ? "add" : "remove", cft->name, err);
+                       if (err)
+                               pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+                                       cft->name, err);
                        ret = err;
+               } else {
+                       cgroup_rm_file(cgrp, cft);
                }
        }
        return ret;
@@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 }
 EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
 
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant,
+ * @pos is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+       struct cgroup *last, *tmp;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       do {
+               last = pos;
+               /* ->prev isn't RCU safe, walk ->next till the end */
+               pos = NULL;
+               list_for_each_entry_rcu(tmp, &last->children, sibling)
+                       pos = tmp;
+       } while (pos);
+
+       return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
 static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
 {
        struct cgroup *last;
@@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
                        remove);
        struct cgroup *cgrp = event->cgrp;
 
+       remove_wait_queue(event->wqh, &event->wait);
+
        event->cft->unregister_event(cgrp, event->cft, event->eventfd);
 
+       /* Notify userspace the event is going away. */
+       eventfd_signal(event->eventfd, 1);
+
        eventfd_ctx_put(event->eventfd);
        kfree(event);
        dput(cgrp->dentry);
@@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
        unsigned long flags = (unsigned long)key;
 
        if (flags & POLLHUP) {
-               __remove_wait_queue(event->wqh, &event->wait);
-               spin_lock(&cgrp->event_list_lock);
-               list_del_init(&event->list);
-               spin_unlock(&cgrp->event_list_lock);
                /*
-                * We are in atomic context, but cgroup_event_remove() may
-                * sleep, so we have to call it in workqueue.
+                * If the event has been detached at cgroup removal, we
+                * can simply return knowing the other side will cleanup
+                * for us.
+                *
+                * We can't race against event freeing since the other
+                * side will require wqh->lock via remove_wait_queue(),
+                * which we hold.
                */
-               schedule_work(&event->remove);
+               spin_lock(&cgrp->event_list_lock);
+               if (!list_empty(&event->list)) {
+                       list_del_init(&event->list);
+                       /*
+                        * We are in atomic context, but cgroup_event_remove()
+                        * may sleep, so we have to call it in workqueue.
+                        */
+                       schedule_work(&event->remove);
+               }
+               spin_unlock(&cgrp->event_list_lock);
        }
 
        return 0;
@@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                                      const char *buffer)
 {
        struct cgroup_event *event = NULL;
+       struct cgroup *cgrp_cfile;
        unsigned int efd, cfd;
        struct file *efile = NULL;
        struct file *cfile = NULL;
@@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 
        /* the process need read permission on control file */
        /* AV: shouldn't we check that it's been opened for read instead? */
-       ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
+       ret = inode_permission(file_inode(cfile), MAY_READ);
        if (ret < 0)
                goto fail;
 
@@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                goto fail;
        }
 
+       /*
+        * The file to be monitored must be in the same cgroup as
+        * cgroup.event_control is.
+        */
+       cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+       if (cgrp_cfile != cgrp) {
+               ret = -EINVAL;
+               goto fail;
+       }
+
        if (!event->cft->register_event || !event->cft->unregister_event) {
                ret = -EINVAL;
                goto fail;
@@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
        init_cgroup_housekeeping(cgrp);
 
+       dentry->d_fsdata = cgrp;
+       cgrp->dentry = dentry;
+
        cgrp->parent = parent;
        cgrp->root = parent->root;
        cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        lockdep_assert_held(&dentry->d_inode->i_mutex);
 
        /* allocation complete, commit to creation */
-       dentry->d_fsdata = cgrp;
-       cgrp->dentry = dentry;
        list_add_tail(&cgrp->allcg_node, &root->allcg_list);
        list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
        root->number_of_cgroups++;
@@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        /*
        * Unregister events and notify userspace.
        * Notify userspace about cgroup removing only after rmdir of cgroup
-       * directory to avoid race between userspace and kernelspace. Use
-       * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
-       * cgroup_event_wake() is called with the wait queue head locked,
-       * remove_wait_queue() cannot be called while holding event_list_lock.
+       * directory to avoid race between userspace and kernelspace.
        */
        spin_lock(&cgrp->event_list_lock);
-       list_splice_init(&cgrp->event_list, &tmp_list);
-       spin_unlock(&cgrp->event_list_lock);
-       list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+       list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
                list_del_init(&event->list);
-               remove_wait_queue(event->wqh, &event->wait);
-               eventfd_signal(event->eventfd, 1);
                schedule_work(&event->remove);
        }
+       spin_unlock(&cgrp->event_list_lock);
 
        return 0;
 }
@@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 {
        struct cgroup_subsys_state *css;
        int i, ret;
+       struct hlist_node *tmp;
+       struct css_set *cg;
+       unsigned long key;
 
        /* check name and function validity */
        if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
        * this is all done under the css_set_lock.
        */
        write_lock(&css_set_lock);
-       for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-               struct css_set *cg;
-               struct hlist_node *node, *tmp;
-               struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
-               hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
-                       /* skip entries that we already rehashed */
-                       if (cg->subsys[ss->subsys_id])
-                               continue;
-                       /* remove existing entry */
-                       hlist_del(&cg->hlist);
-                       /* set new value */
-                       cg->subsys[ss->subsys_id] = css;
-                       /* recompute hash and restore entry */
-                       new_bucket = css_set_hash(cg->subsys);
-                       hlist_add_head(&cg->hlist, new_bucket);
-               }
+       hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
+               /* skip entries that we already rehashed */
+               if (cg->subsys[ss->subsys_id])
+                       continue;
+               /* remove existing entry */
+               hash_del(&cg->hlist);
+               /* set new value */
+               cg->subsys[ss->subsys_id] = css;
+               /* recompute hash and restore entry */
+               key = css_set_hash(cg->subsys);
+               hash_add(css_set_table, &cg->hlist, key);
        }
        write_unlock(&css_set_lock);
 
@@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
        struct cg_cgroup_link *link;
-       struct hlist_head *hhead;
 
        BUG_ON(ss->module == NULL);
 
@@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
        offline_css(ss, dummytop);
        ss->active = 0;
 
-       if (ss->use_id) {
-               idr_remove_all(&ss->idr);
+       if (ss->use_id)
                idr_destroy(&ss->idr);
-       }
 
        /* deassign the subsys_id */
        subsys[ss->subsys_id] = NULL;
@@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
        write_lock(&css_set_lock);
        list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
                struct css_set *cg = link->cg;
+               unsigned long key;
 
-               hlist_del(&cg->hlist);
+               hash_del(&cg->hlist);
                cg->subsys[ss->subsys_id] = NULL;
-               hhead = css_set_hash(cg->subsys);
-               hlist_add_head(&cg->hlist, hhead);
+               key = css_set_hash(cg->subsys);
+               hash_add(css_set_table, &cg->hlist, key);
        }
        write_unlock(&css_set_lock);
 
@@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void)
        list_add(&init_css_set_link.cg_link_list,
                &init_css_set.cg_links);
 
-       for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
-               INIT_HLIST_HEAD(&css_set_table[i]);
-
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
 
@@ -4667,7 +4712,7 @@ int __init cgroup_init(void)
 {
        int err;
        int i;
-       struct hlist_head *hhead;
+       unsigned long key;
 
        err = bdi_init(&cgroup_backing_dev_info);
        if (err)
@@ -4686,8 +4731,8 @@ int __init cgroup_init(void)
        }
 
        /* Add init_css_set to the hash table */
-       hhead = css_set_hash(init_css_set.subsys);
-       hlist_add_head(&init_css_set.hlist, hhead);
+       key = css_set_hash(init_css_set.subsys);
+       hash_add(css_set_table, &init_css_set.hlist, key);
        BUG_ON(!init_root_id(&rootnode));
 
        cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        }
        task_unlock(tsk);
 
-       if (cg)
-               put_css_set_taskexit(cg);
+       put_css_set_taskexit(cg);
 }
 
 /**
@@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
 static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 {
        struct css_id *newid;
-       int myid, error, size;
+       int ret, size;
 
        BUG_ON(!ss->use_id);
 
@@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
        newid = kzalloc(size, GFP_KERNEL);
        if (!newid)
                return ERR_PTR(-ENOMEM);
-       /* get id */
-       if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
-               error = -ENOMEM;
-               goto err_out;
-       }
+
+       idr_preload(GFP_KERNEL);
        spin_lock(&ss->id_lock);
        /* Don't use 0. allocates an ID of 1-65535 */
-       error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+       ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
        spin_unlock(&ss->id_lock);
+       idr_preload_end();
 
        /* Returns error when there are no free spaces for new ID.*/
-       if (error) {
-               error = -ENOSPC;
+       if (ret < 0)
                goto err_out;
-       }
-       if (myid > CSS_ID_MAX)
-               goto remove_idr;
 
-       newid->id = myid;
+       newid->id = ret;
        newid->depth = depth;
        return newid;
-remove_idr:
-       error = -ENOSPC;
-       spin_lock(&ss->id_lock);
-       idr_remove(&ss->idr, myid);
-       spin_unlock(&ss->id_lock);
 err_out:
        kfree(newid);
-       return ERR_PTR(error);
+       return ERR_PTR(ret);
 
 }
 
@@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
        struct inode *inode;
        struct cgroup_subsys_state *css;
 
-       inode = f->f_dentry->d_inode;
+       inode = file_inode(f);
        /* check in cgroup filesystem dir */
        if (inode->i_op != &cgroup_dir_inode_operations)
                return ERR_PTR(-EBADF);
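Much of the cgroup.c churn above is a conversion from an open-coded hlist bucket array to the generic <linux/hashtable.h> helpers. A minimal sketch (not from the diff) of that API, using a hypothetical "struct item" so the shape of hash_add()/hash_for_each_possible()/hash_del() is easier to see:

#include <linux/hashtable.h>

struct item {
        unsigned long key;
        struct hlist_node node;
};

static DEFINE_HASHTABLE(item_table, 7);         /* 2^7 buckets */

static void item_add(struct item *it)
{
        /* hash_add() hashes the key into a bucket internally */
        hash_add(item_table, &it->node, it->key);
}

static struct item *item_find(unsigned long key)
{
        struct item *it;

        /* iterate only the bucket that 'key' hashes to */
        hash_for_each_possible(item_table, it, node, key)
                if (it->key == key)
                        return it;
        return NULL;
}

static void item_del(struct item *it)
{
        hash_del(&it->node);
}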
diff --git a/kernel/compat.c b/kernel/compat.c
index 36700e9e2be9..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); 290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
291} 291}
292 292
293asmlinkage long compat_sys_getitimer(int which, 293COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
294 struct compat_itimerval __user *it) 294 struct compat_itimerval __user *, it)
295{ 295{
296 struct itimerval kit; 296 struct itimerval kit;
297 int error; 297 int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
302 return error; 302 return error;
303} 303}
304 304
305asmlinkage long compat_sys_setitimer(int which, 305COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
306 struct compat_itimerval __user *in, 306 struct compat_itimerval __user *, in,
307 struct compat_itimerval __user *out) 307 struct compat_itimerval __user *, out)
308{ 308{
309 struct itimerval kin, kout; 309 struct itimerval kin, kout;
310 int error; 310 int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
381 memcpy(blocked->sig, &set, sizeof(set)); 381 memcpy(blocked->sig, &set, sizeof(set));
382} 382}
383 383
384asmlinkage long compat_sys_sigprocmask(int how, 384COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
385 compat_old_sigset_t __user *nset, 385 compat_old_sigset_t __user *, nset,
386 compat_old_sigset_t __user *oset) 386 compat_old_sigset_t __user *, oset)
387{ 387{
388 old_sigset_t old_set, new_set; 388 old_sigset_t old_set, new_set;
389 sigset_t new_blocked; 389 sigset_t new_blocked;
@@ -593,7 +593,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
593 else 593 else
594 ret = put_compat_rusage(&ru, uru); 594 ret = put_compat_rusage(&ru, uru);
595 if (ret) 595 if (ret)
596 return ret; 596 return -EFAULT;
597 } 597 }
598 598
599 BUG_ON(info.si_code & __SI_MASK); 599 BUG_ON(info.si_code & __SI_MASK);
@@ -971,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
971} 971}
972 972
973void 973void
974sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 974sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
975{ 975{
976 switch (_NSIG_WORDS) { 976 switch (_NSIG_WORDS) {
977 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 977 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -982,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
982} 982}
983EXPORT_SYMBOL_GPL(sigset_from_compat); 983EXPORT_SYMBOL_GPL(sigset_from_compat);
984 984
985asmlinkage long 985void
986compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 986sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
987 struct compat_siginfo __user *uinfo, 987{
988 struct compat_timespec __user *uts, compat_size_t sigsetsize) 988 switch (_NSIG_WORDS) {
989 case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
990 case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
991 case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
992 case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
993 }
994}
995
996COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
997 struct compat_siginfo __user *, uinfo,
998 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
989{ 999{
990 compat_sigset_t s32; 1000 compat_sigset_t s32;
991 sigset_t s; 1001 sigset_t s;
@@ -1013,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
1013 } 1023 }
1014 1024
1015 return ret; 1025 return ret;
1016
1017}
1018
1019asmlinkage long
1020compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
1021 struct compat_siginfo __user *uinfo)
1022{
1023 siginfo_t info;
1024
1025 if (copy_siginfo_from_user32(&info, uinfo))
1026 return -EFAULT;
1027 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
1028} 1026}
1029 1027
1030#ifdef __ARCH_WANT_COMPAT_SYS_TIME 1028#ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1067,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
1067 1065
1068#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ 1066#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
1069 1067
1070#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
1071asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
1072{
1073 sigset_t newset;
1074 compat_sigset_t newset32;
1075
1076 /* XXX: Don't preclude handling different sized sigset_t's. */
1077 if (sigsetsize != sizeof(sigset_t))
1078 return -EINVAL;
1079
1080 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1081 return -EFAULT;
1082 sigset_from_compat(&newset, &newset32);
1083 return sigsuspend(&newset);
1084}
1085#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1086
1087asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1068asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
1088{ 1069{
1089 struct timex txc; 1070 struct timex txc;
@@ -1222,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1222 return 0; 1203 return 0;
1223} 1204}
1224 1205
1225#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL 1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1226asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, 1207 compat_pid_t, pid,
1227 struct compat_timespec __user *interval) 1208 struct compat_timespec __user *, interval)
1228{ 1209{
1229 struct timespec t; 1210 struct timespec t;
1230 int ret; 1211 int ret;
@@ -1237,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1237 return -EFAULT; 1218 return -EFAULT;
1238 return ret; 1219 return ret;
1239} 1220}
1240#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1241 1221
1242/* 1222/*
1243 * Allocate user-space memory for the duration of a single system call, 1223 * Allocate user-space memory for the duration of a single system call,
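The compat.c hunks above replace the open-coded asmlinkage compat_sys_*() definitions with COMPAT_SYSCALL_DEFINEn() wrappers and add sigset_to_compat() as the inverse of sigset_from_compat(). The standalone sketch below (illustrative types and names, not the kernel's) shows the word-splitting both helpers perform on a 64-bit kernel: every 64-bit sigset word is stored as two 32-bit compat words, low half first.

#include <stdint.h>

#define NSIG_WORDS 1                    /* e.g. x86-64 uses one 64-bit word */

struct ksigset { uint64_t sig[NSIG_WORDS]; };
struct csigset { uint32_t sig[2 * NSIG_WORDS]; };

static void to_compat(struct csigset *c, const struct ksigset *k)
{
        for (int i = 0; i < NSIG_WORDS; i++) {
                c->sig[2 * i]     = (uint32_t)k->sig[i];         /* low half  */
                c->sig[2 * i + 1] = (uint32_t)(k->sig[i] >> 32); /* high half */
        }
}

static void from_compat(struct ksigset *k, const struct csigset *c)
{
        for (int i = 0; i < NSIG_WORDS; i++)
                k->sig[i] = c->sig[2 * i] |
                            ((uint64_t)c->sig[2 * i + 1] << 32);
}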
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e0e07fd55508..65349f07b878 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,29 +1,41 @@
1/*
2 * Context tracking: Probe on high level context boundaries such as kernel
3 * and userspace. This includes syscalls and exceptions entry/exit.
4 *
5 * This is used by RCU to remove its dependency on the timer tick while a CPU
6 * runs in userspace.
7 *
8 * Started by Frederic Weisbecker:
9 *
10 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
11 *
12 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
13 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
14 *
15 */
16
1#include <linux/context_tracking.h> 17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
2#include <linux/rcupdate.h> 19#include <linux/rcupdate.h>
3#include <linux/sched.h> 20#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/export.h>
6 23
7struct context_tracking { 24DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE 25#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true, 26 .active = true,
24#endif 27#endif
25}; 28};
26 29
30/**
31 * user_enter - Inform the context tracking that the CPU is going to
32 * enter userspace mode.
33 *
34 * This function must be called right before we switch from the kernel
35 * to userspace, when it's guaranteed the remaining kernel instructions
36 * to execute won't use any RCU read side critical section because this
37 * function sets RCU in extended quiescent state.
38 */
27void user_enter(void) 39void user_enter(void)
28{ 40{
29 unsigned long flags; 41 unsigned long flags;
@@ -39,40 +51,90 @@ void user_enter(void)
39 if (in_interrupt()) 51 if (in_interrupt())
40 return; 52 return;
41 53
54 /* Kernel threads aren't supposed to go to userspace */
42 WARN_ON_ONCE(!current->mm); 55 WARN_ON_ONCE(!current->mm);
43 56
44 local_irq_save(flags); 57 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) && 58 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) { 59 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER); 60 /*
61 * At this stage, only low level arch entry code remains and
62 * then we'll run in userspace. We can assume there won't be
63 * any RCU read-side critical section until the next call to
64 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
65 * on the tick.
66 */
67 vtime_user_enter(current);
48 rcu_user_enter(); 68 rcu_user_enter();
69 __this_cpu_write(context_tracking.state, IN_USER);
49 } 70 }
50 local_irq_restore(flags); 71 local_irq_restore(flags);
51} 72}
52 73
74
75/**
76 * user_exit - Inform the context tracking that the CPU is
77 * exiting userspace mode and entering the kernel.
78 *
 79 * This function must be called after we have entered the kernel from userspace,
 80 * before any use of an RCU read-side critical section. This potentially includes
 81 * any high-level kernel code such as syscalls, exceptions, signal handling, etc.
82 *
83 * This call supports re-entrancy. This way it can be called from any exception
84 * handler without needing to know if we came from userspace or not.
85 */
53void user_exit(void) 86void user_exit(void)
54{ 87{
55 unsigned long flags; 88 unsigned long flags;
56 89
57 /*
58 * Some contexts may involve an exception occuring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt()) 90 if (in_interrupt())
66 return; 91 return;
67 92
68 local_irq_save(flags); 93 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) { 94 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL); 95 /*
96 * We are going to run code that may use RCU. Inform
97 * RCU core about that (ie: we may need the tick again).
98 */
71 rcu_user_exit(); 99 rcu_user_exit();
100 vtime_user_exit(current);
101 __this_cpu_write(context_tracking.state, IN_KERNEL);
72 } 102 }
73 local_irq_restore(flags); 103 local_irq_restore(flags);
74} 104}
75 105
106void guest_enter(void)
107{
108 if (vtime_accounting_enabled())
109 vtime_guest_enter(current);
110 else
111 __guest_enter();
112}
113EXPORT_SYMBOL_GPL(guest_enter);
114
115void guest_exit(void)
116{
117 if (vtime_accounting_enabled())
118 vtime_guest_exit(current);
119 else
120 __guest_exit();
121}
122EXPORT_SYMBOL_GPL(guest_exit);
123
124
125/**
126 * context_tracking_task_switch - context switch the syscall callbacks
127 * @prev: the task that is being switched out
128 * @next: the task that is being switched in
129 *
130 * The context tracking uses the syscall slow path to implement its user-kernel
131 * boundaries probes on syscalls. This way it doesn't impact the syscall fast
132 * path on CPUs that don't do context tracking.
133 *
134 * But we need to clear the flag on the previous task because it may later
135 * migrate to some CPU that doesn't do the context tracking. As such the TIF
136 * flag may not be desired there.
137 */
76void context_tracking_task_switch(struct task_struct *prev, 138void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next) 139 struct task_struct *next)
78{ 140{
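The context_tracking.c changes above make the per-CPU tracking state globally visible, wire vtime accounting into the user/kernel transitions, and add the guest_enter()/guest_exit() helpers. A minimal single-CPU model of the IN_KERNEL/IN_USER state machine follows; it is only a sketch, with the RCU and vtime hooks reduced to stubs and the in_interrupt() re-entrancy guard noted in comments.

#include <stdbool.h>

enum ct_state { CT_IN_KERNEL = 0, CT_IN_USER };

struct ct { bool active; enum ct_state state; };

static void rcu_eqs_enter_stub(void) { /* stands in for rcu_user_enter() */ }
static void rcu_eqs_exit_stub(void)  { /* stands in for rcu_user_exit()  */ }

static void model_user_enter(struct ct *ct)
{
        /* the real code bails out when in_interrupt() and runs with IRQs off */
        if (ct->active && ct->state != CT_IN_USER) {
                rcu_eqs_enter_stub();           /* RCU stops needing the tick */
                ct->state = CT_IN_USER;
        }
}

static void model_user_exit(struct ct *ct)
{
        /* re-entrant: a second call while already IN_KERNEL is a no-op */
        if (ct->state == CT_IN_USER) {
                rcu_eqs_exit_stub();            /* kernel code may use RCU again */
                ct->state = CT_IN_KERNEL;
        }
}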
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242c..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
224static inline void check_for_tasks(int cpu) 224static inline void check_for_tasks(int cpu)
225{ 225{
226 struct task_struct *p; 226 struct task_struct *p;
227 cputime_t utime, stime;
227 228
228 write_lock_irq(&tasklist_lock); 229 write_lock_irq(&tasklist_lock);
229 for_each_process(p) { 230 for_each_process(p) {
231 task_cputime(p, &utime, &stime);
230 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 232 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
231 (p->utime || p->stime)) 233 (utime || stime))
232 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 234 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
233 "(state = %ld, flags = %x)\n", 235 "(state = %ld, flags = %x)\n",
234 p->comm, task_pid_nr(p), cpu, 236 p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
254 return err; 256 return err;
255 257
256 cpu_notify(CPU_DYING | param->mod, param->hcpu); 258 cpu_notify(CPU_DYING | param->mod, param->hcpu);
259 /* Park the stopper thread */
260 kthread_park(current);
257 return 0; 261 return 0;
258} 262}
259 263
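check_for_tasks() above switches from reading p->utime and p->stime directly to the task_cputime() accessor. The sketch below, with a hypothetical task type and a made-up pending delta field, illustrates why an accessor matters once accounting can be deferred: any not-yet-folded time is applied at read time rather than left invisible in stale raw fields.

#include <stdint.h>

struct demo_task {
        uint64_t utime;
        uint64_t stime;
        uint64_t pending_user_delta;    /* hypothetical unaccounted user time */
};

static void demo_task_cputime(const struct demo_task *t,
                              uint64_t *utime, uint64_t *stime)
{
        /* fold the pending delta in on demand; raw field reads would miss it */
        *utime = t->utime + t->pending_user_delta;
        *stime = t->stime;
}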
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
72 * Tracks how many cpusets are currently defined in system. 64 * Tracks how many cpusets are currently defined in system.
73 * When there is only one cpuset (the root cpuset) we can 65 * When there is only one cpuset (the root cpuset) we can
74 * short circuit some hooks. 66 * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
95 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
96 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
97 89
98 struct cpuset *parent; /* my parent */
99
100 struct fmeter fmeter; /* memory_pressure filter */ 90 struct fmeter fmeter; /* memory_pressure filter */
101 91
92 /*
93 * Tasks are being attached to this cpuset. Used to prevent
94 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
95 */
96 int attach_in_progress;
97
102 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
103 int pn; 99 int pn;
104 100
105 /* for custom sched domain */ 101 /* for custom sched domain */
106 int relax_domain_level; 102 int relax_domain_level;
107 103
108 /* used for walking a cpuset hierarchy */ 104 struct work_struct hotplug_work;
109 struct list_head stack_list;
110}; 105};
111 106
112/* Retrieve the cpuset for a cgroup */ 107/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 118 struct cpuset, css);
124} 119}
125 120
121static inline struct cpuset *parent_cs(const struct cpuset *cs)
122{
123 struct cgroup *pcgrp = cs->css.cgroup->parent;
124
125 if (pcgrp)
126 return cgroup_cs(pcgrp);
127 return NULL;
128}
129
126#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task) 131static inline bool task_has_mempolicy(struct task_struct *task)
128{ 132{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
138 142
139/* bits in struct cpuset flags field */ 143/* bits in struct cpuset flags field */
140typedef enum { 144typedef enum {
145 CS_ONLINE,
141 CS_CPU_EXCLUSIVE, 146 CS_CPU_EXCLUSIVE,
142 CS_MEM_EXCLUSIVE, 147 CS_MEM_EXCLUSIVE,
143 CS_MEM_HARDWALL, 148 CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 152 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 153} cpuset_flagbits_t;
149 154
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 155/* convenient tests for these bits */
156static inline bool is_cpuset_online(const struct cpuset *cs)
157{
158 return test_bit(CS_ONLINE, &cs->flags);
159}
160
157static inline int is_cpu_exclusive(const struct cpuset *cs) 161static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 162{
159 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 163 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
190} 194}
191 195
192static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
193 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
198 (1 << CS_MEM_EXCLUSIVE)),
194}; 199};
195 200
201/**
202 * cpuset_for_each_child - traverse online children of a cpuset
203 * @child_cs: loop cursor pointing to the current child
204 * @pos_cgrp: used for iteration
205 * @parent_cs: target cpuset to walk children of
206 *
207 * Walk @child_cs through the online children of @parent_cs. Must be used
208 * with RCU read locked.
209 */
210#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
211 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
212 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
213
214/**
215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
216 * @des_cs: loop cursor pointing to the current descendant
217 * @pos_cgrp: used for iteration
 218 * @root_cs: target cpuset whose descendants to walk
219 *
220 * Walk @des_cs through the online descendants of @root_cs. Must be used
221 * with RCU read locked. The caller may modify @pos_cgrp by calling
222 * cgroup_rightmost_descendant() to skip subtree.
223 */
224#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
225 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
226 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
227
196/* 228/*
197 * There are two global mutexes guarding cpuset structures. The first 229 * There are two global mutexes guarding cpuset structures - cpuset_mutex
198 * is the main control groups cgroup_mutex, accessed via 230 * and callback_mutex. The latter may nest inside the former. We also
199 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific 231 * require taking task_lock() when dereferencing a task's cpuset pointer.
200 * callback_mutex, below. They can nest. It is ok to first take 232 * See "The task_lock() exception", at the end of this comment.
201 * cgroup_mutex, then nest callback_mutex. We also require taking 233 *
202 * task_lock() when dereferencing a task's cpuset pointer. See "The 234 * A task must hold both mutexes to modify cpusets. If a task holds
203 * task_lock() exception", at the end of this comment. 235 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
204 * 236 * is the only task able to also acquire callback_mutex and be able to
205 * A task must hold both mutexes to modify cpusets. If a task 237 * modify cpusets. It can perform various checks on the cpuset structure
206 * holds cgroup_mutex, then it blocks others wanting that mutex, 238 * first, knowing nothing will change. It can also allocate memory while
207 * ensuring that it is the only task able to also acquire callback_mutex 239 * just holding cpuset_mutex. While it is performing these checks, various
208 * and be able to modify cpusets. It can perform various checks on 240 * callback routines can briefly acquire callback_mutex to query cpusets.
209 * the cpuset structure first, knowing nothing will change. It can 241 * Once it is ready to make the changes, it takes callback_mutex, blocking
210 * also allocate memory while just holding cgroup_mutex. While it is 242 * everyone else.
211 * performing these checks, various callback routines can briefly
212 * acquire callback_mutex to query cpusets. Once it is ready to make
213 * the changes, it takes callback_mutex, blocking everyone else.
214 * 243 *
215 * Calls to the kernel memory allocator can not be made while holding 244 * Calls to the kernel memory allocator can not be made while holding
216 * callback_mutex, as that would risk double tripping on callback_mutex 245 * callback_mutex, as that would risk double tripping on callback_mutex
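cpuset_for_each_child() and cpuset_for_each_descendant_pre() above are built by appending a filtering if to an underlying cgroup iterator, so only online cpusets reach the caller's loop body; callers wrap the whole loop in rcu_read_lock()/rcu_read_unlock(), as validate_change() does later in this patch. The same construction, reduced to a standalone toy (names are illustrative, nothing here is a kernel API), looks like this:

#include <stdio.h>

#define for_each_positive(val, arr, n, i)               \
        for ((i) = 0; (i) < (n); (i)++)                 \
                if (((val) = (arr)[(i)]) > 0)           /* filter step */

int main(void)
{
        int samples[] = { 3, -1, 7, 0, 2 };
        int v, i;

        for_each_positive(v, samples, 5, i)
                printf("%d\n", v);      /* prints 3, 7 and 2 */
        return 0;
}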
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
232 * guidelines for accessing subsystem state in kernel/cgroup.c 261 * guidelines for accessing subsystem state in kernel/cgroup.c
233 */ 262 */
234 263
264static DEFINE_MUTEX(cpuset_mutex);
235static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
236 266
237/* 267/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
246static DEFINE_SPINLOCK(cpuset_buffer_lock); 276static DEFINE_SPINLOCK(cpuset_buffer_lock);
247 277
248/* 278/*
279 * CPU / memory hotplug is handled asynchronously.
280 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq;
282
283static void cpuset_hotplug_workfn(struct work_struct *work);
284static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
285static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
286
287static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
288
289/*
249 * This is ugly, but preserves the userspace API for existing cpuset 290 * This is ugly, but preserves the userspace API for existing cpuset
250 * users. If someone tries to mount the "cpuset" filesystem, we 291 * users. If someone tries to mount the "cpuset" filesystem, we
251 * silently switch it to mount "cgroup" instead 292 * silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
289 struct cpumask *pmask) 330 struct cpumask *pmask)
290{ 331{
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 332 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
292 cs = cs->parent; 333 cs = parent_cs(cs);
293 if (cs) 334 if (cs)
294 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 335 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
295 else 336 else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 355{
315 while (cs && !nodes_intersects(cs->mems_allowed, 356 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_MEMORY])) 357 node_states[N_MEMORY]))
317 cs = cs->parent; 358 cs = parent_cs(cs);
318 if (cs) 359 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 360 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_MEMORY]); 361 node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
326/* 367/*
327 * update task's spread flag if cpuset's page/slab spread flag is set 368 * update task's spread flag if cpuset's page/slab spread flag is set
328 * 369 *
329 * Called with callback_mutex/cgroup_mutex held 370 * Called with callback_mutex/cpuset_mutex held
330 */ 371 */
331static void cpuset_update_task_spread_flag(struct cpuset *cs, 372static void cpuset_update_task_spread_flag(struct cpuset *cs,
332 struct task_struct *tsk) 373 struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
346 * 387 *
347 * One cpuset is a subset of another if all its allowed CPUs and 388 * One cpuset is a subset of another if all its allowed CPUs and
348 * Memory Nodes are a subset of the other, and its exclusive flags 389 * Memory Nodes are a subset of the other, and its exclusive flags
349 * are only set if the other's are set. Call holding cgroup_mutex. 390 * are only set if the other's are set. Call holding cpuset_mutex.
350 */ 391 */
351 392
352static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 393static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
395 * If we replaced the flag and mask values of the current cpuset 436 * If we replaced the flag and mask values of the current cpuset
396 * (cur) with those values in the trial cpuset (trial), would 437 * (cur) with those values in the trial cpuset (trial), would
397 * our various subset and exclusive rules still be valid? Presumes 438 * our various subset and exclusive rules still be valid? Presumes
398 * cgroup_mutex held. 439 * cpuset_mutex held.
399 * 440 *
400 * 'cur' is the address of an actual, in-use cpuset. Operations 441 * 'cur' is the address of an actual, in-use cpuset. Operations
401 * such as list traversal that depend on the actual address of the 442 * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
412{ 453{
413 struct cgroup *cont; 454 struct cgroup *cont;
414 struct cpuset *c, *par; 455 struct cpuset *c, *par;
456 int ret;
457
458 rcu_read_lock();
415 459
416 /* Each of our child cpusets must be a subset of us */ 460 /* Each of our child cpusets must be a subset of us */
417 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 461 ret = -EBUSY;
418 if (!is_cpuset_subset(cgroup_cs(cont), trial)) 462 cpuset_for_each_child(c, cont, cur)
419 return -EBUSY; 463 if (!is_cpuset_subset(c, trial))
420 } 464 goto out;
421 465
422 /* Remaining checks don't apply to root cpuset */ 466 /* Remaining checks don't apply to root cpuset */
467 ret = 0;
423 if (cur == &top_cpuset) 468 if (cur == &top_cpuset)
424 return 0; 469 goto out;
425 470
426 par = cur->parent; 471 par = parent_cs(cur);
427 472
428 /* We must be a subset of our parent cpuset */ 473 /* We must be a subset of our parent cpuset */
474 ret = -EACCES;
429 if (!is_cpuset_subset(trial, par)) 475 if (!is_cpuset_subset(trial, par))
430 return -EACCES; 476 goto out;
431 477
432 /* 478 /*
433 * If either I or some sibling (!= me) is exclusive, we can't 479 * If either I or some sibling (!= me) is exclusive, we can't
434 * overlap 480 * overlap
435 */ 481 */
436 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 482 ret = -EINVAL;
437 c = cgroup_cs(cont); 483 cpuset_for_each_child(c, cont, par) {
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 484 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
439 c != cur && 485 c != cur &&
440 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 486 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
441 return -EINVAL; 487 goto out;
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 488 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
443 c != cur && 489 c != cur &&
444 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 490 nodes_intersects(trial->mems_allowed, c->mems_allowed))
445 return -EINVAL; 491 goto out;
446 } 492 }
447 493
448 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 494 /*
449 if (cgroup_task_count(cur->css.cgroup)) { 495 * Cpusets with tasks - existing or newly being attached - can't
450 if (cpumask_empty(trial->cpus_allowed) || 496 * have empty cpus_allowed or mems_allowed.
451 nodes_empty(trial->mems_allowed)) { 497 */
452 return -ENOSPC; 498 ret = -ENOSPC;
453 } 499 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
454 } 500 (cpumask_empty(trial->cpus_allowed) ||
501 nodes_empty(trial->mems_allowed)))
502 goto out;
455 503
456 return 0; 504 ret = 0;
505out:
506 rcu_read_unlock();
507 return ret;
457} 508}
458 509
459#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
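The rewritten validate_change() above also adopts the usual single-exit pattern: take rcu_read_lock() once, preload ret with the error for the next check, and funnel every failure through one unlock label. A standalone sketch of that shape, with stub predicates and demo lock helpers standing in for the kernel primitives:

#include <errno.h>

static void demo_lock(void)   { /* stands in for rcu_read_lock()   */ }
static void demo_unlock(void) { /* stands in for rcu_read_unlock() */ }
static int child_not_subset(void)  { return 0; }
static int exclusive_overlap(void) { return 0; }

static int demo_validate_change(void)
{
        int ret;

        demo_lock();

        ret = -EBUSY;
        if (child_not_subset())
                goto out;

        ret = -EINVAL;
        if (exclusive_overlap())
                goto out;

        ret = 0;
out:
        demo_unlock();
        return ret;
}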
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
474 return; 525 return;
475} 526}
476 527
477static void 528static void update_domain_attr_tree(struct sched_domain_attr *dattr,
478update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) 529 struct cpuset *root_cs)
479{ 530{
480 LIST_HEAD(q); 531 struct cpuset *cp;
481 532 struct cgroup *pos_cgrp;
482 list_add(&c->stack_list, &q);
483 while (!list_empty(&q)) {
484 struct cpuset *cp;
485 struct cgroup *cont;
486 struct cpuset *child;
487
488 cp = list_first_entry(&q, struct cpuset, stack_list);
489 list_del(q.next);
490 533
491 if (cpumask_empty(cp->cpus_allowed)) 534 rcu_read_lock();
535 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
536 /* skip the whole subtree if @cp doesn't have any CPU */
537 if (cpumask_empty(cp->cpus_allowed)) {
538 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
492 continue; 539 continue;
540 }
493 541
494 if (is_sched_load_balance(cp)) 542 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp); 543 update_domain_attr(dattr, cp);
496
497 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
498 child = cgroup_cs(cont);
499 list_add_tail(&child->stack_list, &q);
500 }
501 } 544 }
545 rcu_read_unlock();
502} 546}
503 547
504/* 548/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
520 * domains when operating in the severe memory shortage situations 564 * domains when operating in the severe memory shortage situations
521 * that could cause allocation failures below. 565 * that could cause allocation failures below.
522 * 566 *
523 * Must be called with cgroup_lock held. 567 * Must be called with cpuset_mutex held.
524 * 568 *
525 * The three key local variables below are: 569 * The three key local variables below are:
526 * q - a linked-list queue of cpuset pointers, used to implement a 570 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
558static int generate_sched_domains(cpumask_var_t **domains, 602static int generate_sched_domains(cpumask_var_t **domains,
559 struct sched_domain_attr **attributes) 603 struct sched_domain_attr **attributes)
560{ 604{
561 LIST_HEAD(q); /* queue of cpusets to be scanned */
562 struct cpuset *cp; /* scans q */ 605 struct cpuset *cp; /* scans q */
563 struct cpuset **csa; /* array of all cpuset ptrs */ 606 struct cpuset **csa; /* array of all cpuset ptrs */
564 int csn; /* how many cpuset ptrs in csa so far */ 607 int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
567 struct sched_domain_attr *dattr; /* attributes for custom domains */ 610 struct sched_domain_attr *dattr; /* attributes for custom domains */
568 int ndoms = 0; /* number of sched domains in result */ 611 int ndoms = 0; /* number of sched domains in result */
569 int nslot; /* next empty doms[] struct cpumask slot */ 612 int nslot; /* next empty doms[] struct cpumask slot */
613 struct cgroup *pos_cgrp;
570 614
571 doms = NULL; 615 doms = NULL;
572 dattr = NULL; 616 dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
594 goto done; 638 goto done;
595 csn = 0; 639 csn = 0;
596 640
597 list_add(&top_cpuset.stack_list, &q); 641 rcu_read_lock();
598 while (!list_empty(&q)) { 642 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
599 struct cgroup *cont;
600 struct cpuset *child; /* scans child cpusets of cp */
601
602 cp = list_first_entry(&q, struct cpuset, stack_list);
603 list_del(q.next);
604
605 if (cpumask_empty(cp->cpus_allowed))
606 continue;
607
608 /* 643 /*
609 * All child cpusets contain a subset of the parent's cpus, so 644 * Continue traversing beyond @cp iff @cp has some CPUs and
610 * just skip them, and then we call update_domain_attr_tree() 645 * isn't load balancing. The former is obvious. The
611 * to calc relax_domain_level of the corresponding sched 646 * latter: All child cpusets contain a subset of the
612 * domain. 647 * parent's cpus, so just skip them, and then we call
648 * update_domain_attr_tree() to calc relax_domain_level of
649 * the corresponding sched domain.
613 */ 650 */
614 if (is_sched_load_balance(cp)) { 651 if (!cpumask_empty(cp->cpus_allowed) &&
615 csa[csn++] = cp; 652 !is_sched_load_balance(cp))
616 continue; 653 continue;
617 }
618 654
619 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 655 if (is_sched_load_balance(cp))
620 child = cgroup_cs(cont); 656 csa[csn++] = cp;
621 list_add_tail(&child->stack_list, &q); 657
622 } 658 /* skip @cp's subtree */
623 } 659 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
660 }
661 rcu_read_unlock();
624 662
625 for (i = 0; i < csn; i++) 663 for (i = 0; i < csn; i++)
626 csa[i]->pn = i; 664 csa[i]->pn = i;
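generate_sched_domains() above now walks the hierarchy with cpuset_for_each_descendant_pre() and prunes with cgroup_rightmost_descendant() instead of the old stack_list queue. Skipping to the rightmost descendant is equivalent to simply not recursing, which the following recursive sketch (illustrative node type, not the kernel's) makes explicit: descend only while a cpuset has CPUs and is not itself load balancing; once a load-balancing cpuset is recorded, its children are subsets and can be ignored.

struct demo_cs {
        int has_cpus;
        int load_balance;
        struct demo_cs **children;
        int nr_children;
};

static void collect_domain_roots(struct demo_cs *cs,
                                 struct demo_cs **csa, int *csn)
{
        if (cs->has_cpus && !cs->load_balance) {
                /* keep descending: children may define their own domains */
                for (int i = 0; i < cs->nr_children; i++)
                        collect_domain_roots(cs->children[i], csa, csn);
                return;
        }
        if (cs->load_balance)
                csa[(*csn)++] = cs;     /* this cpuset roots a sched domain */
        /* returning here == skipping the whole subtree */
}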
@@ -725,25 +763,25 @@ done:
725/* 763/*
726 * Rebuild scheduler domains. 764 * Rebuild scheduler domains.
727 * 765 *
728 * Call with neither cgroup_mutex held nor within get_online_cpus(). 766 * If the flag 'sched_load_balance' of any cpuset with non-empty
729 * Takes both cgroup_mutex and get_online_cpus(). 767 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
768 * which has that flag enabled, or if any cpuset with a non-empty
769 * 'cpus' is removed, then call this routine to rebuild the
770 * scheduler's dynamic sched domains.
730 * 771 *
731 * Cannot be directly called from cpuset code handling changes 772 * Call with cpuset_mutex held. Takes get_online_cpus().
732 * to the cpuset pseudo-filesystem, because it cannot be called
733 * from code that already holds cgroup_mutex.
734 */ 773 */
735static void do_rebuild_sched_domains(struct work_struct *unused) 774static void rebuild_sched_domains_locked(void)
736{ 775{
737 struct sched_domain_attr *attr; 776 struct sched_domain_attr *attr;
738 cpumask_var_t *doms; 777 cpumask_var_t *doms;
739 int ndoms; 778 int ndoms;
740 779
780 lockdep_assert_held(&cpuset_mutex);
741 get_online_cpus(); 781 get_online_cpus();
742 782
743 /* Generate domain masks and attrs */ 783 /* Generate domain masks and attrs */
744 cgroup_lock();
745 ndoms = generate_sched_domains(&doms, &attr); 784 ndoms = generate_sched_domains(&doms, &attr);
746 cgroup_unlock();
747 785
748 /* Have scheduler rebuild the domains */ 786 /* Have scheduler rebuild the domains */
749 partition_sched_domains(ndoms, doms, attr); 787 partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
751 put_online_cpus(); 789 put_online_cpus();
752} 790}
753#else /* !CONFIG_SMP */ 791#else /* !CONFIG_SMP */
754static void do_rebuild_sched_domains(struct work_struct *unused) 792static void rebuild_sched_domains_locked(void)
755{ 793{
756} 794}
757 795
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
763} 801}
764#endif /* CONFIG_SMP */ 802#endif /* CONFIG_SMP */
765 803
766static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
767
768/*
769 * Rebuild scheduler domains, asynchronously via workqueue.
770 *
771 * If the flag 'sched_load_balance' of any cpuset with non-empty
772 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
773 * which has that flag enabled, or if any cpuset with a non-empty
774 * 'cpus' is removed, then call this routine to rebuild the
775 * scheduler's dynamic sched domains.
776 *
777 * The rebuild_sched_domains() and partition_sched_domains()
778 * routines must nest cgroup_lock() inside get_online_cpus(),
779 * but such cpuset changes as these must nest that locking the
780 * other way, holding cgroup_lock() for much of the code.
781 *
782 * So in order to avoid an ABBA deadlock, the cpuset code handling
783 * these user changes delegates the actual sched domain rebuilding
784 * to a separate workqueue thread, which ends up processing the
785 * above do_rebuild_sched_domains() function.
786 */
787static void async_rebuild_sched_domains(void)
788{
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
790}
791
792/*
793 * Accomplishes the same scheduler domain rebuild as the above
794 * async_rebuild_sched_domains(), however it directly calls the
795 * rebuild routine synchronously rather than calling it via an
796 * asynchronous work thread.
797 *
798 * This can only be called from code that is not holding
799 * cgroup_mutex (not nested in a cgroup_lock() call.)
800 */
801void rebuild_sched_domains(void) 804void rebuild_sched_domains(void)
802{ 805{
803 do_rebuild_sched_domains(NULL); 806 mutex_lock(&cpuset_mutex);
807 rebuild_sched_domains_locked();
808 mutex_unlock(&cpuset_mutex);
804} 809}
805 810
806/** 811/**
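With async_rebuild_sched_domains() gone, the patch leaves the common wrapper-plus-_locked split: internal cpuset code that already holds cpuset_mutex calls rebuild_sched_domains_locked() directly, while the public rebuild_sched_domains() takes the mutex itself. Reduced to a standalone sketch with a pthread mutex standing in for cpuset_mutex:

#include <pthread.h>

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;

static void demo_rebuild_locked(void)
{
        /* caller must hold demo_mutex; regenerate and apply domains here */
}

void demo_rebuild(void)
{
        pthread_mutex_lock(&demo_mutex);
        demo_rebuild_locked();
        pthread_mutex_unlock(&demo_mutex);
}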
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
808 * @tsk: task to test 813 * @tsk: task to test
809 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 814 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
810 * 815 *
811 * Call with cgroup_mutex held. May take callback_mutex during call. 816 * Call with cpuset_mutex held. May take callback_mutex during call.
812 * Called for each task in a cgroup by cgroup_scan_tasks(). 817 * Called for each task in a cgroup by cgroup_scan_tasks().
813 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 818 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
814 * words, if its mask is not equal to its cpuset's mask). 819 * words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829 * cpus_allowed mask needs to be changed. 834 * cpus_allowed mask needs to be changed.
830 * 835 *
831 * We don't need to re-check for the cgroup/cpuset membership, since we're 836 * We don't need to re-check for the cgroup/cpuset membership, since we're
832 * holding cgroup_lock() at this point. 837 * holding cpuset_mutex at this point.
833 */ 838 */
834static void cpuset_change_cpumask(struct task_struct *tsk, 839static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan) 840 struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
842 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 847 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
843 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 848 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
844 * 849 *
845 * Called with cgroup_mutex held 850 * Called with cpuset_mutex held
846 * 851 *
847 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 852 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
848 * calling callback functions for each. 853 * calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 heap_free(&heap); 925 heap_free(&heap);
921 926
922 if (is_load_balanced) 927 if (is_load_balanced)
923 async_rebuild_sched_domains(); 928 rebuild_sched_domains_locked();
924 return 0; 929 return 0;
925} 930}
926 931
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 932 * Temporarily set tasks' mems_allowed to target nodes of migration, 937 * Temporarily set tasks' mems_allowed to target nodes of migration,
933 * so that the migration code can allocate pages on these nodes. 938 * so that the migration code can allocate pages on these nodes.
934 * 939 *
935 * Call holding cgroup_mutex, so current's cpuset won't change 940 * Call holding cpuset_mutex, so current's cpuset won't change
936 * during this call, as manage_mutex holds off any cpuset_attach() 941 * during this call, as manage_mutex holds off any cpuset_attach()
937 * calls. Therefore we don't need to take task_lock around the 942 * calls. Therefore we don't need to take task_lock around the
938 * call to guarantee_online_mems(), as we know no one is changing 943 * call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007/* 1012/*
1008 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1013 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1009 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1014 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1010 * memory_migrate flag is set. Called with cgroup_mutex held. 1015 * memory_migrate flag is set. Called with cpuset_mutex held.
1011 */ 1016 */
1012static void cpuset_change_nodemask(struct task_struct *p, 1017static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan) 1018 struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1021 struct cpuset *cs;
1017 int migrate; 1022 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1023 const nodemask_t *oldmem = scan->data;
1019 static nodemask_t newmems; /* protected by cgroup_mutex */ 1024 static nodemask_t newmems; /* protected by cpuset_mutex */
1020 1025
1021 cs = cgroup_cs(scan->cg); 1026 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems); 1027 guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
1043 * @oldmem: old mems_allowed of cpuset cs 1048 * @oldmem: old mems_allowed of cpuset cs
1044 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1049 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1045 * 1050 *
1046 * Called with cgroup_mutex held 1051 * Called with cpuset_mutex held
1047 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1052 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1048 * if @heap != NULL. 1053 * if @heap != NULL.
1049 */ 1054 */
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1065 * take while holding tasklist_lock. Forks can happen - the 1070 * take while holding tasklist_lock. Forks can happen - the
1066 * mpol_dup() cpuset_being_rebound check will catch such forks, 1071 * mpol_dup() cpuset_being_rebound check will catch such forks,
1067 * and rebind their vma mempolicies too. Because we still hold 1072 * and rebind their vma mempolicies too. Because we still hold
1068 * the global cgroup_mutex, we know that no other rebind effort 1073 * the global cpuset_mutex, we know that no other rebind effort
1069 * will be contending for the global variable cpuset_being_rebound. 1074 * will be contending for the global variable cpuset_being_rebound.
1070 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1075 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1071 * is idempotent. Also migrate pages in each mm to new nodes. 1076 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1084 * mempolicies and if the cpuset is marked 'memory_migrate', 1089 * mempolicies and if the cpuset is marked 'memory_migrate',
1085 * migrate the tasks pages to the new memory. 1090 * migrate the tasks pages to the new memory.
1086 * 1091 *
1087 * Call with cgroup_mutex held. May take callback_mutex during call. 1092 * Call with cpuset_mutex held. May take callback_mutex during call.
1088 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1093 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1089 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1094 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1090 * their mempolicies to the cpusets new mems_allowed. 1095 * their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1168 cs->relax_domain_level = val; 1173 cs->relax_domain_level = val;
1169 if (!cpumask_empty(cs->cpus_allowed) && 1174 if (!cpumask_empty(cs->cpus_allowed) &&
1170 is_sched_load_balance(cs)) 1175 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains(); 1176 rebuild_sched_domains_locked();
1172 } 1177 }
1173 1178
1174 return 0; 1179 return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182 * Called by cgroup_scan_tasks() for each task in a cgroup. 1187 * Called by cgroup_scan_tasks() for each task in a cgroup.
1183 * 1188 *
1184 * We don't need to re-check for the cgroup/cpuset membership, since we're 1189 * We don't need to re-check for the cgroup/cpuset membership, since we're
1185 * holding cgroup_lock() at this point. 1190 * holding cpuset_mutex at this point.
1186 */ 1191 */
1187static void cpuset_change_flag(struct task_struct *tsk, 1192static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan) 1193 struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
1195 * @cs: the cpuset in which each task's spread flags needs to be changed 1200 * @cs: the cpuset in which each task's spread flags needs to be changed
1196 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1201 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1197 * 1202 *
1198 * Called with cgroup_mutex held 1203 * Called with cpuset_mutex held
1199 * 1204 *
1200 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1205 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1201 * calling callback functions for each. 1206 * calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1220 * cs: the cpuset to update 1225 * cs: the cpuset to update
1221 * turning_on: whether the flag is being set or cleared 1226 * turning_on: whether the flag is being set or cleared
1222 * 1227 *
1223 * Call with cgroup_mutex held. 1228 * Call with cpuset_mutex held.
1224 */ 1229 */
1225 1230
1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1231static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1260 mutex_unlock(&callback_mutex); 1265 mutex_unlock(&callback_mutex);
1261 1266
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1267 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains(); 1268 rebuild_sched_domains_locked();
1264 1269
1265 if (spread_flag_changed) 1270 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap); 1271 update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1373 return val;
1369} 1374}
1370 1375
1371/* 1376/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must
1374 * persist until attach.
1375 */
1376static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to;
1379
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1377static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1382{ 1378{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1379 struct cpuset *cs = cgroup_cs(cgrp);
1384 struct task_struct *task; 1380 struct task_struct *task;
1385 int ret; 1381 int ret;
1386 1382
1383 mutex_lock(&cpuset_mutex);
1384
1385 ret = -ENOSPC;
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC; 1387 goto out_unlock;
1389 1388
1390 cgroup_taskset_for_each(task, cgrp, tset) { 1389 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /* 1390 /*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1397 * set_cpus_allowed_ptr() on all attached tasks before 1396 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed. 1397 * cpus_allowed may be changed.
1399 */ 1398 */
1399 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL; 1401 goto out_unlock;
1402 if ((ret = security_task_setscheduler(task))) 1402 ret = security_task_setscheduler(task);
1403 return ret; 1403 if (ret)
1404 goto out_unlock;
1404 } 1405 }
1405 1406
1406 /* prepare for attach */ 1407 /*
1407 if (cs == &top_cpuset) 1408 * Mark attach is in progress. This makes validate_change() fail
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1409 * changes which zero cpus/mems_allowed.
1409 else 1410 */
1410 guarantee_online_cpus(cs, cpus_attach); 1411 cs->attach_in_progress++;
1411 1412 ret = 0;
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1413out_unlock:
1414 mutex_unlock(&cpuset_mutex);
1415 return ret;
1416}
1413 1417
1414 return 0; 1418static void cpuset_cancel_attach(struct cgroup *cgrp,
1419 struct cgroup_taskset *tset)
1420{
1421 mutex_lock(&cpuset_mutex);
1422 cgroup_cs(cgrp)->attach_in_progress--;
1423 mutex_unlock(&cpuset_mutex);
1415} 1424}
1416 1425
1426/*
1427 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
1428 * but we can't allocate it dynamically there. Define it global and
1429 * allocate from cpuset_init().
1430 */
1431static cpumask_var_t cpus_attach;
1432
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1433static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1418{ 1434{
1435 /* static bufs protected by cpuset_mutex */
1436 static nodemask_t cpuset_attach_nodemask_from;
1437 static nodemask_t cpuset_attach_nodemask_to;
1419 struct mm_struct *mm; 1438 struct mm_struct *mm;
1420 struct task_struct *task; 1439 struct task_struct *task;
1421 struct task_struct *leader = cgroup_taskset_first(tset); 1440 struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1423 struct cpuset *cs = cgroup_cs(cgrp); 1442 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1443 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1444
1445 mutex_lock(&cpuset_mutex);
1446
1447 /* prepare for attach */
1448 if (cs == &top_cpuset)
1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1450 else
1451 guarantee_online_cpus(cs, cpus_attach);
1452
1453 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1454
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1455 cgroup_taskset_for_each(task, cgrp, tset) {
1427 /* 1456 /*
1428 * can_attach beforehand should guarantee that this doesn't 1457 * can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1448 &cpuset_attach_nodemask_to); 1477 &cpuset_attach_nodemask_to);
1449 mmput(mm); 1478 mmput(mm);
1450 } 1479 }
1480
1481 cs->attach_in_progress--;
1482
1483 /*
1484 * We may have raced with CPU/memory hotunplug. Trigger hotplug
1485 * propagation if @cs doesn't have any CPU or memory. It will move
1486 * the newly added tasks to the nearest parent which can execute.
1487 */
1488 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1489 schedule_cpuset_propagate_hotplug(cs);
1490
1491 mutex_unlock(&cpuset_mutex);
1451} 1492}
1452 1493
1453/* The various types of files and directories in a cpuset file system */ 1494/* The various types of files and directories in a cpuset file system */
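The can_attach()/cancel_attach()/attach() changes above introduce attach_in_progress so that validate_change() refuses to empty a cpuset while an attach is still in flight, and so a cancelled attach restores the count. The protocol itself is small; here is a standalone sketch with a pthread mutex and a plain counter standing in for cpuset_mutex and the cpuset:

#include <pthread.h>

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;
static int attach_in_progress;

static int demo_can_attach(void)
{
        pthread_mutex_lock(&demo_mutex);
        attach_in_progress++;   /* writers must not empty cpus/mems now */
        pthread_mutex_unlock(&demo_mutex);
        return 0;
}

static void demo_cancel_attach(void)
{
        pthread_mutex_lock(&demo_mutex);
        attach_in_progress--;   /* another controller vetoed the attach */
        pthread_mutex_unlock(&demo_mutex);
}

static void demo_attach(void)
{
        pthread_mutex_lock(&demo_mutex);
        /* ... migrate the tasks ... */
        attach_in_progress--;   /* attach finished */
        pthread_mutex_unlock(&demo_mutex);
}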
@@ -1469,12 +1510,13 @@ typedef enum {
1469 1510
1470static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1511static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1471{ 1512{
1472 int retval = 0;
1473 struct cpuset *cs = cgroup_cs(cgrp); 1513 struct cpuset *cs = cgroup_cs(cgrp);
1474 cpuset_filetype_t type = cft->private; 1514 cpuset_filetype_t type = cft->private;
1515 int retval = -ENODEV;
1475 1516
1476 if (!cgroup_lock_live_group(cgrp)) 1517 mutex_lock(&cpuset_mutex);
1477 return -ENODEV; 1518 if (!is_cpuset_online(cs))
1519 goto out_unlock;
1478 1520
1479 switch (type) { 1521 switch (type) {
1480 case FILE_CPU_EXCLUSIVE: 1522 case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1508 retval = -EINVAL; 1550 retval = -EINVAL;
1509 break; 1551 break;
1510 } 1552 }
1511 cgroup_unlock(); 1553out_unlock:
1554 mutex_unlock(&cpuset_mutex);
1512 return retval; 1555 return retval;
1513} 1556}
1514 1557
1515static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1558static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1516{ 1559{
1517 int retval = 0;
1518 struct cpuset *cs = cgroup_cs(cgrp); 1560 struct cpuset *cs = cgroup_cs(cgrp);
1519 cpuset_filetype_t type = cft->private; 1561 cpuset_filetype_t type = cft->private;
1562 int retval = -ENODEV;
1520 1563
1521 if (!cgroup_lock_live_group(cgrp)) 1564 mutex_lock(&cpuset_mutex);
1522 return -ENODEV; 1565 if (!is_cpuset_online(cs))
1566 goto out_unlock;
1523 1567
1524 switch (type) { 1568 switch (type) {
1525 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1569 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1529 retval = -EINVAL; 1573 retval = -EINVAL;
1530 break; 1574 break;
1531 } 1575 }
1532 cgroup_unlock(); 1576out_unlock:
1577 mutex_unlock(&cpuset_mutex);
1533 return retval; 1578 return retval;
1534} 1579}
1535 1580
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1539static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1584static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1540 const char *buf) 1585 const char *buf)
1541{ 1586{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp); 1587 struct cpuset *cs = cgroup_cs(cgrp);
1544 struct cpuset *trialcs; 1588 struct cpuset *trialcs;
1589 int retval = -ENODEV;
1590
1591 /*
1592 * CPU or memory hotunplug may leave @cs w/o any execution
1593 * resources, in which case the hotplug code asynchronously updates
1594 * configuration and transfers all tasks to the nearest ancestor
1595 * which can execute.
1596 *
1597 * As writes to "cpus" or "mems" may restore @cs's execution
1598 * resources, wait for the previously scheduled operations before
 1599 * proceeding, so that we don't end up repeatedly removing tasks added
1600 * after execution capability is restored.
1601 *
1602 * Flushing cpuset_hotplug_work is enough to synchronize against
 1603 * hotplug handling; however, cpuset_attach() may schedule
1604 * propagation work directly. Flush the workqueue too.
1605 */
1606 flush_work(&cpuset_hotplug_work);
1607 flush_workqueue(cpuset_propagate_hotplug_wq);
1545 1608
1546 if (!cgroup_lock_live_group(cgrp)) 1609 mutex_lock(&cpuset_mutex);
1547 return -ENODEV; 1610 if (!is_cpuset_online(cs))
1611 goto out_unlock;
1548 1612
1549 trialcs = alloc_trial_cpuset(cs); 1613 trialcs = alloc_trial_cpuset(cs);
1550 if (!trialcs) { 1614 if (!trialcs) {
1551 retval = -ENOMEM; 1615 retval = -ENOMEM;
1552 goto out; 1616 goto out_unlock;
1553 } 1617 }
1554 1618
1555 switch (cft->private) { 1619 switch (cft->private) {
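cpuset_write_resmask() above now flushes both the hotplug work item and the propagation workqueue before taking cpuset_mutex, so a write that restores CPUs or memory cannot race with earlier asynchronous emptying. The ordering, reduced to a pthread sketch where joining a worker thread stands in for flush_work()/flush_workqueue():

#include <pthread.h>
#include <stddef.h>

static void *pending_hotplug_work(void *arg)
{
        (void)arg;              /* previously queued propagation work */
        return NULL;
}

static void demo_write_resmask(pthread_t pending)
{
        pthread_join(pending, NULL);    /* wait for earlier async work */
        /* ...now take the config lock and apply the new cpus/mems mask... */
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, pending_hotplug_work, NULL);
        demo_write_resmask(t);
        return 0;
}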
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 } 1629 }
1566 1630
1567 free_trial_cpuset(trialcs); 1631 free_trial_cpuset(trialcs);
1568out: 1632out_unlock:
1569 cgroup_unlock(); 1633 mutex_unlock(&cpuset_mutex);
1570 return retval; 1634 return retval;
1571} 1635}
1572 1636
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
1790 1854
1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1855static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1792{ 1856{
1793 struct cgroup *parent_cg = cont->parent; 1857 struct cpuset *cs;
1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1796 1858
1797 if (!parent_cg) 1859 if (!cont->parent)
1798 return &top_cpuset.css; 1860 return &top_cpuset.css;
1799 parent = cgroup_cs(parent_cg);
1800 1861
1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1862 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1802 if (!cs) 1863 if (!cs)
1803 return ERR_PTR(-ENOMEM); 1864 return ERR_PTR(-ENOMEM);
1804 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1865 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1806 return ERR_PTR(-ENOMEM); 1867 return ERR_PTR(-ENOMEM);
1807 } 1868 }
1808 1869
1809 cs->flags = 0;
1810 if (is_spread_page(parent))
1811 set_bit(CS_SPREAD_PAGE, &cs->flags);
1812 if (is_spread_slab(parent))
1813 set_bit(CS_SPREAD_SLAB, &cs->flags);
1814 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1815 cpumask_clear(cs->cpus_allowed); 1871 cpumask_clear(cs->cpus_allowed);
1816 nodes_clear(cs->mems_allowed); 1872 nodes_clear(cs->mems_allowed);
1817 fmeter_init(&cs->fmeter); 1873 fmeter_init(&cs->fmeter);
1874 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1818 cs->relax_domain_level = -1; 1875 cs->relax_domain_level = -1;
1819 1876
1820 cs->parent = parent; 1877 return &cs->css;
1878}
1879
1880static int cpuset_css_online(struct cgroup *cgrp)
1881{
1882 struct cpuset *cs = cgroup_cs(cgrp);
1883 struct cpuset *parent = parent_cs(cs);
1884 struct cpuset *tmp_cs;
1885 struct cgroup *pos_cg;
1886
1887 if (!parent)
1888 return 0;
1889
1890 mutex_lock(&cpuset_mutex);
1891
1892 set_bit(CS_ONLINE, &cs->flags);
1893 if (is_spread_page(parent))
1894 set_bit(CS_SPREAD_PAGE, &cs->flags);
1895 if (is_spread_slab(parent))
1896 set_bit(CS_SPREAD_SLAB, &cs->flags);
1897
1821 number_of_cpusets++; 1898 number_of_cpusets++;
1822 1899
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) 1900 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
1824 goto skip_clone; 1901 goto out_unlock;
1825 1902
1826 /* 1903 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 1904 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 1913 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup. 1914 * (and likewise for mems) to the new cgroup.
1838 */ 1915 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { 1916 rcu_read_lock();
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg); 1917 cpuset_for_each_child(tmp_cs, pos_cg, parent) {
1841 1918 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) 1919 rcu_read_unlock();
1843 goto skip_clone; 1920 goto out_unlock;
1921 }
1844 } 1922 }
1923 rcu_read_unlock();
1845 1924
1846 mutex_lock(&callback_mutex); 1925 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed; 1926 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1927 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex); 1928 mutex_unlock(&callback_mutex);
1850skip_clone: 1929out_unlock:
1851 return &cs->css; 1930 mutex_unlock(&cpuset_mutex);
1931 return 0;
1932}
1933
1934static void cpuset_css_offline(struct cgroup *cgrp)
1935{
1936 struct cpuset *cs = cgroup_cs(cgrp);
1937
1938 mutex_lock(&cpuset_mutex);
1939
1940 if (is_sched_load_balance(cs))
1941 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1942
1943 number_of_cpusets--;
1944 clear_bit(CS_ONLINE, &cs->flags);
1945
1946 mutex_unlock(&cpuset_mutex);
1852} 1947}
1853 1948
1854/* 1949/*
1855 * If the cpuset being removed has its flag 'sched_load_balance' 1950 * If the cpuset being removed has its flag 'sched_load_balance'
1856 * enabled, then simulate turning sched_load_balance off, which 1951 * enabled, then simulate turning sched_load_balance off, which
1857 * will call async_rebuild_sched_domains(). 1952 * will call rebuild_sched_domains_locked().
1858 */ 1953 */
1859 1954
1860static void cpuset_css_free(struct cgroup *cont) 1955static void cpuset_css_free(struct cgroup *cont)
1861{ 1956{
1862 struct cpuset *cs = cgroup_cs(cont); 1957 struct cpuset *cs = cgroup_cs(cont);
1863 1958
1864 if (is_sched_load_balance(cs))
1865 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1866
1867 number_of_cpusets--;
1868 free_cpumask_var(cs->cpus_allowed); 1959 free_cpumask_var(cs->cpus_allowed);
1869 kfree(cs); 1960 kfree(cs);
1870} 1961}
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
1872struct cgroup_subsys cpuset_subsys = { 1963struct cgroup_subsys cpuset_subsys = {
1873 .name = "cpuset", 1964 .name = "cpuset",
1874 .css_alloc = cpuset_css_alloc, 1965 .css_alloc = cpuset_css_alloc,
1966 .css_online = cpuset_css_online,
1967 .css_offline = cpuset_css_offline,
1875 .css_free = cpuset_css_free, 1968 .css_free = cpuset_css_free,
1876 .can_attach = cpuset_can_attach, 1969 .can_attach = cpuset_can_attach,
1970 .cancel_attach = cpuset_cancel_attach,
1877 .attach = cpuset_attach, 1971 .attach = cpuset_attach,
1878 .subsys_id = cpuset_subsys_id, 1972 .subsys_id = cpuset_subsys_id,
1879 .base_cftypes = files, 1973 .base_cftypes = files,
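
The cpuset hunks above split what used to happen entirely in css_alloc() into an allocation-only css_alloc() and a css_online()/css_offline() pair that inherits and tears down parent state under cpuset_mutex. A rough, non-authoritative sketch of a controller wired up the same way against the 3.9-era cgroup API follows; the foo_* names are invented, and a real controller also fills in .subsys_id and friends as the cpuset entry does.

    #include <linux/cgroup.h>
    #include <linux/slab.h>
    #include <linux/err.h>

    struct foo_css {
        struct cgroup_subsys_state css;
        unsigned long flags;
    };

    /* allocation only: no locks taken, no parent-dependent state yet */
    static struct cgroup_subsys_state *foo_css_alloc(struct cgroup *cgrp)
    {
        struct foo_css *fc = kzalloc(sizeof(*fc), GFP_KERNEL);

        return fc ? &fc->css : ERR_PTR(-ENOMEM);
    }

    /* runs once the css is visible: inherit parent state under a mutex here */
    static int foo_css_online(struct cgroup *cgrp)
    {
        return 0;
    }

    /* undo what css_online() did, under the same mutex, before the css goes away */
    static void foo_css_offline(struct cgroup *cgrp)
    {
    }

    static void foo_css_free(struct cgroup *cgrp)
    {
        /* container_of() back to struct foo_css and kfree() it */
    }

    struct cgroup_subsys foo_subsys = {
        .name        = "foo",
        .css_alloc   = foo_css_alloc,
        .css_online  = foo_css_online,
        .css_offline = foo_css_offline,
        .css_free    = foo_css_free,
    };
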
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1924{ 2018{
1925 struct cgroup *new_cgroup = scan->data; 2019 struct cgroup *new_cgroup = scan->data;
1926 2020
2021 cgroup_lock();
1927 cgroup_attach_task(new_cgroup, tsk); 2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
1928} 2024}
1929 2025
1930/** 2026/**
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932 * @from: cpuset in which the tasks currently reside 2028 * @from: cpuset in which the tasks currently reside
1933 * @to: cpuset to which the tasks will be moved 2029 * @to: cpuset to which the tasks will be moved
1934 * 2030 *
1935 * Called with cgroup_mutex held 2031 * Called with cpuset_mutex held
1936 * callback_mutex must not be held, as cpuset_attach() will take it. 2032 * callback_mutex must not be held, as cpuset_attach() will take it.
1937 * 2033 *
1938 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1959 * removing that CPU or node from all cpusets. If this removes the 2055 * removing that CPU or node from all cpusets. If this removes the
1960 * last CPU or node from a cpuset, then move the tasks in the empty 2056 * last CPU or node from a cpuset, then move the tasks in the empty
1961 * cpuset to its next-highest non-empty parent. 2057 * cpuset to its next-highest non-empty parent.
1962 *
1963 * Called with cgroup_mutex held
1964 * callback_mutex must not be held, as cpuset_attach() will take it.
1965 */ 2058 */
1966static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2059static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1967{ 2060{
1968 struct cpuset *parent; 2061 struct cpuset *parent;
1969 2062
1970 /* 2063 /*
1971 * The cgroup's css_sets list is in use if there are tasks
1972 * in the cpuset; the list is empty if there are none;
1973 * the cs->css.refcnt seems always 0.
1974 */
1975 if (list_empty(&cs->css.cgroup->css_sets))
1976 return;
1977
1978 /*
1979 * Find its next-highest non-empty parent, (top cpuset 2064 * Find its next-highest non-empty parent, (top cpuset
1980 * has online cpus, so can't be empty). 2065 * has online cpus, so can't be empty).
1981 */ 2066 */
1982 parent = cs->parent; 2067 parent = parent_cs(cs);
1983 while (cpumask_empty(parent->cpus_allowed) || 2068 while (cpumask_empty(parent->cpus_allowed) ||
1984 nodes_empty(parent->mems_allowed)) 2069 nodes_empty(parent->mems_allowed))
1985 parent = parent->parent; 2070 parent = parent_cs(parent);
1986 2071
1987 move_member_tasks_to_cpuset(cs, parent); 2072 move_member_tasks_to_cpuset(cs, parent);
1988} 2073}
1989 2074
1990/* 2075/**
1991 * Helper function to traverse cpusets. 2076 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
1992 * It can be used to walk the cpuset tree from top to bottom, completing 2077 * @cs: cpuset of interest
1993 * one layer before dropping down to the next (thus always processing a 2078 *
1994 * node before any of its children). 2079 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2080 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2081 * all its tasks are moved to the nearest ancestor with both resources.
1995 */ 2082 */
1996static struct cpuset *cpuset_next(struct list_head *queue) 2083static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
1997{ 2084{
1998 struct cpuset *cp; 2085 static cpumask_t off_cpus;
1999 struct cpuset *child; /* scans child cpusets of cp */ 2086 static nodemask_t off_mems, tmp_mems;
2000 struct cgroup *cont; 2087 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2088 bool is_empty;
2001 2089
2002 if (list_empty(queue)) 2090 mutex_lock(&cpuset_mutex);
2003 return NULL; 2091
2092 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2093 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2004 2094
2005 cp = list_first_entry(queue, struct cpuset, stack_list); 2095 /* remove offline cpus from @cs */
2006 list_del(queue->next); 2096 if (!cpumask_empty(&off_cpus)) {
2007 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2097 mutex_lock(&callback_mutex);
2008 child = cgroup_cs(cont); 2098 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2009 list_add_tail(&child->stack_list, queue); 2099 mutex_unlock(&callback_mutex);
2100 update_tasks_cpumask(cs, NULL);
2101 }
2102
2103 /* remove offline mems from @cs */
2104 if (!nodes_empty(off_mems)) {
2105 tmp_mems = cs->mems_allowed;
2106 mutex_lock(&callback_mutex);
2107 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2108 mutex_unlock(&callback_mutex);
2109 update_tasks_nodemask(cs, &tmp_mems, NULL);
2010 } 2110 }
2011 2111
2012 return cp; 2112 is_empty = cpumask_empty(cs->cpus_allowed) ||
2113 nodes_empty(cs->mems_allowed);
2114
2115 mutex_unlock(&cpuset_mutex);
2116
2117 /*
2118 * If @cs became empty, move tasks to the nearest ancestor with
2119 * execution resources. This is full cgroup operation which will
2120 * also call back into cpuset. Should be done outside any lock.
2121 */
2122 if (is_empty)
2123 remove_tasks_in_empty_cpuset(cs);
2124
2125 /* the following may free @cs, should be the last operation */
2126 css_put(&cs->css);
2013} 2127}
2014 2128
2129/**
2130 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2131 * @cs: cpuset of interest
2132 *
2133 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2134 * memory masks according to top_cpuset.
2135 */
2136static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2137{
2138 /*
2139 * Pin @cs. The refcnt will be released when the work item
2140 * finishes executing.
2141 */
2142 if (!css_tryget(&cs->css))
2143 return;
2015 2144
2016/* 2145 /*
2017 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory 2146 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2018 * online/offline) and update the cpusets accordingly. 2147 * cpuset_propagate_hotplug_wq is ordered and propagation will
2019 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such 2148 * happen in the order this function is called.
2020 * cpuset must be moved to a parent cpuset. 2149 */
2150 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2151 css_put(&cs->css);
2152}
2153
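
schedule_cpuset_propagate_hotplug() above shows the pin-then-queue idiom the comments describe: grab a reference before queueing, give it back immediately if the work item was already pending, and drop the final reference at the end of the work function. A generic sketch of the same pattern with a kref-counted object follows; the foo_* names and the use of system_wq are illustrative assumptions, not part of the patch.

    #include <linux/kref.h>
    #include <linux/workqueue.h>
    #include <linux/slab.h>

    struct foo {
        struct kref        ref;
        struct work_struct work;
    };

    static void foo_release(struct kref *ref)
    {
        kfree(container_of(ref, struct foo, ref));
    }

    static void foo_workfn(struct work_struct *work)
    {
        struct foo *f = container_of(work, struct foo, work);

        /* ... do the deferred processing ... */

        /* drop the reference taken when the work was scheduled */
        kref_put(&f->ref, foo_release);
    }

    static struct foo *foo_alloc(void)
    {
        struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

        if (f) {
            kref_init(&f->ref);
            INIT_WORK(&f->work, foo_workfn);
        }
        return f;
    }

    static void foo_schedule(struct foo *f)
    {
        kref_get(&f->ref);                      /* pin @f for the work item */
        if (!queue_work(system_wq, &f->work))
            kref_put(&f->ref, foo_release);     /* already pending: lose our ref */
    }
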
2154/**
2155 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2021 * 2156 *
2022 * Called with cgroup_mutex held. We take callback_mutex to modify 2157 * This function is called after either CPU or memory configuration has
2023 * cpus_allowed and mems_allowed. 2158 * changed and updates cpuset accordingly. The top_cpuset is always
2159 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2160 * order to make cpusets transparent (of no effect) on systems that are
2161 * actively using CPU hotplug but making no active use of cpusets.
2024 * 2162 *
2025 * This walk processes the tree from top to bottom, completing one layer 2163 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2026 * before dropping down to the next. It always processes a node before 2164 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
2027 * any of its children. 2165 * descendants.
2028 * 2166 *
2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY 2167 * Note that CPU offlining during suspend is ignored. We don't modify
2030 * if all present pages from a node are offlined. 2168 * cpusets across suspend/resume cycles at all.
2031 */ 2169 */
2032static void 2170static void cpuset_hotplug_workfn(struct work_struct *work)
2033scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2034{ 2171{
2035 LIST_HEAD(queue); 2172 static cpumask_t new_cpus, tmp_cpus;
2036 struct cpuset *cp; /* scans cpusets being updated */ 2173 static nodemask_t new_mems, tmp_mems;
2037 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2174 bool cpus_updated, mems_updated;
2175 bool cpus_offlined, mems_offlined;
2038 2176
2039 list_add_tail((struct list_head *)&root->stack_list, &queue); 2177 mutex_lock(&cpuset_mutex);
2040 2178
2041 switch (event) { 2179 /* fetch the available cpus/mems and find out which changed how */
2042 case CPUSET_CPU_OFFLINE: 2180 cpumask_copy(&new_cpus, cpu_active_mask);
2043 while ((cp = cpuset_next(&queue)) != NULL) { 2181 new_mems = node_states[N_MEMORY];
2044 2182
2045 /* Continue past cpusets with all cpus online */ 2183 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2046 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2184 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2047 continue; 2185 &new_cpus);
2048 2186
2049 /* Remove offline cpus from this cpuset. */ 2187 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2050 mutex_lock(&callback_mutex); 2188 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2051 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2189 mems_offlined = !nodes_empty(tmp_mems);
2052 cpu_active_mask);
2053 mutex_unlock(&callback_mutex);
2054 2190
2055 /* Move tasks from the empty cpuset to a parent */ 2191 /* synchronize cpus_allowed to cpu_active_mask */
2056 if (cpumask_empty(cp->cpus_allowed)) 2192 if (cpus_updated) {
2057 remove_tasks_in_empty_cpuset(cp); 2193 mutex_lock(&callback_mutex);
2058 else 2194 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2059 update_tasks_cpumask(cp, NULL); 2195 mutex_unlock(&callback_mutex);
2060 } 2196 /* we don't mess with cpumasks of tasks in top_cpuset */
2061 break; 2197 }
2062 2198
2063 case CPUSET_MEM_OFFLINE: 2199 /* synchronize mems_allowed to N_MEMORY */
2064 while ((cp = cpuset_next(&queue)) != NULL) { 2200 if (mems_updated) {
2201 tmp_mems = top_cpuset.mems_allowed;
2202 mutex_lock(&callback_mutex);
2203 top_cpuset.mems_allowed = new_mems;
2204 mutex_unlock(&callback_mutex);
2205 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
2206 }
2065 2207
2066 /* Continue past cpusets with all mems online */ 2208 /* if cpus or mems went down, we need to propagate to descendants */
2067 if (nodes_subset(cp->mems_allowed, 2209 if (cpus_offlined || mems_offlined) {
2068 node_states[N_MEMORY])) 2210 struct cpuset *cs;
2069 continue; 2211 struct cgroup *pos_cgrp;
2070 2212
2071 oldmems = cp->mems_allowed; 2213 rcu_read_lock();
2214 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
2215 schedule_cpuset_propagate_hotplug(cs);
2216 rcu_read_unlock();
2217 }
2072 2218
2073 /* Remove offline mems from this cpuset. */ 2219 mutex_unlock(&cpuset_mutex);
2074 mutex_lock(&callback_mutex);
2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2076 node_states[N_MEMORY]);
2077 mutex_unlock(&callback_mutex);
2078 2220
2079 /* Move tasks from the empty cpuset to a parent */ 2221 /* wait for propagations to finish */
2080 if (nodes_empty(cp->mems_allowed)) 2222 flush_workqueue(cpuset_propagate_hotplug_wq);
2081 remove_tasks_in_empty_cpuset(cp); 2223
2082 else 2224 /* rebuild sched domains if cpus_allowed has changed */
2083 update_tasks_nodemask(cp, &oldmems, NULL); 2225 if (cpus_updated) {
2084 } 2226 struct sched_domain_attr *attr;
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2085 } 2235 }
2086} 2236}
2087 2237
2088/*
2089 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2090 * period. This is necessary in order to make cpusets transparent
2091 * (of no effect) on systems that are actively using CPU hotplug
2092 * but making no active use of cpusets.
2093 *
2094 * The only exception to this is suspend/resume, where we don't
2095 * modify cpusets at all.
2096 *
2097 * This routine ensures that top_cpuset.cpus_allowed tracks
2098 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2099 *
2100 * Called within get_online_cpus(). Needs to call cgroup_lock()
2101 * before calling generate_sched_domains().
2102 *
2103 * @cpu_online: Indicates whether this is a CPU online event (true) or
2104 * a CPU offline event (false).
2105 */
2106void cpuset_update_active_cpus(bool cpu_online) 2238void cpuset_update_active_cpus(bool cpu_online)
2107{ 2239{
2108 struct sched_domain_attr *attr; 2240 /*
2109 cpumask_var_t *doms; 2241 * We're inside cpu hotplug critical region which usually nests
2110 int ndoms; 2242 * inside cgroup synchronization. Bounce actual hotplug processing
2111 2243 * to a work item to avoid reverse locking order.
2112 cgroup_lock(); 2244 *
2113 mutex_lock(&callback_mutex); 2245 * We still need to do partition_sched_domains() synchronously;
2114 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2246 * otherwise, the scheduler will get confused and put tasks to the
2115 mutex_unlock(&callback_mutex); 2247 * dead CPU. Fall back to the default single domain.
2116 2248 * cpuset_hotplug_workfn() will rebuild it as necessary.
2117 if (!cpu_online) 2249 */
2118 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); 2250 partition_sched_domains(1, NULL, NULL);
2119 2251 schedule_work(&cpuset_hotplug_work);
2120 ndoms = generate_sched_domains(&doms, &attr);
2121 cgroup_unlock();
2122
2123 /* Have scheduler rebuild the domains */
2124 partition_sched_domains(ndoms, doms, attr);
2125} 2252}
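
The new cpuset_update_active_cpus() keeps only the synchronous single-domain fallback in the hotplug path and bounces everything else to a work item, avoiding the reverse locking order the comment mentions. A minimal sketch of that bounce with a statically declared work item follows; the my_* names are placeholders.

    #include <linux/workqueue.h>

    static void my_hotplug_workfn(struct work_struct *work)
    {
        /* heavyweight processing that may take sleeping locks */
    }
    static DECLARE_WORK(my_hotplug_work, my_hotplug_workfn);

    /* called from the CPU hotplug path, where lock ordering is restricted */
    static void my_update_active_cpus(void)
    {
        schedule_work(&my_hotplug_work);    /* no-op if already queued */
    }
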
2126 2253
2127#ifdef CONFIG_MEMORY_HOTPLUG 2254#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
2133static int cpuset_track_online_nodes(struct notifier_block *self, 2260static int cpuset_track_online_nodes(struct notifier_block *self,
2134 unsigned long action, void *arg) 2261 unsigned long action, void *arg)
2135{ 2262{
2136 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2263 schedule_work(&cpuset_hotplug_work);
2137
2138 cgroup_lock();
2139 switch (action) {
2140 case MEM_ONLINE:
2141 oldmems = top_cpuset.mems_allowed;
2142 mutex_lock(&callback_mutex);
2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2144 mutex_unlock(&callback_mutex);
2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2146 break;
2147 case MEM_OFFLINE:
2148 /*
2149 * needn't update top_cpuset.mems_allowed explicitly because
2150 * scan_cpusets_upon_hotplug() will update it.
2151 */
2152 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2153 break;
2154 default:
2155 break;
2156 }
2157 cgroup_unlock();
2158
2159 return NOTIFY_OK; 2264 return NOTIFY_OK;
2160} 2265}
2161#endif 2266#endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
2173 2278
2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2175 2280
2176 cpuset_wq = create_singlethread_workqueue("cpuset"); 2281 cpuset_propagate_hotplug_wq =
2177 BUG_ON(!cpuset_wq); 2282 alloc_ordered_workqueue("cpuset_hotplug", 0);
2283 BUG_ON(!cpuset_propagate_hotplug_wq);
2178} 2284}
2179 2285
2180/** 2286/**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2273 */ 2379 */
2274static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2380static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2275{ 2381{
2276 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) 2382 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2277 cs = cs->parent; 2383 cs = parent_cs(cs);
2278 return cs; 2384 return cs;
2279} 2385}
2280 2386
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2412} 2518}
2413 2519
2414/** 2520/**
2415 * cpuset_unlock - release lock on cpuset changes
2416 *
2417 * Undo the lock taken in a previous cpuset_lock() call.
2418 */
2419
2420void cpuset_unlock(void)
2421{
2422 mutex_unlock(&callback_mutex);
2423}
2424
2425/**
2426 * cpuset_mem_spread_node() - On which node to begin search for a file page 2521 * cpuset_mem_spread_node() - On which node to begin search for a file page
2427 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2522 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2428 * 2523 *
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2511 2606
2512 dentry = task_cs(tsk)->css.cgroup->dentry; 2607 dentry = task_cs(tsk)->css.cgroup->dentry;
2513 spin_lock(&cpuset_buffer_lock); 2608 spin_lock(&cpuset_buffer_lock);
2514 snprintf(cpuset_name, CPUSET_NAME_LEN, 2609
2515 dentry ? (const char *)dentry->d_name.name : "/"); 2610 if (!dentry) {
2611 strcpy(cpuset_name, "/");
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618
2516 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2517 tsk->mems_allowed); 2620 tsk->mems_allowed);
2518 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
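
The hunk above fixes cpuset_print_task_mems_allowed(): the old code handed the dentry name to snprintf() as its format string, so a cgroup directory containing '%' conversions could garble the message, while the new code copies the name under d_lock with strlcpy(). A small user-space illustration of the hazard and the safe copy is below; strlcpy() is not in glibc, so snprintf() with an explicit "%s" stands in for it.

    #include <stdio.h>

    int main(void)
    {
        const char *untrusted = "evil-%s-%n-name";  /* imagine a directory name */
        char buf[64];

        /* BAD: the name would be interpreted as a format string */
        /* snprintf(buf, sizeof(buf), untrusted); */

        /* GOOD: copy it as data, never as a format */
        snprintf(buf, sizeof(buf), "%s", untrusted);
        printf("cpuset=%s\n", buf);
        return 0;
    }
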
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
2560 * - Used for /proc/<pid>/cpuset. 2663 * - Used for /proc/<pid>/cpuset.
2561 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2664 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2562 * doesn't really matter if tsk->cpuset changes after we read it, 2665 * doesn't really matter if tsk->cpuset changes after we read it,
2563 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2564 * anyway. 2667 * anyway.
2565 */ 2668 */
2566static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2669static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2582 if (!tsk) 2685 if (!tsk)
2583 goto out_free; 2686 goto out_free;
2584 2687
2585 retval = -EINVAL; 2688 rcu_read_lock();
2586 cgroup_lock();
2587 css = task_subsys_state(tsk, cpuset_subsys_id); 2689 css = task_subsys_state(tsk, cpuset_subsys_id);
2588 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2690 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2691 rcu_read_unlock();
2589 if (retval < 0) 2692 if (retval < 0)
2590 goto out_unlock; 2693 goto out_put_task;
2591 seq_puts(m, buf); 2694 seq_puts(m, buf);
2592 seq_putc(m, '\n'); 2695 seq_putc(m, '\n');
2593out_unlock: 2696out_put_task:
2594 cgroup_unlock();
2595 put_task_struct(tsk); 2697 put_task_struct(tsk);
2596out_free: 2698out_free:
2597 kfree(buf); 2699 kfree(buf);
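
proc_cpuset_show(), changed above to run under rcu_read_lock() instead of cgroup_lock(), is what backs /proc/<pid>/cpuset. For reference, a trivial user-space reader of that file looks like this (the file only exists with CONFIG_PROC_PID_CPUSET enabled):

    #include <stdio.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/self/cpuset", "r");

        if (!f) {
            perror("/proc/self/cpuset");
            return 1;
        }
        if (fgets(line, sizeof(line), f))
            printf("current cpuset path: %s", line);
        fclose(f);
        return 0;
    }
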
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc8..c26278fd4851 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
29 */ 29 */
30#include <linux/pid_namespace.h> 30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 31#include <linux/clocksource.h>
32#include <linux/serial_core.h>
32#include <linux/interrupt.h> 33#include <linux/interrupt.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34#include <linux/console.h> 35#include <linux/console.h>
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7a..2235967e78b0 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode;
72#ifdef CONFIG_KGDB_KDB 72#ifdef CONFIG_KGDB_KDB
73extern int kdb_stub(struct kgdb_state *ks); 73extern int kdb_stub(struct kgdb_state *ks);
74extern int kdb_parse(const char *cmdstr); 74extern int kdb_parse(const char *cmdstr);
75extern int kdb_common_init_state(struct kgdb_state *ks);
76extern int kdb_common_deinit_state(void);
75#else /* ! CONFIG_KGDB_KDB */ 77#else /* ! CONFIG_KGDB_KDB */
76static inline int kdb_stub(struct kgdb_state *ks) 78static inline int kdb_stub(struct kgdb_state *ks)
77{ 79{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e064482..19d9a578c753 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/kgdb.h> 32#include <linux/kgdb.h>
33#include <linux/kdb.h> 33#include <linux/kdb.h>
34#include <linux/serial_core.h>
34#include <linux/reboot.h> 35#include <linux/reboot.h>
35#include <linux/uaccess.h> 36#include <linux/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
782 len = len / 2; 783 len = len / 2;
783 remcom_out_buffer[len++] = 0; 784 remcom_out_buffer[len++] = 0;
784 785
786 kdb_common_init_state(ks);
785 kdb_parse(remcom_out_buffer); 787 kdb_parse(remcom_out_buffer);
788 kdb_common_deinit_state();
789
786 strcpy(remcom_out_buffer, "OK"); 790 strcpy(remcom_out_buffer, "OK");
787 } 791 }
788 break; 792 break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5d..70a504601dc3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
486/* 486/*
487 * kdb_ss 487 * kdb_ss
488 * 488 *
489 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) 489 * Process the 'ss' (Single Step) command.
490 * commands.
491 * 490 *
492 * ss 491 * ss
493 * ssb
494 * 492 *
495 * Parameters: 493 * Parameters:
496 * argc Argument count 494 * argc Argument count
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
498 * Outputs: 496 * Outputs:
499 * None. 497 * None.
500 * Returns: 498 * Returns:
501 * KDB_CMD_SS[B] for success, a kdb error if failure. 499 * KDB_CMD_SS for success, a kdb error if failure.
502 * Locking: 500 * Locking:
503 * None. 501 * None.
504 * Remarks: 502 * Remarks:
505 * 503 *
506 * Set the arch specific option to trigger a debug trap after the next 504 * Set the arch specific option to trigger a debug trap after the next
507 * instruction. 505 * instruction.
508 *
509 * For 'ssb', set the trace flag in the debug trap handler
510 * after printing the current insn and return directly without
511 * invoking the kdb command processor, until a branch instruction
512 * is encountered.
513 */ 506 */
514 507
515static int kdb_ss(int argc, const char **argv) 508static int kdb_ss(int argc, const char **argv)
516{ 509{
517 int ssb = 0;
518
519 ssb = (strcmp(argv[0], "ssb") == 0);
520 if (argc != 0) 510 if (argc != 0)
521 return KDB_ARGCOUNT; 511 return KDB_ARGCOUNT;
522 /* 512 /*
523 * Set trace flag and go. 513 * Set trace flag and go.
524 */ 514 */
525 KDB_STATE_SET(DOING_SS); 515 KDB_STATE_SET(DOING_SS);
526 if (ssb) {
527 KDB_STATE_SET(DOING_SSB);
528 return KDB_CMD_SSB;
529 }
530 return KDB_CMD_SS; 516 return KDB_CMD_SS;
531} 517}
532 518
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void)
561 547
562 kdb_register_repeat("ss", kdb_ss, "", 548 kdb_register_repeat("ss", kdb_ss, "",
563 "Single Step", 1, KDB_REPEAT_NO_ARGS); 549 "Single Step", 1, KDB_REPEAT_NO_ARGS);
564 kdb_register_repeat("ssb", kdb_ss, "",
565 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
566 /* 550 /*
567 * Architecture dependent initialization. 551 * Architecture dependent initialization.
568 */ 552 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d30..328d18ef31e4 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
34 34
35static struct kgdb_state *kdb_ks; 35static struct kgdb_state *kdb_ks;
36 36
37int kdb_common_init_state(struct kgdb_state *ks)
38{
39 kdb_initial_cpu = atomic_read(&kgdb_active);
40 kdb_current_task = kgdb_info[ks->cpu].task;
41 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
42 return 0;
43}
44
45int kdb_common_deinit_state(void)
46{
47 kdb_initial_cpu = -1;
48 kdb_current_task = NULL;
49 kdb_current_regs = NULL;
50 return 0;
51}
52
37int kdb_stub(struct kgdb_state *ks) 53int kdb_stub(struct kgdb_state *ks)
38{ 54{
39 int error = 0; 55 int error = 0;
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks)
94 } 110 }
95 /* Set initial kdb state variables */ 111 /* Set initial kdb state variables */
96 KDB_STATE_CLEAR(KGDB_TRANS); 112 KDB_STATE_CLEAR(KGDB_TRANS);
97 kdb_initial_cpu = atomic_read(&kgdb_active); 113 kdb_common_init_state(ks);
98 kdb_current_task = kgdb_info[ks->cpu].task;
99 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
100 /* Remove any breakpoints as needed by kdb and clear single step */ 114 /* Remove any breakpoints as needed by kdb and clear single step */
101 kdb_bp_remove(); 115 kdb_bp_remove();
102 KDB_STATE_CLEAR(DOING_SS); 116 KDB_STATE_CLEAR(DOING_SS);
103 KDB_STATE_CLEAR(DOING_SSB);
104 KDB_STATE_SET(PAGER); 117 KDB_STATE_SET(PAGER);
105 /* zero out any offline cpu data */ 118 /* zero out any offline cpu data */
106 for_each_present_cpu(i) { 119 for_each_present_cpu(i) {
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks)
125 * Upon exit from the kdb main loop setup break points and restart 138 * Upon exit from the kdb main loop setup break points and restart
126 * the system based on the requested continue state 139 * the system based on the requested continue state
127 */ 140 */
128 kdb_initial_cpu = -1; 141 kdb_common_deinit_state();
129 kdb_current_task = NULL;
130 kdb_current_regs = NULL;
131 KDB_STATE_CLEAR(PAGER); 142 KDB_STATE_CLEAR(PAGER);
132 kdbnearsym_cleanup(); 143 kdbnearsym_cleanup();
133 if (error == KDB_CMD_KGDB) { 144 if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 8875254120b6..00eb8f7fbf41 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
124}; 124};
125#undef KDBMSG 125#undef KDBMSG
126 126
127static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); 127static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
128 128
129 129
130/* 130/*
@@ -175,7 +175,7 @@ static char *__env[] = {
175 (char *)0, 175 (char *)0,
176}; 176};
177 177
178static const int __nenv = (sizeof(__env) / sizeof(char *)); 178static const int __nenv = ARRAY_SIZE(__env);
179 179
180struct task_struct *kdb_curr_task(int cpu) 180struct task_struct *kdb_curr_task(int cpu)
181{ 181{
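
The two kdb hunks above replace open-coded sizeof divisions with the kernel's ARRAY_SIZE() helper. A user-space equivalent, minus the kernel's extra "must be an array" type check, is simply:

    #include <stdio.h>

    #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

    int main(void)
    {
        static const char *msgs[] = { "ok", "not found", "bad args" };

        printf("%zu entries\n", ARRAY_SIZE(msgs));   /* prints 3 */
        return 0;
    }
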
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
681 } 681 }
682 if (argc != 3) 682 if (argc != 3)
683 return KDB_ARGCOUNT; 683 return KDB_ARGCOUNT;
684 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), 684 if (in_dbg_master()) {
685 GFP_KDB); 685 kdb_printf("Command only available during kdb_init()\n");
686 if (!defcmd_set) {
687 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
688 argv[1]);
689 defcmd_set = save_defcmd_set;
690 return KDB_NOTIMP; 686 return KDB_NOTIMP;
691 } 687 }
688 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
689 GFP_KDB);
690 if (!defcmd_set)
691 goto fail_defcmd;
692 memcpy(defcmd_set, save_defcmd_set, 692 memcpy(defcmd_set, save_defcmd_set,
693 defcmd_set_count * sizeof(*defcmd_set)); 693 defcmd_set_count * sizeof(*defcmd_set));
694 kfree(save_defcmd_set);
695 s = defcmd_set + defcmd_set_count; 694 s = defcmd_set + defcmd_set_count;
696 memset(s, 0, sizeof(*s)); 695 memset(s, 0, sizeof(*s));
697 s->usable = 1; 696 s->usable = 1;
698 s->name = kdb_strdup(argv[1], GFP_KDB); 697 s->name = kdb_strdup(argv[1], GFP_KDB);
698 if (!s->name)
699 goto fail_name;
699 s->usage = kdb_strdup(argv[2], GFP_KDB); 700 s->usage = kdb_strdup(argv[2], GFP_KDB);
701 if (!s->usage)
702 goto fail_usage;
700 s->help = kdb_strdup(argv[3], GFP_KDB); 703 s->help = kdb_strdup(argv[3], GFP_KDB);
704 if (!s->help)
705 goto fail_help;
701 if (s->usage[0] == '"') { 706 if (s->usage[0] == '"') {
702 strcpy(s->usage, s->usage+1); 707 strcpy(s->usage, argv[2]+1);
703 s->usage[strlen(s->usage)-1] = '\0'; 708 s->usage[strlen(s->usage)-1] = '\0';
704 } 709 }
705 if (s->help[0] == '"') { 710 if (s->help[0] == '"') {
706 strcpy(s->help, s->help+1); 711 strcpy(s->help, argv[3]+1);
707 s->help[strlen(s->help)-1] = '\0'; 712 s->help[strlen(s->help)-1] = '\0';
708 } 713 }
709 ++defcmd_set_count; 714 ++defcmd_set_count;
710 defcmd_in_progress = 1; 715 defcmd_in_progress = 1;
716 kfree(save_defcmd_set);
711 return 0; 717 return 0;
718fail_help:
719 kfree(s->usage);
720fail_usage:
721 kfree(s->name);
722fail_name:
723 kfree(defcmd_set);
724fail_defcmd:
725 kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
726 defcmd_set = save_defcmd_set;
727 return KDB_NOTIMP;
712} 728}
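
The reworked kdb_defcmd() now unwinds partially completed allocations through a chain of fail_* labels instead of leaking them, and only frees save_defcmd_set once everything has succeeded. The same goto-ladder pattern in a self-contained user-space form (names invented):

    #include <stdlib.h>
    #include <string.h>

    struct cmd {
        char *name, *usage, *help;
    };

    /* returns 0 on success, -1 with nothing leaked on failure */
    static int cmd_init(struct cmd *c, const char *n, const char *u, const char *h)
    {
        c->name = strdup(n);
        if (!c->name)
            goto fail_name;
        c->usage = strdup(u);
        if (!c->usage)
            goto fail_usage;
        c->help = strdup(h);
        if (!c->help)
            goto fail_help;
        return 0;

    fail_help:
        free(c->usage);
    fail_usage:
        free(c->name);
    fail_name:
        return -1;
    }

    int main(void)
    {
        struct cmd c;

        return cmd_init(&c, "dumpall", "<addr>", "dump everything") ? 1 : 0;
    }
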
713 729
714/* 730/*
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
1112 * KDB_CMD_GO User typed 'go'. 1128 * KDB_CMD_GO User typed 'go'.
1113 * KDB_CMD_CPU User switched to another cpu. 1129 * KDB_CMD_CPU User switched to another cpu.
1114 * KDB_CMD_SS Single step. 1130 * KDB_CMD_SS Single step.
1115 * KDB_CMD_SSB Single step until branch.
1116 */ 1131 */
1117static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, 1132static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1118 kdb_dbtrap_t db_result) 1133 kdb_dbtrap_t db_result)
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1151 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", 1166 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1152 instruction_pointer(regs)); 1167 instruction_pointer(regs));
1153 break; 1168 break;
1154 case KDB_DB_SSB:
1155 /*
1156 * In the midst of ssb command. Just return.
1157 */
1158 KDB_DEBUG_STATE("kdb_local 3", reason);
1159 return KDB_CMD_SSB; /* Continue with SSB command */
1160
1161 break;
1162 case KDB_DB_SS: 1169 case KDB_DB_SS:
1163 break; 1170 break;
1164 case KDB_DB_SSBPT: 1171 case KDB_DB_SSBPT:
@@ -1281,7 +1288,6 @@ do_full_getstr:
1281 if (diag == KDB_CMD_GO 1288 if (diag == KDB_CMD_GO
1282 || diag == KDB_CMD_CPU 1289 || diag == KDB_CMD_CPU
1283 || diag == KDB_CMD_SS 1290 || diag == KDB_CMD_SS
1284 || diag == KDB_CMD_SSB
1285 || diag == KDB_CMD_KGDB) 1291 || diag == KDB_CMD_KGDB)
1286 break; 1292 break;
1287 1293
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1368 break; 1374 break;
1369 } 1375 }
1370 1376
1371 if (result == KDB_CMD_SSB) {
1372 KDB_STATE_SET(DOING_SS);
1373 KDB_STATE_SET(DOING_SSB);
1374 break;
1375 }
1376
1377 if (result == KDB_CMD_KGDB) { 1377 if (result == KDB_CMD_KGDB) {
1378 if (!KDB_STATE(DOING_KGDB)) 1378 if (!KDB_STATE(DOING_KGDB))
1379 kdb_printf("Entering please attach debugger " 1379 kdb_printf("Entering please attach debugger "
@@ -2350,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv)
2350 return 0; 2350 return 0;
2351} 2351}
2352 2352
2353/*
2354 * kdb_ll - This function implements the 'll' command which follows a
2355 * linked list and executes an arbitrary command for each
2356 * element.
2357 */
2358static int kdb_ll(int argc, const char **argv)
2359{
2360 int diag = 0;
2361 unsigned long addr;
2362 long offset = 0;
2363 unsigned long va;
2364 unsigned long linkoffset;
2365 int nextarg;
2366 const char *command;
2367
2368 if (argc != 3)
2369 return KDB_ARGCOUNT;
2370
2371 nextarg = 1;
2372 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2373 if (diag)
2374 return diag;
2375
2376 diag = kdbgetularg(argv[2], &linkoffset);
2377 if (diag)
2378 return diag;
2379
2380 /*
2381 * Using the starting address as
2382 * the first element in the list, and assuming that
2383 * the list ends with a null pointer.
2384 */
2385
2386 va = addr;
2387 command = kdb_strdup(argv[3], GFP_KDB);
2388 if (!command) {
2389 kdb_printf("%s: cannot duplicate command\n", __func__);
2390 return 0;
2391 }
2392 /* Recursive use of kdb_parse, do not use argv after this point */
2393 argv = NULL;
2394
2395 while (va) {
2396 char buf[80];
2397
2398 if (KDB_FLAG(CMD_INTERRUPT))
2399 goto out;
2400
2401 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2402 diag = kdb_parse(buf);
2403 if (diag)
2404 goto out;
2405
2406 addr = va + linkoffset;
2407 if (kdb_getword(&va, addr, sizeof(va)))
2408 goto out;
2409 }
2410
2411out:
2412 kfree(command);
2413 return diag;
2414}
2415
2416static int kdb_kgdb(int argc, const char **argv) 2353static int kdb_kgdb(int argc, const char **argv)
2417{ 2354{
2418 return KDB_CMD_KGDB; 2355 return KDB_CMD_KGDB;
@@ -2430,11 +2367,15 @@ static int kdb_help(int argc, const char **argv)
2430 kdb_printf("-----------------------------" 2367 kdb_printf("-----------------------------"
2431 "-----------------------------\n"); 2368 "-----------------------------\n");
2432 for_each_kdbcmd(kt, i) { 2369 for_each_kdbcmd(kt, i) {
2433 if (kt->cmd_name) 2370 char *space = "";
2434 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2435 kt->cmd_usage, kt->cmd_help);
2436 if (KDB_FLAG(CMD_INTERRUPT)) 2371 if (KDB_FLAG(CMD_INTERRUPT))
2437 return 0; 2372 return 0;
2373 if (!kt->cmd_name)
2374 continue;
2375 if (strlen(kt->cmd_usage) > 20)
2376 space = "\n ";
2377 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
2378 kt->cmd_usage, space, kt->cmd_help);
2438 } 2379 }
2439 return 0; 2380 return 0;
2440} 2381}
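
The kdb_help() change stops the '%-20.20s' precision from silently truncating long usage strings; long entries now keep their usage text and push the help text onto an indented continuation line. A quick user-space comparison of the two formats (the continuation indent below is arbitrary):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *cmd = "ll";
        const char *usage = "<first-element> <linkoffset> <cmd>";
        const char *help = "Execute cmd for each element in linked list";
        const char *space = "";

        /* old: truncates usage to 20 characters */
        printf("%-15.15s %-20.20s %s\n", cmd, usage, help);

        /* new: keep usage intact, wrap help onto the next line if usage is long */
        if (strlen(usage) > 20)
            space = "\n                 ";
        printf("%-15.15s %-20s%s%s\n", cmd, usage, space, help);
        return 0;
    }
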
@@ -2739,7 +2680,7 @@ int kdb_register_repeat(char *cmd,
2739 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); 2680 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2740 kfree(kdb_commands); 2681 kfree(kdb_commands);
2741 } 2682 }
2742 memset(new + kdb_max_commands, 0, 2683 memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
2743 kdb_command_extend * sizeof(*new)); 2684 kdb_command_extend * sizeof(*new));
2744 kdb_commands = new; 2685 kdb_commands = new;
2745 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; 2686 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
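
The memset() fix above zeroes only the freshly added tail of the grown command table, measured from the correct offset into the new buffer rather than past its end. The same idea with realloc() in user space, with illustrative sizes:

    #include <stdlib.h>
    #include <string.h>

    struct entry { const char *name; };

    int main(void)
    {
        size_t old_n = 50, extra = 50;
        struct entry *tab = calloc(old_n, sizeof(*tab));
        struct entry *new_tab;

        if (!tab)
            return 1;
        new_tab = realloc(tab, (old_n + extra) * sizeof(*new_tab));
        if (!new_tab) {
            free(tab);
            return 1;
        }
        /* zero only the freshly added region, starting at old_n */
        memset(new_tab + old_n, 0, extra * sizeof(*new_tab));
        free(new_tab);
        return 0;
    }
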
@@ -2843,15 +2784,13 @@ static void __init kdb_inittab(void)
2843 "Stack traceback", 1, KDB_REPEAT_NONE); 2784 "Stack traceback", 1, KDB_REPEAT_NONE);
2844 kdb_register_repeat("btp", kdb_bt, "<pid>", 2785 kdb_register_repeat("btp", kdb_bt, "<pid>",
2845 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2786 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2846 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", 2787 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2847 "Display stack all processes", 0, KDB_REPEAT_NONE); 2788 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
2848 kdb_register_repeat("btc", kdb_bt, "", 2789 kdb_register_repeat("btc", kdb_bt, "",
2849 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2790 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2850 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2791 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2851 "Backtrace process given its struct task address", 0, 2792 "Backtrace process given its struct task address", 0,
2852 KDB_REPEAT_NONE); 2793 KDB_REPEAT_NONE);
2853 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2854 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2855 kdb_register_repeat("env", kdb_env, "", 2794 kdb_register_repeat("env", kdb_env, "",
2856 "Show environment variables", 0, KDB_REPEAT_NONE); 2795 "Show environment variables", 0, KDB_REPEAT_NONE);
2857 kdb_register_repeat("set", kdb_set, "", 2796 kdb_register_repeat("set", kdb_set, "",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a25844..7afd3c8c41d5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
19#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
20#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 22#define KDB_CMD_KGDB (-1005)
24 23
25/* Internal debug flags */ 24/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
125 * kdb control */ 124 * kdb control */
126#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ 125#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
127#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ 126#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
128#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
129 * DOING_SS is also set */
130#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint 127#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
131 * after one ss, independent of 128 * after one ss, independent of
132 * DOING_SS */ 129 * DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
191typedef enum { 188typedef enum {
192 KDB_DB_BPT, /* Breakpoint */ 189 KDB_DB_BPT, /* Breakpoint */
193 KDB_DB_SS, /* Single-step trap */ 190 KDB_DB_SS, /* Single-step trap */
194 KDB_DB_SSB, /* Single step to branch */
195 KDB_DB_SSBPT, /* Single step over breakpoint */ 191 KDB_DB_SSBPT, /* Single step over breakpoint */
196 KDB_DB_NOBPT /* Spurious breakpoint */ 192 KDB_DB_NOBPT /* Spurious breakpoint */
197} kdb_dbtrap_t; 193} kdb_dbtrap_t;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
106 unsigned long long t2, t3; 106 unsigned long long t2, t3;
107 unsigned long flags; 107 unsigned long flags;
108 struct timespec ts; 108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled;
109 110
110 /* Though tsk->delays accessed later, early exit avoids 111 /* Though tsk->delays accessed later, early exit avoids
111 * unnecessary returning of other data 112 * unnecessary returning of other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
114 goto done; 115 goto done;
115 116
116 tmp = (s64)d->cpu_run_real_total; 117 tmp = (s64)d->cpu_run_real_total;
117 cputime_to_timespec(tsk->utime + tsk->stime, &ts); 118 task_cputime(tsk, &utime, &stime);
119 cputime_to_timespec(utime + stime, &ts);
118 tmp += timespec_to_ns(&ts); 120 tmp += timespec_to_ns(&ts);
119 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 121 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
120 122
121 tmp = (s64)d->cpu_scaled_run_real_total; 123 tmp = (s64)d->cpu_scaled_run_real_total;
122 cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); 124 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
125 cputime_to_timespec(utimescaled + stimescaled, &ts);
123 tmp += timespec_to_ns(&ts); 126 tmp += timespec_to_ns(&ts);
124 d->cpu_scaled_run_real_total = 127 d->cpu_scaled_run_real_total =
125 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 128 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7b6646a8c067..b0cd86501c30 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3691,7 +3691,7 @@ unlock:
3691 3691
3692static int perf_fasync(int fd, struct file *filp, int on) 3692static int perf_fasync(int fd, struct file *filp, int on)
3693{ 3693{
3694 struct inode *inode = filp->f_path.dentry->d_inode; 3694 struct inode *inode = file_inode(filp);
3695 struct perf_event *event = filp->private_data; 3695 struct perf_event *event = filp->private_data;
3696 int retval; 3696 int retval;
3697 3697
@@ -5126,7 +5126,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5126{ 5126{
5127 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5127 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5128 struct perf_event *event; 5128 struct perf_event *event;
5129 struct hlist_node *node;
5130 struct hlist_head *head; 5129 struct hlist_head *head;
5131 5130
5132 rcu_read_lock(); 5131 rcu_read_lock();
@@ -5134,7 +5133,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5134 if (!head) 5133 if (!head)
5135 goto end; 5134 goto end;
5136 5135
5137 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5136 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5138 if (perf_swevent_match(event, type, event_id, data, regs)) 5137 if (perf_swevent_match(event, type, event_id, data, regs))
5139 perf_swevent_event(event, nr, data, regs); 5138 perf_swevent_event(event, nr, data, regs);
5140 } 5139 }
@@ -5419,7 +5418,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5419{ 5418{
5420 struct perf_sample_data data; 5419 struct perf_sample_data data;
5421 struct perf_event *event; 5420 struct perf_event *event;
5422 struct hlist_node *node;
5423 5421
5424 struct perf_raw_record raw = { 5422 struct perf_raw_record raw = {
5425 .size = entry_size, 5423 .size = entry_size,
@@ -5429,7 +5427,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5429 perf_sample_data_init(&data, addr, 0); 5427 perf_sample_data_init(&data, addr, 0);
5430 data.raw = &raw; 5428 data.raw = &raw;
5431 5429
5432 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5430 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5433 if (perf_tp_event_match(event, &data, regs)) 5431 if (perf_tp_event_match(event, &data, regs))
5434 perf_swevent_event(event, count, &data, regs); 5432 perf_swevent_event(event, count, &data, regs);
5435 } 5433 }
@@ -5965,13 +5963,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
5965 pmu->name = name; 5963 pmu->name = name;
5966 5964
5967 if (type < 0) { 5965 if (type < 0) {
5968 int err = idr_pre_get(&pmu_idr, GFP_KERNEL); 5966 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
5969 if (!err) 5967 if (type < 0) {
5970 goto free_pdc; 5968 ret = type;
5971
5972 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5973 if (err) {
5974 ret = err;
5975 goto free_pdc; 5969 goto free_pdc;
5976 } 5970 }
5977 } 5971 }
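
This hunk converts the pmu type allocation from the old idr_pre_get()/idr_get_new_above() pair to idr_alloc(), which either returns the newly allocated id or a negative errno. A hedged kernel-style sketch of the new call is below; my_idr and my_assign_id() are placeholders, not from the patch.

    #include <linux/idr.h>
    #include <linux/gfp.h>

    static DEFINE_IDR(my_idr);

    static int my_assign_id(void *my_ptr)
    {
        /* smallest free id >= 1; end of 0 means "no upper limit" */
        return idr_alloc(&my_idr, my_ptr, 1, 0, GFP_KERNEL);
    }
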
@@ -6171,11 +6165,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6171 6165
6172 if (task) { 6166 if (task) {
6173 event->attach_state = PERF_ATTACH_TASK; 6167 event->attach_state = PERF_ATTACH_TASK;
6168
6169 if (attr->type == PERF_TYPE_TRACEPOINT)
6170 event->hw.tp_target = task;
6174#ifdef CONFIG_HAVE_HW_BREAKPOINT 6171#ifdef CONFIG_HAVE_HW_BREAKPOINT
6175 /* 6172 /*
6176 * hw_breakpoint is a bit difficult here.. 6173 * hw_breakpoint is a bit difficult here..
6177 */ 6174 */
6178 if (attr->type == PERF_TYPE_BREAKPOINT) 6175 else if (attr->type == PERF_TYPE_BREAKPOINT)
6179 event->hw.bp_target = task; 6176 event->hw.bp_target = task;
6180#endif 6177#endif
6181 } 6178 }
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507ed..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
676 err_alloc: 676 err_alloc:
677 for_each_possible_cpu(err_cpu) { 677 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++) 678 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
680 if (err_cpu == cpu) 680 if (err_cpu == cpu)
681 break; 681 break;
682 } 682 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acfbb071..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> /* read_mapping_page */ 27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/export.h>
30#include <linux/rmap.h> /* anon_vma_prepare */ 31#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */ 32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */ 33#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
41#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 42#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
42 43
43static struct rb_root uprobes_tree = RB_ROOT; 44static struct rb_root uprobes_tree = RB_ROOT;
44
45static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46
47#define UPROBES_HASH_SZ 13
48
49/* 45/*
50 * We need separate register/unregister and mmap/munmap lock hashes because 46 * allows us to skip the uprobe_mmap if there are no uprobe events active
51 * of mmap_sem nesting. 47 * at this time. Probably a fine grained per inode count is better?
52 *
53 * uprobe_register() needs to install probes on (potentially) all processes
54 * and thus needs to acquire multiple mmap_sems (consequtively, not
55 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
56 * for the particular process doing the mmap.
57 *
58 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
59 * because of lock order against i_mmap_mutex. This means there's a hole in
60 * the register vma iteration where a mmap() can happen.
61 *
62 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
63 * install a probe where one is already installed.
64 */ 48 */
49#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
65 50
66/* serialize (un)register */ 51static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
67static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
68
69#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
70 52
53#define UPROBES_HASH_SZ 13
71/* serialize uprobe->pending_list */ 54/* serialize uprobe->pending_list */
72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 55static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 56#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
74 57
75static struct percpu_rw_semaphore dup_mmap_sem; 58static struct percpu_rw_semaphore dup_mmap_sem;
76 59
77/*
78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
79 * events active at this time. Probably a fine grained per inode count is
80 * better?
81 */
82static atomic_t uprobe_events = ATOMIC_INIT(0);
83
84/* Have a copy of original instruction */ 60/* Have a copy of original instruction */
85#define UPROBE_COPY_INSN 0 61#define UPROBE_COPY_INSN 0
86/* Dont run handlers when first register/ last unregister in progress*/
87#define UPROBE_RUN_HANDLER 1
88/* Can skip singlestep */ 62/* Can skip singlestep */
89#define UPROBE_SKIP_SSTEP 2 63#define UPROBE_SKIP_SSTEP 1
90 64
91struct uprobe { 65struct uprobe {
92 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
93 atomic_t ref; 67 atomic_t ref;
68 struct rw_semaphore register_rwsem;
94 struct rw_semaphore consumer_rwsem; 69 struct rw_semaphore consumer_rwsem;
95 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
96 struct list_head pending_list; 70 struct list_head pending_list;
97 struct uprobe_consumer *consumers; 71 struct uprobe_consumer *consumers;
98 struct inode *inode; /* Also hold a ref to inode */ 72 struct inode *inode; /* Also hold a ref to inode */
@@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
430 u = __insert_uprobe(uprobe); 404 u = __insert_uprobe(uprobe);
431 spin_unlock(&uprobes_treelock); 405 spin_unlock(&uprobes_treelock);
432 406
433 /* For now assume that the instruction need not be single-stepped */
434 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
435
436 return u; 407 return u;
437} 408}
438 409
@@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
452 423
453 uprobe->inode = igrab(inode); 424 uprobe->inode = igrab(inode);
454 uprobe->offset = offset; 425 uprobe->offset = offset;
426 init_rwsem(&uprobe->register_rwsem);
455 init_rwsem(&uprobe->consumer_rwsem); 427 init_rwsem(&uprobe->consumer_rwsem);
456 mutex_init(&uprobe->copy_mutex); 428 /* For now assume that the instruction need not be single-stepped */
429 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
457 430
458 /* add to uprobes_tree, sorted on inode:offset */ 431 /* add to uprobes_tree, sorted on inode:offset */
459 cur_uprobe = insert_uprobe(uprobe); 432 cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
463 kfree(uprobe); 436 kfree(uprobe);
464 uprobe = cur_uprobe; 437 uprobe = cur_uprobe;
465 iput(inode); 438 iput(inode);
466 } else {
467 atomic_inc(&uprobe_events);
468 } 439 }
469 440
470 return uprobe; 441 return uprobe;
471} 442}
472 443
473static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) 444static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
474{
475 struct uprobe_consumer *uc;
476
477 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
478 return;
479
480 down_read(&uprobe->consumer_rwsem);
481 for (uc = uprobe->consumers; uc; uc = uc->next) {
482 if (!uc->filter || uc->filter(uc, current))
483 uc->handler(uc, regs);
484 }
485 up_read(&uprobe->consumer_rwsem);
486}
487
488/* Returns the previous consumer */
489static struct uprobe_consumer *
490consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
491{ 445{
492 down_write(&uprobe->consumer_rwsem); 446 down_write(&uprobe->consumer_rwsem);
493 uc->next = uprobe->consumers; 447 uc->next = uprobe->consumers;
494 uprobe->consumers = uc; 448 uprobe->consumers = uc;
495 up_write(&uprobe->consumer_rwsem); 449 up_write(&uprobe->consumer_rwsem);
496
497 return uc->next;
498} 450}
499 451
500/* 452/*
@@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
588 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 540 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
589 return ret; 541 return ret;
590 542
591 mutex_lock(&uprobe->copy_mutex); 543 /* TODO: move this into _register, until then we abuse this sem. */
544 down_write(&uprobe->consumer_rwsem);
592 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 545 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
593 goto out; 546 goto out;
594 547
@@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
612 set_bit(UPROBE_COPY_INSN, &uprobe->flags); 565 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
613 566
614 out: 567 out:
615 mutex_unlock(&uprobe->copy_mutex); 568 up_write(&uprobe->consumer_rwsem);
569
570 return ret;
571}
572
573static inline bool consumer_filter(struct uprobe_consumer *uc,
574 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
575{
576 return !uc->filter || uc->filter(uc, ctx, mm);
577}
578
579static bool filter_chain(struct uprobe *uprobe,
580 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
581{
582 struct uprobe_consumer *uc;
583 bool ret = false;
584
585 down_read(&uprobe->consumer_rwsem);
586 for (uc = uprobe->consumers; uc; uc = uc->next) {
587 ret = consumer_filter(uc, ctx, mm);
588 if (ret)
589 break;
590 }
591 up_read(&uprobe->consumer_rwsem);
616 592
617 return ret; 593 return ret;
618} 594}
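
consumer_filter() above treats a NULL ->filter callback as "always interested", and filter_chain() returns true as soon as any consumer in the list matches. The same logic over a plain singly linked list in user-space C, with invented types:

    #include <stdbool.h>
    #include <stddef.h>

    struct consumer {
        bool (*filter)(struct consumer *c, void *ctx);
        struct consumer *next;
    };

    static bool consumer_filter(struct consumer *c, void *ctx)
    {
        return !c->filter || c->filter(c, ctx);    /* no filter == match all */
    }

    /* true if at least one consumer in the chain wants @ctx */
    static bool filter_chain(struct consumer *head, void *ctx)
    {
        struct consumer *c;

        for (c = head; c; c = c->next)
            if (consumer_filter(c, ctx))
                return true;
        return false;
    }

    int main(void)
    {
        struct consumer any = { .filter = NULL, .next = NULL };

        return filter_chain(&any, NULL) ? 0 : 1;    /* matches: exit 0 */
    }
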
@@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
624 bool first_uprobe; 600 bool first_uprobe;
625 int ret; 601 int ret;
626 602
627 /*
628 * If probe is being deleted, unregister thread could be done with
629 * the vma-rmap-walk through. Adding a probe now can be fatal since
630 * nobody will be able to cleanup. Also we could be from fork or
631 * mremap path, where the probe might have already been inserted.
632 * Hence behave as if probe already existed.
633 */
634 if (!uprobe->consumers)
635 return 0;
636
637 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); 603 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
638 if (ret) 604 if (ret)
639 return ret; 605 return ret;
@@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
658static int 624static int
659remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 625remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
660{ 626{
661 /* can happen if uprobe_register() fails */
662 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
663 return 0;
664
665 set_bit(MMF_RECALC_UPROBES, &mm->flags); 627 set_bit(MMF_RECALC_UPROBES, &mm->flags);
666 return set_orig_insn(&uprobe->arch, mm, vaddr); 628 return set_orig_insn(&uprobe->arch, mm, vaddr);
667} 629}
668 630
631static inline bool uprobe_is_active(struct uprobe *uprobe)
632{
633 return !RB_EMPTY_NODE(&uprobe->rb_node);
634}
669/* 635/*
670 * There could be threads that have already hit the breakpoint. They 636 * There could be threads that have already hit the breakpoint. They
671 * will recheck the current insn and restart if find_uprobe() fails. 637 * will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
673 */ 639 */
674static void delete_uprobe(struct uprobe *uprobe) 640static void delete_uprobe(struct uprobe *uprobe)
675{ 641{
642 if (WARN_ON(!uprobe_is_active(uprobe)))
643 return;
644
676 spin_lock(&uprobes_treelock); 645 spin_lock(&uprobes_treelock);
677 rb_erase(&uprobe->rb_node, &uprobes_tree); 646 rb_erase(&uprobe->rb_node, &uprobes_tree);
678 spin_unlock(&uprobes_treelock); 647 spin_unlock(&uprobes_treelock);
648 RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
679 iput(uprobe->inode); 649 iput(uprobe->inode);
680 put_uprobe(uprobe); 650 put_uprobe(uprobe);
681 atomic_dec(&uprobe_events);
682} 651}
683 652
684struct map_info { 653struct map_info {
@@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
764 return curr; 733 return curr;
765} 734}
766 735
767static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 736static int
737register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
768{ 738{
739 bool is_register = !!new;
769 struct map_info *info; 740 struct map_info *info;
770 int err = 0; 741 int err = 0;
771 742
@@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
794 vaddr_to_offset(vma, info->vaddr) != uprobe->offset) 765 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
795 goto unlock; 766 goto unlock;
796 767
797 if (is_register) 768 if (is_register) {
798 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 769 /* consult only the "caller", new consumer. */
799 else 770 if (consumer_filter(new,
800 err |= remove_breakpoint(uprobe, mm, info->vaddr); 771 UPROBE_FILTER_REGISTER, mm))
772 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
773 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
774 if (!filter_chain(uprobe,
775 UPROBE_FILTER_UNREGISTER, mm))
776 err |= remove_breakpoint(uprobe, mm, info->vaddr);
777 }
801 778
802 unlock: 779 unlock:
803 up_write(&mm->mmap_sem); 780 up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
810 return err; 787 return err;
811} 788}
812 789
813static int __uprobe_register(struct uprobe *uprobe) 790static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
814{ 791{
815 return register_for_each_vma(uprobe, true); 792 consumer_add(uprobe, uc);
793 return register_for_each_vma(uprobe, uc);
816} 794}
817 795
818static void __uprobe_unregister(struct uprobe *uprobe) 796static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
819{ 797{
820 if (!register_for_each_vma(uprobe, false)) 798 int err;
821 delete_uprobe(uprobe); 799
800 if (!consumer_del(uprobe, uc)) /* WARN? */
801 return;
822 802
803 err = register_for_each_vma(uprobe, NULL);
823 /* TODO : can't unregister? schedule a worker thread */ 804
805 if (!uprobe->consumers && !err)
806 delete_uprobe(uprobe);
824} 807}
825 808
826/* 809/*
@@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
845 struct uprobe *uprobe; 828 struct uprobe *uprobe;
846 int ret; 829 int ret;
847 830
848 if (!inode || !uc || uc->next) 831 /* Racy, just to catch the obvious mistakes */
849 return -EINVAL;
850
851 if (offset > i_size_read(inode)) 832 if (offset > i_size_read(inode))
852 return -EINVAL; 833 return -EINVAL;
853 834
854 ret = 0; 835 retry:
855 mutex_lock(uprobes_hash(inode));
856 uprobe = alloc_uprobe(inode, offset); 836 uprobe = alloc_uprobe(inode, offset);
857 837 if (!uprobe)
858 if (!uprobe) { 838 return -ENOMEM;
859 ret = -ENOMEM; 839 /*
860 } else if (!consumer_add(uprobe, uc)) { 840 * We can race with uprobe_unregister()->delete_uprobe().
861 ret = __uprobe_register(uprobe); 841 * Check uprobe_is_active() and retry if it is false.
862 if (ret) { 842 */
863 uprobe->consumers = NULL; 843 down_write(&uprobe->register_rwsem);
864 __uprobe_unregister(uprobe); 844 ret = -EAGAIN;
865 } else { 845 if (likely(uprobe_is_active(uprobe))) {
866 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 846 ret = __uprobe_register(uprobe, uc);
867 } 847 if (ret)
848 __uprobe_unregister(uprobe, uc);
868 } 849 }
850 up_write(&uprobe->register_rwsem);
851 put_uprobe(uprobe);
869 852
870 mutex_unlock(uprobes_hash(inode)); 853 if (unlikely(ret == -EAGAIN))
871 if (uprobe) 854 goto retry;
872 put_uprobe(uprobe); 855 return ret;
856}
857EXPORT_SYMBOL_GPL(uprobe_register);
858
859/*
860 * uprobe_apply - add or remove the breakpoints for an already registered probe.
861 * @inode: the file in which the probe is installed.
862 * @offset: offset from the start of the file.
863 * @uc: consumer which wants to add more or remove some breakpoints
864 * @add: add or remove the breakpoints
865 */
866int uprobe_apply(struct inode *inode, loff_t offset,
867 struct uprobe_consumer *uc, bool add)
868{
869 struct uprobe *uprobe;
870 struct uprobe_consumer *con;
871 int ret = -ENOENT;
872
873 uprobe = find_uprobe(inode, offset);
874 if (!uprobe)
875 return ret;
876
877 down_write(&uprobe->register_rwsem);
878 for (con = uprobe->consumers; con && con != uc ; con = con->next)
879 ;
880 if (con)
881 ret = register_for_each_vma(uprobe, add ? uc : NULL);
882 up_write(&uprobe->register_rwsem);
883 put_uprobe(uprobe);
873 884
874 return ret; 885 return ret;
875} 886}
@@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
884{ 895{
885 struct uprobe *uprobe; 896 struct uprobe *uprobe;
886 897
887 if (!inode || !uc)
888 return;
889
890 uprobe = find_uprobe(inode, offset); 898 uprobe = find_uprobe(inode, offset);
891 if (!uprobe) 899 if (!uprobe)
892 return; 900 return;
893 901
894 mutex_lock(uprobes_hash(inode)); 902 down_write(&uprobe->register_rwsem);
903 __uprobe_unregister(uprobe, uc);
904 up_write(&uprobe->register_rwsem);
905 put_uprobe(uprobe);
906}
907EXPORT_SYMBOL_GPL(uprobe_unregister);
895 908
896 if (consumer_del(uprobe, uc)) { 909static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
897 if (!uprobe->consumers) { 910{
898 __uprobe_unregister(uprobe); 911 struct vm_area_struct *vma;
899 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 912 int err = 0;
900 } 913
914 down_read(&mm->mmap_sem);
915 for (vma = mm->mmap; vma; vma = vma->vm_next) {
916 unsigned long vaddr;
917 loff_t offset;
918
919 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode)
921 continue;
922
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
924 if (uprobe->offset < offset ||
925 uprobe->offset >= offset + vma->vm_end - vma->vm_start)
926 continue;
927
928 vaddr = offset_to_vaddr(vma, uprobe->offset);
929 err |= remove_breakpoint(uprobe, mm, vaddr);
901 } 930 }
931 up_read(&mm->mmap_sem);
902 932
903 mutex_unlock(uprobes_hash(inode)); 933 return err;
904 if (uprobe)
905 put_uprobe(uprobe);
906} 934}
907 935
908static struct rb_node * 936static struct rb_node *
@@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
979 struct uprobe *uprobe, *u; 1007 struct uprobe *uprobe, *u;
980 struct inode *inode; 1008 struct inode *inode;
981 1009
982 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) 1010 if (no_uprobe_events() || !valid_vma(vma, true))
983 return 0; 1011 return 0;
984 1012
985 inode = vma->vm_file->f_mapping->host; 1013 inode = vma->vm_file->f_mapping->host;
@@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
988 1016
989 mutex_lock(uprobes_mmap_hash(inode)); 1017 mutex_lock(uprobes_mmap_hash(inode));
990 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); 1018 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
991 1019 /*
1020 * We can race with uprobe_unregister(), this uprobe can be already
1021 * removed. But in this case filter_chain() must return false, all
1022 * consumers have gone away.
1023 */
992 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1024 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
993 if (!fatal_signal_pending(current)) { 1025 if (!fatal_signal_pending(current) &&
1026 filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
994 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 1027 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
995 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1028 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
996 } 1029 }
@@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1025 */ 1058 */
1026void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1059void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1027{ 1060{
1028 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1061 if (no_uprobe_events() || !valid_vma(vma, false))
1029 return; 1062 return;
1030 1063
1031 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ 1064 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1042/* Slot allocation for XOL */ 1075/* Slot allocation for XOL */
1043static int xol_add_vma(struct xol_area *area) 1076static int xol_add_vma(struct xol_area *area)
1044{ 1077{
1045 struct mm_struct *mm; 1078 struct mm_struct *mm = current->mm;
1046 int ret; 1079 int ret = -EALREADY;
1047
1048 area->page = alloc_page(GFP_HIGHUSER);
1049 if (!area->page)
1050 return -ENOMEM;
1051
1052 ret = -EALREADY;
1053 mm = current->mm;
1054 1080
1055 down_write(&mm->mmap_sem); 1081 down_write(&mm->mmap_sem);
1056 if (mm->uprobes_state.xol_area) 1082 if (mm->uprobes_state.xol_area)
1057 goto fail; 1083 goto fail;
1058 1084
1059 ret = -ENOMEM; 1085 ret = -ENOMEM;
1060
1061 /* Try to map as high as possible, this is only a hint. */ 1086 /* Try to map as high as possible, this is only a hint. */
1062 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1087 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1063 if (area->vaddr & ~PAGE_MASK) { 1088 if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
1073 smp_wmb(); /* pairs with get_xol_area() */ 1098 smp_wmb(); /* pairs with get_xol_area() */
1074 mm->uprobes_state.xol_area = area; 1099 mm->uprobes_state.xol_area = area;
1075 ret = 0; 1100 ret = 0;
1076 1101 fail:
1077fail:
1078 up_write(&mm->mmap_sem); 1102 up_write(&mm->mmap_sem);
1079 if (ret)
1080 __free_page(area->page);
1081 1103
1082 return ret; 1104 return ret;
1083} 1105}
1084 1106
1085static struct xol_area *get_xol_area(struct mm_struct *mm)
1086{
1087 struct xol_area *area;
1088
1089 area = mm->uprobes_state.xol_area;
1090 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1091
1092 return area;
1093}
1094
1095/* 1107/*
1096 * xol_alloc_area - Allocate process's xol_area. 1108 * get_xol_area - Allocate process's xol_area if necessary.
1097 * This area will be used for storing instructions for execution out of 1109 * This area will be used for storing instructions for execution out of line.
1098 * line.
1099 * 1110 *
1100 * Returns the allocated area or NULL. 1111 * Returns the allocated area or NULL.
1101 */ 1112 */
1102static struct xol_area *xol_alloc_area(void) 1113static struct xol_area *get_xol_area(void)
1103{ 1114{
1115 struct mm_struct *mm = current->mm;
1104 struct xol_area *area; 1116 struct xol_area *area;
1105 1117
1118 area = mm->uprobes_state.xol_area;
1119 if (area)
1120 goto ret;
1121
1106 area = kzalloc(sizeof(*area), GFP_KERNEL); 1122 area = kzalloc(sizeof(*area), GFP_KERNEL);
1107 if (unlikely(!area)) 1123 if (unlikely(!area))
1108 return NULL; 1124 goto out;
1109 1125
1110 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); 1126 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1111
1112 if (!area->bitmap) 1127 if (!area->bitmap)
1113 goto fail; 1128 goto free_area;
1129
1130 area->page = alloc_page(GFP_HIGHUSER);
1131 if (!area->page)
1132 goto free_bitmap;
1114 1133
1115 init_waitqueue_head(&area->wq); 1134 init_waitqueue_head(&area->wq);
1116 if (!xol_add_vma(area)) 1135 if (!xol_add_vma(area))
1117 return area; 1136 return area;
1118 1137
1119fail: 1138 __free_page(area->page);
1139 free_bitmap:
1120 kfree(area->bitmap); 1140 kfree(area->bitmap);
1141 free_area:
1121 kfree(area); 1142 kfree(area);
1122 1143 out:
1123 return get_xol_area(current->mm); 1144 area = mm->uprobes_state.xol_area;
1145 ret:
1146 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1147 return area;
1124} 1148}
1125 1149
1126/* 1150/*
@@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1186} 1210}
1187 1211
1188/* 1212/*
1189 * xol_get_insn_slot - If was not allocated a slot, then 1213 * xol_get_insn_slot - allocate a slot for xol.
1190 * allocate a slot.
1191 * Returns the allocated slot address or 0. 1214 * Returns the allocated slot address or 0.
1192 */ 1215 */
1193static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) 1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1194{ 1217{
1195 struct xol_area *area; 1218 struct xol_area *area;
1196 unsigned long offset; 1219 unsigned long offset;
1220 unsigned long xol_vaddr;
1197 void *vaddr; 1221 void *vaddr;
1198 1222
1199 area = get_xol_area(current->mm); 1223 area = get_xol_area();
1200 if (!area) { 1224 if (!area)
1201 area = xol_alloc_area(); 1225 return 0;
1202 if (!area)
1203 return 0;
1204 }
1205 current->utask->xol_vaddr = xol_take_insn_slot(area);
1206 1226
1207 /* 1227 xol_vaddr = xol_take_insn_slot(area);
1208 * Initialize the slot if xol_vaddr points to valid 1228 if (unlikely(!xol_vaddr))
1209 * instruction slot.
1210 */
1211 if (unlikely(!current->utask->xol_vaddr))
1212 return 0; 1229 return 0;
1213 1230
1214 current->utask->vaddr = slot_addr; 1231 /* Initialize the slot */
1215 offset = current->utask->xol_vaddr & ~PAGE_MASK; 1232 offset = xol_vaddr & ~PAGE_MASK;
1216 vaddr = kmap_atomic(area->page); 1233 vaddr = kmap_atomic(area->page);
1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1218 kunmap_atomic(vaddr); 1235 kunmap_atomic(vaddr);
@@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
1222 */ 1239 */
1223 flush_dcache_page(area->page); 1240 flush_dcache_page(area->page);
1224 1241
1225 return current->utask->xol_vaddr; 1242 return xol_vaddr;
1226} 1243}
1227 1244
1228/* 1245/*
@@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1240 return; 1257 return;
1241 1258
1242 slot_addr = tsk->utask->xol_vaddr; 1259 slot_addr = tsk->utask->xol_vaddr;
1243 1260 if (unlikely(!slot_addr))
1244 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1245 return; 1261 return;
1246 1262
1247 area = tsk->mm->uprobes_state.xol_area; 1263 area = tsk->mm->uprobes_state.xol_area;
@@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
1303} 1319}
1304 1320
1305/* 1321/*
1306 * Allocate a uprobe_task object for the task. 1322 * Allocate a uprobe_task object for the task if necessary.
1307 * Called when the thread hits a breakpoint for the first time. 1323 * Called when the thread hits a breakpoint.
1308 * 1324 *
1309 * Returns: 1325 * Returns:
1310 * - pointer to new uprobe_task on success 1326 * - pointer to new uprobe_task on success
1311 * - NULL otherwise 1327 * - NULL otherwise
1312 */ 1328 */
1313static struct uprobe_task *add_utask(void) 1329static struct uprobe_task *get_utask(void)
1314{ 1330{
1315 struct uprobe_task *utask; 1331 if (!current->utask)
1316 1332 current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1317 utask = kzalloc(sizeof *utask, GFP_KERNEL); 1333 return current->utask;
1318 if (unlikely(!utask))
1319 return NULL;
1320
1321 current->utask = utask;
1322 return utask;
1323} 1334}
1324 1335
1325/* Prepare to single-step probed instruction out of line. */ 1336/* Prepare to single-step probed instruction out of line. */
1326static int 1337static int
1327pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) 1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1328{ 1339{
1329 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) 1340 struct uprobe_task *utask;
1330 return 0; 1341 unsigned long xol_vaddr;
1342 int err;
1343
1344 utask = get_utask();
1345 if (!utask)
1346 return -ENOMEM;
1347
1348 xol_vaddr = xol_get_insn_slot(uprobe);
1349 if (!xol_vaddr)
1350 return -ENOMEM;
1351
1352 utask->xol_vaddr = xol_vaddr;
1353 utask->vaddr = bp_vaddr;
1354
1355 err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1356 if (unlikely(err)) {
1357 xol_free_insn_slot(current);
1358 return err;
1359 }
1331 1360
1332 return -EFAULT; 1361 utask->active_uprobe = uprobe;
1362 utask->state = UTASK_SSTEP;
1363 return 0;
1333} 1364}
1334 1365
1335/* 1366/*
@@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1391 * This is not strictly accurate, we can race with 1422 * This is not strictly accurate, we can race with
1392 * uprobe_unregister() and see the already removed 1423 * uprobe_unregister() and see the already removed
1393 * uprobe if delete_uprobe() was not yet called. 1424 * uprobe if delete_uprobe() was not yet called.
1425 * Or this uprobe can be filtered out.
1394 */ 1426 */
1395 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) 1427 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1396 return; 1428 return;
@@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1452 return uprobe; 1484 return uprobe;
1453} 1485}
1454 1486
1487static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{
1489 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE;
1491
1492 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs);
1495
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc;
1499 }
1500
1501 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm);
1504 }
1505 up_read(&uprobe->register_rwsem);
1506}
1507
1455/* 1508/*
1456 * Run handler and ask thread to singlestep. 1509 * Run handler and ask thread to singlestep.
1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1458 */ 1511 */
1459static void handle_swbp(struct pt_regs *regs) 1512static void handle_swbp(struct pt_regs *regs)
1460{ 1513{
1461 struct uprobe_task *utask;
1462 struct uprobe *uprobe; 1514 struct uprobe *uprobe;
1463 unsigned long bp_vaddr; 1515 unsigned long bp_vaddr;
1464 int uninitialized_var(is_swbp); 1516 int uninitialized_var(is_swbp);
@@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
1483 } 1535 }
1484 return; 1536 return;
1485 } 1537 }
1538
1539 /* change it in advance for ->handler() and restart */
1540 instruction_pointer_set(regs, bp_vaddr);
1541
1486 /* 1542 /*
1487 * TODO: move copy_insn/etc into _register and remove this hack. 1543 * TODO: move copy_insn/etc into _register and remove this hack.
1488 * After we hit the bp, _unregister + _register can install the 1544 * After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
1490 */ 1546 */
1491 smp_rmb(); /* pairs with wmb() in install_breakpoint() */ 1547 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1492 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1548 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1493 goto restart; 1549 goto out;
1494
1495 utask = current->utask;
1496 if (!utask) {
1497 utask = add_utask();
1498 /* Cannot allocate; re-execute the instruction. */
1499 if (!utask)
1500 goto restart;
1501 }
1502 1550
1503 handler_chain(uprobe, regs); 1551 handler_chain(uprobe, regs);
1504 if (can_skip_sstep(uprobe, regs)) 1552 if (can_skip_sstep(uprobe, regs))
1505 goto out; 1553 goto out;
1506 1554
1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1555 if (!pre_ssout(uprobe, regs, bp_vaddr))
1508 utask->active_uprobe = uprobe;
1509 utask->state = UTASK_SSTEP;
1510 return; 1556 return;
1511 }
1512 1557
1513restart: 1558 /* can_skip_sstep() succeeded, or restart if can't singlestep */
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519out: 1559out:
1520 put_uprobe(uprobe); 1560 put_uprobe(uprobe);
1521} 1561}
@@ -1609,10 +1649,8 @@ static int __init init_uprobes(void)
1609{ 1649{
1610 int i; 1650 int i;
1611 1651
1612 for (i = 0; i < UPROBES_HASH_SZ; i++) { 1652 for (i = 0; i < UPROBES_HASH_SZ; i++)
1613 mutex_init(&uprobes_mutex[i]);
1614 mutex_init(&uprobes_mmap_mutex[i]); 1653 mutex_init(&uprobes_mmap_mutex[i]);
1615 }
1616 1654
1617 if (percpu_init_rwsem(&dup_mmap_sem)) 1655 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM; 1656 return -ENOMEM;
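The uprobes hunks above rework the consumer API: uprobe_register()/uprobe_unregister() now take the consumer under uprobe->register_rwsem, per-consumer filters decide which mm gets a breakpoint, and both entry points are exported. As a rough illustration only (not part of this patch), a minimal in-kernel consumer could look like the sketch below; the probed path, the offset and every demo_* name are made-up placeholders.

	#include <linux/module.h>
	#include <linux/namei.h>
	#include <linux/fs.h>
	#include <linux/ptrace.h>
	#include <linux/uprobes.h>

	static struct inode *demo_inode;
	static loff_t demo_offset = 0x4f0;	/* placeholder: file offset of the probed insn */

	static int demo_handler(struct uprobe_consumer *self, struct pt_regs *regs)
	{
		pr_info("uprobe hit at ip=%lx\n", instruction_pointer(regs));
		return 0;	/* UPROBE_HANDLER_REMOVE would ask to unapply it for this mm */
	}

	static struct uprobe_consumer demo_consumer = {
		.handler = demo_handler,
		/* .filter is optional; NULL means the probe applies to every mm */
	};

	static int __init demo_init(void)
	{
		struct path path;
		int ret;

		ret = kern_path("/bin/true", LOOKUP_FOLLOW, &path);	/* placeholder target */
		if (ret)
			return ret;
		demo_inode = igrab(path.dentry->d_inode);
		path_put(&path);
		if (!demo_inode)
			return -ENOENT;

		ret = uprobe_register(demo_inode, demo_offset, &demo_consumer);
		if (ret)
			iput(demo_inode);
		return ret;
	}

	static void __exit demo_exit(void)
	{
		uprobe_unregister(demo_inode, demo_offset, &demo_consumer);
		iput(demo_inode);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");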
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df21937216..51e485ca9935 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h> 22#include <linux/fdtable.h>
23#include <linux/freezer.h>
23#include <linux/binfmts.h> 24#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h> 26#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
31#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 33#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 34#include <linux/delayacct.h>
34#include <linux/freezer.h>
35#include <linux/cgroup.h> 35#include <linux/cgroup.h>
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/signal.h> 37#include <linux/signal.h>
@@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
85 bool group_dead = thread_group_leader(tsk); 85 bool group_dead = thread_group_leader(tsk);
86 struct sighand_struct *sighand; 86 struct sighand_struct *sighand;
87 struct tty_struct *uninitialized_var(tty); 87 struct tty_struct *uninitialized_var(tty);
88 cputime_t utime, stime;
88 89
89 sighand = rcu_dereference_check(tsk->sighand, 90 sighand = rcu_dereference_check(tsk->sighand,
90 lockdep_tasklist_lock_is_held()); 91 lockdep_tasklist_lock_is_held());
@@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
123 * We won't ever get here for the group leader, since it 124 * We won't ever get here for the group leader, since it
124 * will have been the last reference on the signal_struct. 125 * will have been the last reference on the signal_struct.
125 */ 126 */
126 sig->utime += tsk->utime; 127 task_cputime(tsk, &utime, &stime);
127 sig->stime += tsk->stime; 128 sig->utime += utime;
128 sig->gtime += tsk->gtime; 129 sig->stime += stime;
130 sig->gtime += task_gtime(tsk);
129 sig->min_flt += tsk->min_flt; 131 sig->min_flt += tsk->min_flt;
130 sig->maj_flt += tsk->maj_flt; 132 sig->maj_flt += tsk->maj_flt;
131 sig->nvcsw += tsk->nvcsw; 133 sig->nvcsw += tsk->nvcsw;
@@ -483,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
483 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 485 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
484 if (!self.task) /* see coredump_finish() */ 486 if (!self.task) /* see coredump_finish() */
485 break; 487 break;
486 schedule(); 488 freezable_schedule();
487 } 489 }
488 __set_task_state(tsk, TASK_RUNNING); 490 __set_task_state(tsk, TASK_RUNNING);
489 down_read(&mm->mmap_sem); 491 down_read(&mm->mmap_sem);
@@ -833,7 +835,7 @@ void do_exit(long code)
833 /* 835 /*
834 * Make sure we are holding no locks: 836 * Make sure we are holding no locks:
835 */ 837 */
836 debug_check_no_locks_held(tsk); 838 debug_check_no_locks_held();
837 /* 839 /*
838 * We can do this unlocked here. The futex code uses this flag 840 * We can do this unlocked here. The futex code uses this flag
839 * just to verify whether the pi state cleanup has been done 841 * just to verify whether the pi state cleanup has been done
@@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1092 sig = p->signal; 1094 sig = p->signal;
1093 psig->cutime += tgutime + sig->cutime; 1095 psig->cutime += tgutime + sig->cutime;
1094 psig->cstime += tgstime + sig->cstime; 1096 psig->cstime += tgstime + sig->cstime;
1095 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1097 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1096 psig->cmin_flt += 1098 psig->cmin_flt +=
1097 p->min_flt + sig->min_flt + sig->cmin_flt; 1099 p->min_flt + sig->min_flt + sig->cmin_flt;
1098 psig->cmaj_flt += 1100 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index c535f33bbb9c..1766d324d5e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
413 tmp->vm_next = tmp->vm_prev = NULL; 413 tmp->vm_next = tmp->vm_prev = NULL;
414 file = tmp->vm_file; 414 file = tmp->vm_file;
415 if (file) { 415 if (file) {
416 struct inode *inode = file->f_path.dentry->d_inode; 416 struct inode *inode = file_inode(file);
417 struct address_space *mapping = file->f_mapping; 417 struct address_space *mapping = file->f_mapping;
418 418
419 get_file(file); 419 get_file(file);
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1142 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
1143 1143
1144 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1145 return ERR_PTR(-EINVAL);
1146
1144 /* 1147 /*
1145 * Thread groups must share signals as well, and detached threads 1148 * Thread groups must share signals as well, and detached threads
1146 * can only be started up within the thread group. 1149 * can only be started up within the thread group.
@@ -1233,6 +1236,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1234 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1235#endif 1238#endif
1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1240 seqlock_init(&p->vtime_seqlock);
1241 p->vtime_snap = 0;
1242 p->vtime_snap_whence = VTIME_SLEEPING;
1243#endif
1244
1236#if defined(SPLIT_RSS_COUNTING) 1245#if defined(SPLIT_RSS_COUNTING)
1237 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1246 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1238#endif 1247#endif
@@ -1801,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1801 * If unsharing a user namespace must also unshare the thread. 1810 * If unsharing a user namespace must also unshare the thread.
1802 */ 1811 */
1803 if (unshare_flags & CLONE_NEWUSER) 1812 if (unshare_flags & CLONE_NEWUSER)
1804 unshare_flags |= CLONE_THREAD; 1813 unshare_flags |= CLONE_THREAD | CLONE_FS;
1805 /* 1814 /*
1806 * If unsharing a pid namespace must also unshare the thread. 1815 * If unsharing a pid namespace must also unshare the thread.
1807 */ 1816 */
@@ -1855,10 +1864,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1855 exit_sem(current); 1864 exit_sem(current);
1856 } 1865 }
1857 1866
1858 if (new_nsproxy) { 1867 if (new_nsproxy)
1859 switch_task_namespaces(current, new_nsproxy); 1868 switch_task_namespaces(current, new_nsproxy);
1860 new_nsproxy = NULL;
1861 }
1862 1869
1863 task_lock(current); 1870 task_lock(current);
1864 1871
@@ -1888,9 +1895,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1888 } 1895 }
1889 } 1896 }
1890 1897
1891 if (new_nsproxy)
1892 put_nsproxy(new_nsproxy);
1893
1894bad_unshare_cleanup_cred: 1898bad_unshare_cleanup_cred:
1895 if (new_cred) 1899 if (new_cred)
1896 put_cred(new_cred); 1900 put_cred(new_cred);
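The copy_process()/unshare hunks above make CLONE_NEWUSER and CLONE_FS mutually exclusive for clone() and make unshare(CLONE_NEWUSER) imply CLONE_THREAD and CLONE_FS. A small userspace illustration of the unshare side, not part of the patch:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	int main(void)
	{
		/* After this change, unsharing the user namespace also unshares
		 * the fs_struct, so the caller no longer shares root/cwd with
		 * other tasks once the new userns exists. */
		if (unshare(CLONE_NEWUSER) == -1) {
			printf("unshare(CLONE_NEWUSER): %s\n", strerror(errno));
			return 1;
		}
		printf("entered a new user namespace (fs_struct unshared as well)\n");
		return 0;
	}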
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca003..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h>
63 64
64#include <asm/futex.h> 65#include <asm/futex.h>
65 66
@@ -222,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key)
222 * @rw: mapping needs to be read/write (values: VERIFY_READ, 223 * @rw: mapping needs to be read/write (values: VERIFY_READ,
223 * VERIFY_WRITE) 224 * VERIFY_WRITE)
224 * 225 *
225 * Returns a negative error code or 0 226 * Return: a negative error code or 0
227 *
226 * The key words are stored in *key on success. 228 * The key words are stored in *key on success.
227 * 229 *
228 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 230 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
229 * offset_within_page). For private mappings, it's (uaddr, current->mm). 231 * offset_within_page). For private mappings, it's (uaddr, current->mm).
230 * We can usually work out the index without swapping in the page. 232 * We can usually work out the index without swapping in the page.
231 * 233 *
@@ -704,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
704 * be "current" except in the case of requeue pi. 706 * be "current" except in the case of requeue pi.
705 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 707 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
706 * 708 *
707 * Returns: 709 * Return:
708 * 0 - ready to wait 710 * 0 - ready to wait;
709 * 1 - acquired the lock 711 * 1 - acquired the lock;
710 * <0 - error 712 * <0 - error
711 * 713 *
712 * The hb->lock and futex_key refs shall be held by the caller. 714 * The hb->lock and futex_key refs shall be held by the caller.
@@ -1190,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1190 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1192 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1191 * hb1 and hb2 must be held by the caller. 1193 * hb1 and hb2 must be held by the caller.
1192 * 1194 *
1193 * Returns: 1195 * Return:
1194 * 0 - failed to acquire the lock atomicly 1196 * 0 - failed to acquire the lock atomically;
1195 * 1 - acquired the lock 1197 * 1 - acquired the lock;
1196 * <0 - error 1198 * <0 - error
1197 */ 1199 */
1198static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1200static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1253,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1253 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1255 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1254 * uaddr2 atomically on behalf of the top waiter. 1256 * uaddr2 atomically on behalf of the top waiter.
1255 * 1257 *
1256 * Returns: 1258 * Return:
1257 * >=0 - on success, the number of tasks requeued or woken 1259 * >=0 - on success, the number of tasks requeued or woken;
1258 * <0 - on error 1260 * <0 - on error
1259 */ 1261 */
1260static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1262static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1535,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1535 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 1537 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1536 * be paired with exactly one earlier call to queue_me(). 1538 * be paired with exactly one earlier call to queue_me().
1537 * 1539 *
1538 * Returns: 1540 * Return:
1539 1 - if the futex_q was still queued (and we unqueued it) 1541 1 - if the futex_q was still queued (and we unqueued it);
1540 * 0 - if the futex_q was already removed by the waking thread 1542 * 0 - if the futex_q was already removed by the waking thread
1541 */ 1543 */
1542static int unqueue_me(struct futex_q *q) 1544static int unqueue_me(struct futex_q *q)
@@ -1706,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
1706 * the pi_state owner as well as handle race conditions that may allow us to 1708 * the pi_state owner as well as handle race conditions that may allow us to
1707 * acquire the lock. Must be called with the hb lock held. 1709 * acquire the lock. Must be called with the hb lock held.
1708 * 1710 *
1709 * Returns: 1711 * Return:
1710 * 1 - success, lock taken 1712 * 1 - success, lock taken;
1711 * 0 - success, lock not taken 1713 * 0 - success, lock not taken;
1712 * <0 - on error (-EFAULT) 1714 * <0 - on error (-EFAULT)
1713 */ 1715 */
1714static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 1716static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1823,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1823 * Return with the hb lock held and a q.key reference on success, and unlocked 1825 * Return with the hb lock held and a q.key reference on success, and unlocked
1824 * with no q.key reference on failure. 1826 * with no q.key reference on failure.
1825 * 1827 *
1826 * Returns: 1828 * Return:
1827 * 0 - uaddr contains val and hb has been locked 1829 * 0 - uaddr contains val and hb has been locked;
1828 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1830 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1829 */ 1831 */
1830static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1832static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2202,9 +2204,9 @@ pi_faulted:
2202 * the wakeup and return the appropriate error code to the caller. Must be 2204 * the wakeup and return the appropriate error code to the caller. Must be
2203 * called with the hb lock held. 2205 * called with the hb lock held.
2204 * 2206 *
2205 * Returns 2207 * Return:
2206 * 0 - no early wakeup detected 2208 * 0 = no early wakeup detected;
2207 * <0 - -ETIMEDOUT or -ERESTARTNOINTR 2209 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2208 */ 2210 */
2209static inline 2211static inline
2210int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2212int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2246,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2246 * @val: the expected value of uaddr 2248 * @val: the expected value of uaddr
2247 * @abs_time: absolute timeout 2249 * @abs_time: absolute timeout
2248 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2250 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2249 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2250 * @uaddr2: the pi futex we will take prior to returning to user-space 2251 * @uaddr2: the pi futex we will take prior to returning to user-space
2251 * 2252 *
2252 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2257,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2257 * there was a need to. 2258 * there was a need to.
2258 * 2259 *
2259 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2260 * via the following: 2261 * via the following--
2261 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2262 * 2) wakeup on uaddr2 after a requeue 2263 * 2) wakeup on uaddr2 after a requeue
2263 * 3) signal 2264 * 3) signal
@@ -2275,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2275 * 2276 *
2276 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2277 * 2278 *
2278 * Returns: 2279 * Return:
2279 * 0 - On success 2280 * 0 - On success;
2280 * <0 - On error 2281 * <0 - On error
2281 */ 2282 */
2282static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2471 if (!futex_cmpxchg_enabled) 2472 if (!futex_cmpxchg_enabled)
2472 return -ENOSYS; 2473 return -ENOSYS;
2473 2474
2474 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2475
2476 rcu_read_lock(); 2475 rcu_read_lock();
2477 2476
2478 ret = -ESRCH; 2477 ret = -ESRCH;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005fc..f9f44fd4d34d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/syscalls.h>
14 15
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
116 } 117 }
117} 118}
118 119
119asmlinkage long 120COMPAT_SYSCALL_DEFINE2(set_robust_list,
120compat_sys_set_robust_list(struct compat_robust_list_head __user *head, 121 struct compat_robust_list_head __user *, head,
121 compat_size_t len) 122 compat_size_t, len)
122{ 123{
123 if (!futex_cmpxchg_enabled) 124 if (!futex_cmpxchg_enabled)
124 return -ENOSYS; 125 return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
131 return 0; 132 return 0;
132} 133}
133 134
134asmlinkage long 135COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
135compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, 136 compat_uptr_t __user *, head_ptr,
136 compat_size_t __user *len_ptr) 137 compat_size_t __user *, len_ptr)
137{ 138{
138 struct compat_robust_list_head __user *head; 139 struct compat_robust_list_head __user *head;
139 unsigned long ret; 140 unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
142 if (!futex_cmpxchg_enabled) 143 if (!futex_cmpxchg_enabled)
143 return -ENOSYS; 144 return -ENOSYS;
144 145
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock(); 146 rcu_read_lock();
148 147
149 ret = -ESRCH; 148 ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
172 return ret; 171 return ret;
173} 172}
174 173
175asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, 174COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
176 struct compat_timespec __user *utime, u32 __user *uaddr2, 175 struct compat_timespec __user *, utime, u32 __user *, uaddr2,
177 u32 val3) 176 u32, val3)
178{ 177{
179 struct timespec ts; 178 struct timespec ts;
180 ktime_t t, *tp = NULL; 179 ktime_t t, *tp = NULL;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..d4da55d1fb65 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
39 default n 39 default n
40 ---help--- 40 ---help---
41 This option activates profiling for the entire kernel. 41 This option activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..cc47812d3feb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h>
47#include <linux/timer.h> 49#include <linux/timer.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
@@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
640 * and expiry check is done in the hrtimer_interrupt or in the softirq. 642 * and expiry check is done in the hrtimer_interrupt or in the softirq.
641 */ 643 */
642static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 644static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
643 struct hrtimer_clock_base *base, 645 struct hrtimer_clock_base *base)
644 int wakeup)
645{ 646{
646 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 647 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
647 if (wakeup) {
648 raw_spin_unlock(&base->cpu_base->lock);
649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
650 raw_spin_lock(&base->cpu_base->lock);
651 } else
652 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
653
654 return 1;
655 }
656
657 return 0;
658} 648}
659 649
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 650static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
735static inline void 725static inline void
736hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 726hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
737static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 727static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
738 struct hrtimer_clock_base *base, 728 struct hrtimer_clock_base *base)
739 int wakeup)
740{ 729{
741 return 0; 730 return 0;
742} 731}
@@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
995 * 984 *
996 * XXX send_remote_softirq() ? 985 * XXX send_remote_softirq() ?
997 */ 986 */
998 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 987 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
999 hrtimer_enqueue_reprogram(timer, new_base, wakeup); 988 && hrtimer_enqueue_reprogram(timer, new_base)) {
989 if (wakeup) {
990 /*
991 * We need to drop cpu_base->lock to avoid a
992 * lock ordering issue vs. rq->lock.
993 */
994 raw_spin_unlock(&new_base->cpu_base->lock);
995 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
996 local_irq_restore(flags);
997 return ret;
998 } else {
999 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1000 }
1001 }
1000 1002
1001 unlock_hrtimer_base(timer, &flags); 1003 unlock_hrtimer_base(timer, &flags);
1002 1004
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
90EXPORT_SYMBOL(irq_set_handler_data); 90EXPORT_SYMBOL(irq_set_handler_data);
91 91
92/** 92/**
93 * irq_set_msi_desc - set MSI descriptor data for an irq 93 * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
94 * @irq: Interrupt number 94 * @irq_base: Interrupt number base
95 * @entry: Pointer to MSI descriptor data 95 * @irq_offset: Interrupt number offset
96 * @entry: Pointer to MSI descriptor data
96 * 97 *
97 * Set the MSI descriptor entry for an irq 98 * Set the MSI descriptor entry for an irq at offset
98 */ 99 */
99int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 100int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
101 struct msi_desc *entry)
100{ 102{
101 unsigned long flags; 103 unsigned long flags;
102 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 104 struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
103 105
104 if (!desc) 106 if (!desc)
105 return -EINVAL; 107 return -EINVAL;
106 desc->irq_data.msi_desc = entry; 108 desc->irq_data.msi_desc = entry;
107 if (entry) 109 if (entry && !irq_offset)
108 entry->irq = irq; 110 entry->irq = irq_base;
109 irq_put_desc_unlock(desc, flags); 111 irq_put_desc_unlock(desc, flags);
110 return 0; 112 return 0;
111} 113}
112 114
113/** 115/**
116 * irq_set_msi_desc - set MSI descriptor data for an irq
117 * @irq: Interrupt number
118 * @entry: Pointer to MSI descriptor data
119 *
120 * Set the MSI descriptor entry for an irq
121 */
122int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
123{
124 return irq_set_msi_desc_off(irq, 0, entry);
125}
126
127/**
114 * irq_set_chip_data - set irq chip data for an irq 128 * irq_set_chip_data - set irq chip data for an irq
115 * @irq: Interrupt number 129 * @irq: Interrupt number
116 * @data: Pointer to chip specific data 130 * @data: Pointer to chip specific data
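irq_set_msi_desc_off(), added above, lets a block of consecutive irqs share one MSI descriptor, with only offset 0 writing entry->irq. A hedged sketch of how a multiple-MSI setup path might call it, assuming the prototype sits in <linux/irq.h> next to irq_set_msi_desc(); the function name msi_wire_block and its parameters are illustrative, not from the patch:

	#include <linux/irq.h>
	#include <linux/msi.h>

	/* Illustration only: attach one msi_desc to "nvec" consecutive irqs. */
	static int msi_wire_block(unsigned int irq_base, unsigned int nvec,
				  struct msi_desc *desc)
	{
		unsigned int i;
		int ret;

		for (i = 0; i < nvec; i++) {
			ret = irq_set_msi_desc_off(irq_base, i, desc);
			if (ret)
				return ret;
		}
		/* Only the call with irq_offset == 0 set desc->irq = irq_base. */
		return 0;
	}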
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288fa479..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/sched/rt.h>
19#include <linux/task_work.h> 20#include <linux/task_work.h>
20 21
21#include "internals.h" 22#include "internals.h"
@@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1524out: 1525out:
1525 irq_put_desc_unlock(desc, flags); 1526 irq_put_desc_unlock(desc, flags);
1526} 1527}
1528EXPORT_SYMBOL_GPL(enable_percpu_irq);
1527 1529
1528void disable_percpu_irq(unsigned int irq) 1530void disable_percpu_irq(unsigned int irq)
1529{ 1531{
@@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
1537 irq_percpu_disable(desc, cpu); 1539 irq_percpu_disable(desc, cpu);
1538 irq_put_desc_unlock(desc, flags); 1540 irq_put_desc_unlock(desc, flags);
1539} 1541}
1542EXPORT_SYMBOL_GPL(disable_percpu_irq);
1540 1543
1541/* 1544/*
1542 * Internal function to unregister a percpu irqaction. 1545 * Internal function to unregister a percpu irqaction.
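With enable_percpu_irq()/disable_percpu_irq() now exported above, modular drivers can manage per-CPU interrupts directly. A rough sketch under that assumption; all my_* names and the irq number are placeholders, not part of the patch:

	#include <linux/interrupt.h>
	#include <linux/irq.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(int, my_dev_id);

	static irqreturn_t my_percpu_handler(int irq, void *dev_id)
	{
		/* runs on the CPU that received the interrupt */
		return IRQ_HANDLED;
	}

	static int my_percpu_irq_setup(unsigned int irq)
	{
		int ret;

		ret = request_percpu_irq(irq, my_percpu_handler,
					 "my-percpu-irq", &my_dev_id);
		if (ret)
			return ret;

		/* must run on every CPU that should receive this interrupt */
		enable_percpu_irq(irq, IRQ_TYPE_NONE);
		return 0;
	}

	static void my_percpu_irq_teardown(unsigned int irq)
	{
		disable_percpu_irq(irq);	/* again, on each CPU */
		free_percpu_irq(irq, &my_dev_id);
	}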
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323a..397db02209ed 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file_inode(file))->data;
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
80 80
81 /* 81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the 82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well. 83 * first.
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || 87 (action->flags & __IRQF_TIMER))
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
90 goto out; 88 goto out;
91 89
92 /* Already running on another processor */ 90 /* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
104 do { 102 do {
105 if (handle_irq_event(desc) == IRQ_HANDLED) 103 if (handle_irq_event(desc) == IRQ_HANDLED)
106 ret = IRQ_HANDLED; 104 ret = IRQ_HANDLED;
105 /* Make sure that there is still a valid action */
107 action = desc->action; 106 action = desc->action;
108 } while ((desc->istate & IRQS_PENDING) && action); 107 } while ((desc->istate & IRQS_PENDING) && action);
109 desc->istate &= ~IRQS_POLL_INPROGRESS; 108 desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/irqflags.h> 14#include <linux/irqflags.h>
15#include <linux/sched.h>
16#include <linux/tick.h>
17#include <linux/cpu.h>
18#include <linux/notifier.h>
15#include <asm/processor.h> 19#include <asm/processor.h>
16 20
17/*
18 * An entry can be in one of four states:
19 *
20 * free NULL, 0 -> {claimed} : free to be used
21 * claimed NULL, 3 -> {pending} : claimed to be enqueued
22 * pending next, 3 -> {busy} : queued, pending callback
23 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
24 */
25
26#define IRQ_WORK_PENDING 1UL
27#define IRQ_WORK_BUSY 2UL
28#define IRQ_WORK_FLAGS 3UL
29 21
30static DEFINE_PER_CPU(struct llist_head, irq_work_list); 22static DEFINE_PER_CPU(struct llist_head, irq_work_list);
23static DEFINE_PER_CPU(int, irq_work_raised);
31 24
32/* 25/*
33 * Claim the entry so that no one else will poke at it. 26 * Claim the entry so that no one else will poke at it.
34 */ 27 */
35static bool irq_work_claim(struct irq_work *work) 28static bool irq_work_claim(struct irq_work *work)
36{ 29{
37 unsigned long flags, nflags; 30 unsigned long flags, oflags, nflags;
38 31
32 /*
33 * Start with our best wish as a premise but only trust any
34 * flag value after cmpxchg() result.
35 */
36 flags = work->flags & ~IRQ_WORK_PENDING;
39 for (;;) { 37 for (;;) {
40 flags = work->flags;
41 if (flags & IRQ_WORK_PENDING)
42 return false;
43 nflags = flags | IRQ_WORK_FLAGS; 38 nflags = flags | IRQ_WORK_FLAGS;
44 if (cmpxchg(&work->flags, flags, nflags) == flags) 39 oflags = cmpxchg(&work->flags, flags, nflags);
40 if (oflags == flags)
45 break; 41 break;
42 if (oflags & IRQ_WORK_PENDING)
43 return false;
44 flags = oflags;
46 cpu_relax(); 45 cpu_relax();
47 } 46 }
48 47
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
57} 56}
58 57
59/* 58/*
60 * Queue the entry and raise the IPI if needed. 59 * Enqueue the irq_work @entry unless it's already pending
60 * somewhere.
61 *
62 * Can be re-enqueued while the callback is still in progress.
61 */ 63 */
62static void __irq_work_queue(struct irq_work *work) 64void irq_work_queue(struct irq_work *work)
63{ 65{
64 bool empty; 66 /* Only queue if not already pending */
67 if (!irq_work_claim(work))
68 return;
65 69
70 /* Queue the entry and raise the IPI if needed. */
66 preempt_disable(); 71 preempt_disable();
67 72
68 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
69 /* The list was empty, raise self-interrupt to start processing. */ 74
70 if (empty) 75 /*
71 arch_irq_work_raise(); 76 * If the work is not "lazy" or the tick is stopped, raise the irq
77 * work interrupt (if supported by the arch), otherwise, just wait
78 * for the next tick.
79 */
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise();
83 }
72 84
73 preempt_enable(); 85 preempt_enable();
74} 86}
87EXPORT_SYMBOL_GPL(irq_work_queue);
75 88
76/* 89bool irq_work_needs_cpu(void)
77 * Enqueue the irq_work @entry, returns true on success, failure when the
78 * @entry was already enqueued by someone else.
79 *
80 * Can be re-enqueued while the callback is still in progress.
81 */
82bool irq_work_queue(struct irq_work *work)
83{ 90{
84 if (!irq_work_claim(work)) { 91 struct llist_head *this_list;
85 /* 92
86 * Already enqueued, can't do! 93 this_list = &__get_cpu_var(irq_work_list);
87 */ 94 if (llist_empty(this_list))
88 return false; 95 return false;
89 }
90 96
91 __irq_work_queue(work); 97 /* All work should have been flushed before going offline */
98 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
99
92 return true; 100 return true;
93} 101}
94EXPORT_SYMBOL_GPL(irq_work_queue);
95 102
96/* 103static void __irq_work_run(void)
97 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
98 * context with local IRQs disabled.
99 */
100void irq_work_run(void)
101{ 104{
105 unsigned long flags;
102 struct irq_work *work; 106 struct irq_work *work;
103 struct llist_head *this_list; 107 struct llist_head *this_list;
104 struct llist_node *llnode; 108 struct llist_node *llnode;
105 109
110
111 /*
112 * Reset the "raised" state right before we check the list because
113 * an NMI may enqueue after we find the list empty from the runner.
114 */
115 __this_cpu_write(irq_work_raised, 0);
116 barrier();
117
106 this_list = &__get_cpu_var(irq_work_list); 118 this_list = &__get_cpu_var(irq_work_list);
107 if (llist_empty(this_list)) 119 if (llist_empty(this_list))
108 return; 120 return;
109 121
110 BUG_ON(!in_irq());
111 BUG_ON(!irqs_disabled()); 122 BUG_ON(!irqs_disabled());
112 123
113 llnode = llist_del_all(this_list); 124 llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
119 /* 130 /*
120 * Clear the PENDING bit, after this point the @work 131 * Clear the PENDING bit, after this point the @work
121 * can be re-used. 132 * can be re-used.
133 * Make it immediately visible so that other CPUs trying
134 * to claim that work don't rely on us to handle their data
135 * while we are in the middle of the func.
122 */ 136 */
123 work->flags = IRQ_WORK_BUSY; 137 flags = work->flags & ~IRQ_WORK_PENDING;
138 xchg(&work->flags, flags);
139
124 work->func(work); 140 work->func(work);
125 /* 141 /*
126 * Clear the BUSY bit and return to the free state if 142 * Clear the BUSY bit and return to the free state if
127 * no-one else claimed it meanwhile. 143 * no-one else claimed it meanwhile.
128 */ 144 */
129 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); 145 (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
130 } 146 }
131} 147}
148
149/*
150 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
151 * context with local IRQs disabled.
152 */
153void irq_work_run(void)
154{
155 BUG_ON(!in_irq());
156 __irq_work_run();
157}
132EXPORT_SYMBOL_GPL(irq_work_run); 158EXPORT_SYMBOL_GPL(irq_work_run);
133 159
134/* 160/*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
143 cpu_relax(); 169 cpu_relax();
144} 170}
145EXPORT_SYMBOL_GPL(irq_work_sync); 171EXPORT_SYMBOL_GPL(irq_work_sync);
172
173#ifdef CONFIG_HOTPLUG_CPU
174static int irq_work_cpu_notify(struct notifier_block *self,
175 unsigned long action, void *hcpu)
176{
177 long cpu = (long)hcpu;
178
179 switch (action) {
180 case CPU_DYING:
181 /* Called from stop_machine */
182 if (WARN_ON_ONCE(cpu != smp_processor_id()))
183 break;
184 __irq_work_run();
185 break;
186 default:
187 break;
188 }
189 return NOTIFY_OK;
190}
191
192static struct notifier_block cpu_notify;
193
194static __init int irq_work_init_cpu_notifier(void)
195{
196 cpu_notify.notifier_call = irq_work_cpu_notify;
197 cpu_notify.priority = 0;
198 register_cpu_notifier(&cpu_notify);
199 return 0;
200}
201device_initcall(irq_work_init_cpu_notifier);
202
203#endif /* CONFIG_HOTPLUG_CPU */
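The reworked irq_work above makes irq_work_queue() return void, claims entries with cmpxchg() so NMI callers are safe, and adds the lazy mode that waits for the next tick instead of raising a self-IPI. A short caller sketch; the names are illustrative, while IRQ_WORK_LAZY and the struct layout come from <linux/irq_work.h>:

	#include <linux/irq_work.h>
	#include <linux/printk.h>

	static void my_irq_work_fn(struct irq_work *work)
	{
		/* runs later in hard-irq context, where printk etc. are safe */
		pr_info("deferred work ran\n");
	}

	static struct irq_work my_work = {
		.flags = IRQ_WORK_LAZY,	/* no self-IPI; piggy-back on the next tick when it runs */
		.func  = my_irq_work_fn,
	};

	static void my_nmi_path(void)
	{
		/* safe from NMI; re-queueing while still pending is a no-op */
		irq_work_queue(&my_work);
	}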
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5d..bddd3d7a74b6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
54 .end = 0, 54 .end = 0,
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = {
58 .name = "Crash kernel low",
59 .start = 0,
60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
62};
57 63
58int kexec_should_crash(struct task_struct *p) 64int kexec_should_crash(struct task_struct *p)
59{ 65{
@@ -223,6 +229,8 @@ out:
223 229
224} 230}
225 231
232static void kimage_free_page_list(struct list_head *list);
233
226static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 234static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
227 unsigned long nr_segments, 235 unsigned long nr_segments,
228 struct kexec_segment __user *segments) 236 struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 if (result) 244 if (result)
237 goto out; 245 goto out;
238 246
239 *rimage = image;
240
241 /* 247 /*
242 * Find a location for the control code buffer, and add it to 248
243 * the vector of segments so that its pages will also be 249
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
248 get_order(KEXEC_CONTROL_PAGE_SIZE)); 254 get_order(KEXEC_CONTROL_PAGE_SIZE));
249 if (!image->control_code_page) { 255 if (!image->control_code_page) {
250 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 256 printk(KERN_ERR "Could not allocate control_code_buffer\n");
251 goto out; 257 goto out_free;
252 } 258 }
253 259
254 image->swap_page = kimage_alloc_control_pages(image, 0); 260 image->swap_page = kimage_alloc_control_pages(image, 0);
255 if (!image->swap_page) { 261 if (!image->swap_page) {
256 printk(KERN_ERR "Could not allocate swap buffer\n"); 262 printk(KERN_ERR "Could not allocate swap buffer\n");
257 goto out; 263 goto out_free;
258 } 264 }
259 265
260 result = 0; 266 *rimage = image;
261 out: 267 return 0;
262 if (result == 0)
263 *rimage = image;
264 else
265 kfree(image);
266 268
269out_free:
270 kimage_free_page_list(&image->control_pages);
271 kfree(image);
272out:
267 return result; 273 return result;
268} 274}
269 275
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
310 mend = mstart + image->segment[i].memsz - 1; 316 mend = mstart + image->segment[i].memsz - 1;
311 /* Ensure we are within the crash kernel limits */ 317 /* Ensure we are within the crash kernel limits */
312 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 318 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
313 goto out; 319 goto out_free;
314 } 320 }
315 321
316 /* 322 /*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
323 get_order(KEXEC_CONTROL_PAGE_SIZE)); 329 get_order(KEXEC_CONTROL_PAGE_SIZE));
324 if (!image->control_code_page) { 330 if (!image->control_code_page) {
325 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 331 printk(KERN_ERR "Could not allocate control_code_buffer\n");
326 goto out; 332 goto out_free;
327 } 333 }
328 334
329 result = 0; 335 *rimage = image;
330out: 336 return 0;
331 if (result == 0)
332 *rimage = image;
333 else
334 kfree(image);
335 337
338out_free:
339 kfree(image);
340out:
336 return result; 341 return result;
337} 342}
338 343
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
497 502
498 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 503 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
499 break; 504 break;
500 if (hole_end > crashk_res.end)
501 break;
502 /* See if I overlap any of the segments */ 505 /* See if I overlap any of the segments */
503 for (i = 0; i < image->nr_segments; i++) { 506 for (i = 0; i < image->nr_segments; i++) {
504 unsigned long mstart, mend; 507 unsigned long mstart, mend;
@@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline,
1369 * That function is the entry point for command line parsing and should be 1372 * That function is the entry point for command line parsing and should be
1370 * called from the arch-specific code. 1373 * called from the arch-specific code.
1371 */ 1374 */
1372int __init parse_crashkernel(char *cmdline, 1375static int __init __parse_crashkernel(char *cmdline,
1373 unsigned long long system_ram, 1376 unsigned long long system_ram,
1374 unsigned long long *crash_size, 1377 unsigned long long *crash_size,
1375 unsigned long long *crash_base) 1378 unsigned long long *crash_base,
1379 const char *name)
1376{ 1380{
1377 char *p = cmdline, *ck_cmdline = NULL; 1381 char *p = cmdline, *ck_cmdline = NULL;
1378 char *first_colon, *first_space; 1382 char *first_colon, *first_space;
@@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline,
1382 *crash_base = 0; 1386 *crash_base = 0;
1383 1387
1384 /* find crashkernel and use the last one if there are more */ 1388 /* find crashkernel and use the last one if there are more */
1385 p = strstr(p, "crashkernel="); 1389 p = strstr(p, name);
1386 while (p) { 1390 while (p) {
1387 ck_cmdline = p; 1391 ck_cmdline = p;
1388 p = strstr(p+1, "crashkernel="); 1392 p = strstr(p+1, name);
1389 } 1393 }
1390 1394
1391 if (!ck_cmdline) 1395 if (!ck_cmdline)
1392 return -EINVAL; 1396 return -EINVAL;
1393 1397
1394 ck_cmdline += 12; /* strlen("crashkernel=") */ 1398 ck_cmdline += strlen(name);
1395 1399
1396 /* 1400 /*
1397 * if the commandline contains a ':', then that's the extended 1401 * if the commandline contains a ':', then that's the extended
@@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline,
1409 return 0; 1413 return 0;
1410} 1414}
1411 1415
1416int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram,
1418 unsigned long long *crash_size,
1419 unsigned long long *crash_base)
1420{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel=");
1423}
1424
1425int __init parse_crashkernel_low(char *cmdline,
1426 unsigned long long system_ram,
1427 unsigned long long *crash_size,
1428 unsigned long long *crash_base)
1429{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low=");
1432}
1412 1433
1413static void update_vmcoreinfo_note(void) 1434static void update_vmcoreinfo_note(void)
1414{ 1435{
@@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1490 VMCOREINFO_OFFSET(page, _count); 1511 VMCOREINFO_OFFSET(page, _count);
1491 VMCOREINFO_OFFSET(page, mapping); 1512 VMCOREINFO_OFFSET(page, mapping);
1492 VMCOREINFO_OFFSET(page, lru); 1513 VMCOREINFO_OFFSET(page, lru);
1514 VMCOREINFO_OFFSET(page, _mapcount);
1515 VMCOREINFO_OFFSET(page, private);
1493 VMCOREINFO_OFFSET(pglist_data, node_zones); 1516 VMCOREINFO_OFFSET(pglist_data, node_zones);
1494 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1517 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1495#ifdef CONFIG_FLAT_NODE_MEM_MAP 1518#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void)
1512 VMCOREINFO_NUMBER(PG_lru); 1535 VMCOREINFO_NUMBER(PG_lru);
1513 VMCOREINFO_NUMBER(PG_private); 1536 VMCOREINFO_NUMBER(PG_private);
1514 VMCOREINFO_NUMBER(PG_swapcache); 1537 VMCOREINFO_NUMBER(PG_swapcache);
1538 VMCOREINFO_NUMBER(PG_slab);
1539#ifdef CONFIG_MEMORY_FAILURE
1540 VMCOREINFO_NUMBER(PG_hwpoison);
1541#endif
1542 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1515 1543
1516 arch_crash_save_vmcoreinfo(); 1544 arch_crash_save_vmcoreinfo();
1517 update_vmcoreinfo_note(); 1545 update_vmcoreinfo_note();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b81d24..000000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
1/*
2 * A generic kernel FIFO implementation
3 *
4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 */
21
22#include <linux/kernel.h>
23#include <linux/export.h>
24#include <linux/slab.h>
25#include <linux/err.h>
26#include <linux/log2.h>
27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
29
30/*
31 * internal helper to calculate the unused elements in a fifo
32 */
33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
34{
35 return (fifo->mask + 1) - (fifo->in - fifo->out);
36}
37
38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
39 size_t esize, gfp_t gfp_mask)
40{
41 /*
42 * round down to the next power of 2, since our 'let the indices
43 * wrap' technique works only in this case.
44 */
45 if (!is_power_of_2(size))
46 size = rounddown_pow_of_two(size);
47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
56 }
57
58 fifo->data = kmalloc(size * esize, gfp_mask);
59
60 if (!fifo->data) {
61 fifo->mask = 0;
62 return -ENOMEM;
63 }
64 fifo->mask = size - 1;
65
66 return 0;
67}
68EXPORT_SYMBOL(__kfifo_alloc);
69
70void __kfifo_free(struct __kfifo *fifo)
71{
72 kfree(fifo->data);
73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
78}
79EXPORT_SYMBOL(__kfifo_free);
80
81int __kfifo_init(struct __kfifo *fifo, void *buffer,
82 unsigned int size, size_t esize)
83{
84 size /= esize;
85
86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
97 }
98 fifo->mask = size - 1;
99
100 return 0;
101}
102EXPORT_SYMBOL(__kfifo_init);
103
104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
105 unsigned int len, unsigned int off)
106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
109 unsigned int l;
110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
121 /*
122 * make sure that the data in the fifo is up to date before
123 * incrementing the fifo->in index counter
124 */
125 smp_wmb();
126}
127
128unsigned int __kfifo_in(struct __kfifo *fifo,
129 const void *buf, unsigned int len)
130{
131 unsigned int l;
132
133 l = kfifo_unused(fifo);
134 if (len > l)
135 len = l;
136
137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 fifo->in += len;
139 return len;
140}
141EXPORT_SYMBOL(__kfifo_in);
142
143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
144 unsigned int len, unsigned int off)
145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
148 unsigned int l;
149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
160 /*
161 * make sure that the data is copied before
162 * incrementing the fifo->out index counter
163 */
164 smp_wmb();
165}
166
167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
171
172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
175
176 kfifo_copy_out(fifo, buf, len, fifo->out);
177 return len;
178}
179EXPORT_SYMBOL(__kfifo_out_peek);
180
181unsigned int __kfifo_out(struct __kfifo *fifo,
182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
187}
188EXPORT_SYMBOL(__kfifo_out);
189
190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
191 const void __user *from, unsigned int len, unsigned int off,
192 unsigned int *copied)
193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
196 unsigned int l;
197 unsigned long ret;
198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
215 /*
216 * make sure that the data in the fifo is up to date before
217 * incrementing the fifo->in index counter
218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
224
225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
232
233 if (esize != 1)
234 len /= esize;
235
236 l = kfifo_unused(fifo);
237 if (len > l)
238 len = l;
239
240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
241 if (unlikely(ret)) {
242 len -= ret;
243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
248}
249EXPORT_SYMBOL(__kfifo_from_user);
250
251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
252 unsigned int len, unsigned int off, unsigned int *copied)
253{
254 unsigned int l;
255 unsigned long ret;
256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
275 /*
276 * make sure that the data is copied before
277 * incrementing the fifo->out index counter
278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
284
285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
292
293 if (esize != 1)
294 len /= esize;
295
296 l = fifo->in - fifo->out;
297 if (len > l)
298 len = l;
299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
300 if (unlikely(ret)) {
301 len -= ret;
302 err = -EFAULT;
303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
309
310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
311 int nents, unsigned int len)
312{
313 int n;
314 unsigned int l;
315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
344 }
345 sg_set_page(sgl, page, len, off);
346 return n + 1;
347}
348
349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
350 int nents, unsigned int len, unsigned int off)
351{
352 unsigned int size = fifo->mask + 1;
353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
356
357 off &= fifo->mask;
358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
369}
370
371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
372 struct scatterlist *sgl, int nents, unsigned int len)
373{
374 unsigned int l;
375
376 l = kfifo_unused(fifo);
377 if (len > l)
378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
381}
382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
383
384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
385 struct scatterlist *sgl, int nents, unsigned int len)
386{
387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
394}
395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
396
397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
398{
399 unsigned int max = (1 << (recsize << 3)) - 1;
400
401 if (len > max)
402 return max;
403 return len;
404}
405EXPORT_SYMBOL(__kfifo_max_r);
406
407#define __KFIFO_PEEK(data, out, mask) \
408 ((data)[(out) & (mask)])
409/*
410 * __kfifo_peek_n internal helper function for determining the length of
411 * the next record in the fifo
412 */
413static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
414{
415 unsigned int l;
416 unsigned int mask = fifo->mask;
417 unsigned char *data = fifo->data;
418
419 l = __KFIFO_PEEK(data, fifo->out, mask);
420
421 if (--recsize)
422 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
423
424 return l;
425}
426
427#define __KFIFO_POKE(data, in, mask, val) \
428 ( \
429 (data)[(in) & (mask)] = (unsigned char)(val) \
430 )
431
432/*
433 * __kfifo_poke_n internal helper function for storing the length of
434 * the record into the fifo
435 */
436static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
437{
438 unsigned int mask = fifo->mask;
439 unsigned char *data = fifo->data;
440
441 __KFIFO_POKE(data, fifo->in, mask, n);
442
443 if (recsize > 1)
444 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
445}
446
447unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
448{
449 return __kfifo_peek_n(fifo, recsize);
450}
451EXPORT_SYMBOL(__kfifo_len_r);
452
453unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
454 unsigned int len, size_t recsize)
455{
456 if (len + recsize > kfifo_unused(fifo))
457 return 0;
458
459 __kfifo_poke_n(fifo, len, recsize);
460
461 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
462 fifo->in += len + recsize;
463 return len;
464}
465EXPORT_SYMBOL(__kfifo_in_r);
466
467static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
468 void *buf, unsigned int len, size_t recsize, unsigned int *n)
469{
470 *n = __kfifo_peek_n(fifo, recsize);
471
472 if (len > *n)
473 len = *n;
474
475 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
476 return len;
477}
478
479unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
480 unsigned int len, size_t recsize)
481{
482 unsigned int n;
483
484 if (fifo->in == fifo->out)
485 return 0;
486
487 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
488}
489EXPORT_SYMBOL(__kfifo_out_peek_r);
490
491unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
492 unsigned int len, size_t recsize)
493{
494 unsigned int n;
495
496 if (fifo->in == fifo->out)
497 return 0;
498
499 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
500 fifo->out += n + recsize;
501 return len;
502}
503EXPORT_SYMBOL(__kfifo_out_r);
504
505void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
506{
507 unsigned int n;
508
509 n = __kfifo_peek_n(fifo, recsize);
510 fifo->out += n + recsize;
511}
512EXPORT_SYMBOL(__kfifo_skip_r);
513
514int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
515 unsigned long len, unsigned int *copied, size_t recsize)
516{
517 unsigned long ret;
518
519 len = __kfifo_max_r(len, recsize);
520
521 if (len + recsize > kfifo_unused(fifo)) {
522 *copied = 0;
523 return 0;
524 }
525
526 __kfifo_poke_n(fifo, len, recsize);
527
528 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
529 if (unlikely(ret)) {
530 *copied = 0;
531 return -EFAULT;
532 }
533 fifo->in += len + recsize;
534 return 0;
535}
536EXPORT_SYMBOL(__kfifo_from_user_r);
537
538int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
539 unsigned long len, unsigned int *copied, size_t recsize)
540{
541 unsigned long ret;
542 unsigned int n;
543
544 if (fifo->in == fifo->out) {
545 *copied = 0;
546 return 0;
547 }
548
549 n = __kfifo_peek_n(fifo, recsize);
550 if (len > n)
551 len = n;
552
553 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
554 if (unlikely(ret)) {
555 *copied = 0;
556 return -EFAULT;
557 }
558 fifo->out += n + recsize;
559 return 0;
560}
561EXPORT_SYMBOL(__kfifo_to_user_r);
562
563unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
564 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
565{
566 if (!nents)
567 BUG();
568
569 len = __kfifo_max_r(len, recsize);
570
571 if (len + recsize > kfifo_unused(fifo))
572 return 0;
573
574 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
575}
576EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
577
578void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
579 unsigned int len, size_t recsize)
580{
581 len = __kfifo_max_r(len, recsize);
582 __kfifo_poke_n(fifo, len, recsize);
583 fifo->in += len + recsize;
584}
585EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
586
587unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
588 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
589{
590 if (!nents)
591 BUG();
592
593 len = __kfifo_max_r(len, recsize);
594
595 if (len + recsize > fifo->in - fifo->out)
596 return 0;
597
598 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
599}
600EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
601
602void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
603{
604 unsigned int len;
605
606 len = __kfifo_peek_n(fifo, recsize);
607 fifo->out += len + recsize;
608}
609EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
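The deletion above removes the generic FIFO from kernel/; in the v3.9 tree the same implementation lives in lib/kfifo.c, so this is the kernel/ side of a move rather than a feature removal. The core trick the code depends on is letting the free-running in/out counters wrap naturally and masking them with a power-of-two size, which is why __kfifo_alloc() rounds the requested size down to a power of two. A minimal, single-threaded userspace sketch of that index-wrapping idea (the fixed size and int payload are arbitrary choices for illustration):

#include <stdio.h>

#define FIFO_SIZE 8			/* must be a power of two */
#define FIFO_MASK (FIFO_SIZE - 1)

static unsigned int fifo_in, fifo_out;	/* free-running counters, never reset */
static int fifo_buf[FIFO_SIZE];

static int fifo_put(int val)
{
	if (fifo_in - fifo_out == FIFO_SIZE)	/* used == size: full */
		return 0;
	fifo_buf[fifo_in++ & FIFO_MASK] = val;
	return 1;
}

static int fifo_get(int *val)
{
	if (fifo_in == fifo_out)		/* empty */
		return 0;
	*val = fifo_buf[fifo_out++ & FIFO_MASK];
	return 1;
}

int main(void)
{
	int v;

	for (v = 0; v < 10; v++)
		printf("put %d -> %s\n", v, fifo_put(v) ? "ok" : "full");
	while (fifo_get(&v))
		printf("got %d\n", v);
	return 0;
}

The smp_wmb() calls in the kernel code order the data copy against the index update for a concurrent reader; the sketch skips them because it is single-threaded.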
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87e8de6..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <trace/events/module.h> 44#include <trace/events/module.h>
@@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)
130#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 131#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
131 static int kmod_loop_msg; 132 static int kmod_loop_msg;
132 133
134 /*
135 * We don't allow synchronous module loading from async. Module
136 * init may invoke async_synchronize_full() which will end up
137 * waiting for this task which already is waiting for the module
138 * loading to complete, leading to a deadlock.
139 */
140 WARN_ON_ONCE(wait && current_is_async());
141
133 va_start(args, fmt); 142 va_start(args, fmt);
134 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 143 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
135 va_end(args); 144 va_end(args);
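The new WARN_ON_ONCE(wait && current_is_async()) flags a deadlock-prone pattern described in the comment: a synchronous module request issued from the async machinery can end up waiting on a module whose init in turn waits for async work to finish. A sketch, assuming a kernel-module context, of how a caller might respect that rule (the helper and module name are hypothetical):

#include <linux/kmod.h>
#include <linux/async.h>

/* Hypothetical helper: never block the async machinery on a module load. */
static void demo_load_helper(void)
{
	if (current_is_async())
		request_module_nowait("demo-helper");	/* fire and forget */
	else
		request_module("demo-helper");		/* may sleep until loaded */
}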
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..e35be53f6613 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
334struct kprobe __kprobes *get_kprobe(void *addr) 334struct kprobe __kprobes *get_kprobe(void *addr)
335{ 335{
336 struct hlist_head *head; 336 struct hlist_head *head;
337 struct hlist_node *node;
338 struct kprobe *p; 337 struct kprobe *p;
339 338
340 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 339 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
341 hlist_for_each_entry_rcu(p, node, head, hlist) { 340 hlist_for_each_entry_rcu(p, head, hlist) {
342 if (p->addr == addr) 341 if (p->addr == addr)
343 return p; 342 return p;
344 } 343 }
@@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list);
471 470
472static void kprobe_optimizer(struct work_struct *work); 471static void kprobe_optimizer(struct work_struct *work);
473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 472static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
475#define OPTIMIZE_DELAY 5 473#define OPTIMIZE_DELAY 5
476 474
477/* 475/*
@@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
552/* Start optimizer after OPTIMIZE_DELAY passed */ 550/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void) 551static __kprobes void kick_kprobe_optimizer(void)
554{ 552{
555 if (!delayed_work_pending(&optimizing_work)) 553 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557} 554}
558 555
559/* Kprobe jump optimizer */ 556/* Kprobe jump optimizer */
@@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
592 /* Step 5: Kick optimizer again if needed */ 589 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 590 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer(); 591 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598} 592}
599 593
600/* Wait for completing optimization and unoptimization */ 594/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void) 595static __kprobes void wait_for_kprobe_optimizer(void)
602{ 596{
603 if (delayed_work_pending(&optimizing_work)) 597 mutex_lock(&kprobe_mutex);
604 wait_for_completion(&optimizer_comp); 598
599 while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
600 mutex_unlock(&kprobe_mutex);
601
 602 /* this will also make optimizing_work execute immediately */
603 flush_delayed_work(&optimizing_work);
604 /* @optimizing_work might not have been queued yet, relax */
605 cpu_relax();
606
607 mutex_lock(&kprobe_mutex);
608 }
609
610 mutex_unlock(&kprobe_mutex);
605} 611}
606 612
607/* Optimize kprobe if p is ready to be optimized */ 613/* Optimize kprobe if p is ready to be optimized */
@@ -792,7 +798,6 @@ out:
792static void __kprobes optimize_all_kprobes(void) 798static void __kprobes optimize_all_kprobes(void)
793{ 799{
794 struct hlist_head *head; 800 struct hlist_head *head;
795 struct hlist_node *node;
796 struct kprobe *p; 801 struct kprobe *p;
797 unsigned int i; 802 unsigned int i;
798 803
@@ -803,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void)
803 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
804 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
805 head = &kprobe_table[i]; 810 head = &kprobe_table[i];
806 hlist_for_each_entry_rcu(p, node, head, hlist) 811 hlist_for_each_entry_rcu(p, head, hlist)
807 if (!kprobe_disabled(p)) 812 if (!kprobe_disabled(p))
808 optimize_kprobe(p); 813 optimize_kprobe(p);
809 } 814 }
@@ -814,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void)
814static void __kprobes unoptimize_all_kprobes(void) 819static void __kprobes unoptimize_all_kprobes(void)
815{ 820{
816 struct hlist_head *head; 821 struct hlist_head *head;
817 struct hlist_node *node;
818 struct kprobe *p; 822 struct kprobe *p;
819 unsigned int i; 823 unsigned int i;
820 824
@@ -825,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void)
825 kprobes_allow_optimization = false; 829 kprobes_allow_optimization = false;
826 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
827 head = &kprobe_table[i]; 831 head = &kprobe_table[i];
828 hlist_for_each_entry_rcu(p, node, head, hlist) { 832 hlist_for_each_entry_rcu(p, head, hlist) {
829 if (!kprobe_disabled(p)) 833 if (!kprobe_disabled(p))
830 unoptimize_kprobe(p, false); 834 unoptimize_kprobe(p, false);
831 } 835 }
@@ -919,7 +923,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
919} 923}
920#endif /* CONFIG_OPTPROBES */ 924#endif /* CONFIG_OPTPROBES */
921 925
922#ifdef KPROBES_CAN_USE_FTRACE 926#ifdef CONFIG_KPROBES_ON_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 927static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler, 928 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS, 929 .flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +968,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
964 (unsigned long)p->addr, 1, 0); 968 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); 969 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966} 970}
967#else /* !KPROBES_CAN_USE_FTRACE */ 971#else /* !CONFIG_KPROBES_ON_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p) 972#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0) 973#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0) 974#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1141,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1141{ 1145{
1142 struct kretprobe_instance *ri; 1146 struct kretprobe_instance *ri;
1143 struct hlist_head *head, empty_rp; 1147 struct hlist_head *head, empty_rp;
1144 struct hlist_node *node, *tmp; 1148 struct hlist_node *tmp;
1145 unsigned long hash, flags = 0; 1149 unsigned long hash, flags = 0;
1146 1150
1147 if (unlikely(!kprobes_initialized)) 1151 if (unlikely(!kprobes_initialized))
@@ -1152,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1152 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1156 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1153 head = &kretprobe_inst_table[hash]; 1157 head = &kretprobe_inst_table[hash];
1154 kretprobe_table_lock(hash, &flags); 1158 kretprobe_table_lock(hash, &flags);
1155 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 1159 hlist_for_each_entry_safe(ri, tmp, head, hlist) {
1156 if (ri->task == tk) 1160 if (ri->task == tk)
1157 recycle_rp_inst(ri, &empty_rp); 1161 recycle_rp_inst(ri, &empty_rp);
1158 } 1162 }
1159 kretprobe_table_unlock(hash, &flags); 1163 kretprobe_table_unlock(hash, &flags);
1160 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1164 hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
1161 hlist_del(&ri->hlist); 1165 hlist_del(&ri->hlist);
1162 kfree(ri); 1166 kfree(ri);
1163 } 1167 }
@@ -1166,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1166static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1167{ 1171{
1168 struct kretprobe_instance *ri; 1172 struct kretprobe_instance *ri;
1169 struct hlist_node *pos, *next; 1173 struct hlist_node *next;
1170 1174
1171 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { 1175 hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
1172 hlist_del(&ri->hlist); 1176 hlist_del(&ri->hlist);
1173 kfree(ri); 1177 kfree(ri);
1174 } 1178 }
@@ -1178,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1178{ 1182{
1179 unsigned long flags, hash; 1183 unsigned long flags, hash;
1180 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
1181 struct hlist_node *pos, *next; 1185 struct hlist_node *next;
1182 struct hlist_head *head; 1186 struct hlist_head *head;
1183 1187
1184 /* No race here */ 1188 /* No race here */
1185 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { 1189 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
1186 kretprobe_table_lock(hash, &flags); 1190 kretprobe_table_lock(hash, &flags);
1187 head = &kretprobe_inst_table[hash]; 1191 head = &kretprobe_inst_table[hash];
1188 hlist_for_each_entry_safe(ri, pos, next, head, hlist) { 1192 hlist_for_each_entry_safe(ri, next, head, hlist) {
1189 if (ri->rp == rp) 1193 if (ri->rp == rp)
1190 ri->rp = NULL; 1194 ri->rp = NULL;
1191 } 1195 }
@@ -1414,12 +1418,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1414 */ 1418 */
1415 ftrace_addr = ftrace_location((unsigned long)p->addr); 1419 ftrace_addr = ftrace_location((unsigned long)p->addr);
1416 if (ftrace_addr) { 1420 if (ftrace_addr) {
1417#ifdef KPROBES_CAN_USE_FTRACE 1421#ifdef CONFIG_KPROBES_ON_FTRACE
1418 /* Given address is not on the instruction boundary */ 1422 /* Given address is not on the instruction boundary */
1419 if ((unsigned long)p->addr != ftrace_addr) 1423 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ; 1424 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE; 1425 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */ 1426#else /* !CONFIG_KPROBES_ON_FTRACE */
1423 return -EINVAL; 1427 return -EINVAL;
1424#endif 1428#endif
1425 } 1429 }
@@ -2021,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2021{ 2025{
2022 struct module *mod = data; 2026 struct module *mod = data;
2023 struct hlist_head *head; 2027 struct hlist_head *head;
2024 struct hlist_node *node;
2025 struct kprobe *p; 2028 struct kprobe *p;
2026 unsigned int i; 2029 unsigned int i;
2027 int checkcore = (val == MODULE_STATE_GOING); 2030 int checkcore = (val == MODULE_STATE_GOING);
@@ -2038,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2038 mutex_lock(&kprobe_mutex); 2041 mutex_lock(&kprobe_mutex);
2039 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2042 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2040 head = &kprobe_table[i]; 2043 head = &kprobe_table[i];
2041 hlist_for_each_entry_rcu(p, node, head, hlist) 2044 hlist_for_each_entry_rcu(p, head, hlist)
2042 if (within_module_init((unsigned long)p->addr, mod) || 2045 if (within_module_init((unsigned long)p->addr, mod) ||
2043 (checkcore && 2046 (checkcore &&
2044 within_module_core((unsigned long)p->addr, mod))) { 2047 within_module_core((unsigned long)p->addr, mod))) {
@@ -2185,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
2185static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2186{ 2189{
2187 struct hlist_head *head; 2190 struct hlist_head *head;
2188 struct hlist_node *node;
2189 struct kprobe *p, *kp; 2191 struct kprobe *p, *kp;
2190 const char *sym = NULL; 2192 const char *sym = NULL;
2191 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
@@ -2194,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2194 2196
2195 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2196 preempt_disable(); 2198 preempt_disable();
2197 hlist_for_each_entry_rcu(p, node, head, hlist) { 2199 hlist_for_each_entry_rcu(p, head, hlist) {
2198 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 2200 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
2199 &offset, &modname, namebuf); 2201 &offset, &modname, namebuf);
2200 if (kprobe_aggrprobe(p)) { 2202 if (kprobe_aggrprobe(p)) {
@@ -2229,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = {
2229static void __kprobes arm_all_kprobes(void) 2231static void __kprobes arm_all_kprobes(void)
2230{ 2232{
2231 struct hlist_head *head; 2233 struct hlist_head *head;
2232 struct hlist_node *node;
2233 struct kprobe *p; 2234 struct kprobe *p;
2234 unsigned int i; 2235 unsigned int i;
2235 2236
@@ -2242,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void)
2242 /* Arming kprobes doesn't optimize kprobe itself */ 2243 /* Arming kprobes doesn't optimize kprobe itself */
2243 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2244 head = &kprobe_table[i]; 2245 head = &kprobe_table[i];
2245 hlist_for_each_entry_rcu(p, node, head, hlist) 2246 hlist_for_each_entry_rcu(p, head, hlist)
2246 if (!kprobe_disabled(p)) 2247 if (!kprobe_disabled(p))
2247 arm_kprobe(p); 2248 arm_kprobe(p);
2248 } 2249 }
@@ -2258,7 +2259,6 @@ already_enabled:
2258static void __kprobes disarm_all_kprobes(void) 2259static void __kprobes disarm_all_kprobes(void)
2259{ 2260{
2260 struct hlist_head *head; 2261 struct hlist_head *head;
2261 struct hlist_node *node;
2262 struct kprobe *p; 2262 struct kprobe *p;
2263 unsigned int i; 2263 unsigned int i;
2264 2264
@@ -2275,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void)
2275 2275
2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2277 head = &kprobe_table[i]; 2277 head = &kprobe_table[i];
2278 hlist_for_each_entry_rcu(p, node, head, hlist) { 2278 hlist_for_each_entry_rcu(p, head, hlist) {
2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2280 disarm_kprobe(p, false); 2280 disarm_kprobe(p, false);
2281 } 2281 }
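Most of the kprobes churn above is mechanical fallout from the 3.9 hlist iterator change: the iterators no longer take a separate struct hlist_node cursor and hand back the containing object directly. The KPROBES_CAN_USE_FTRACE to CONFIG_KPROBES_ON_FTRACE rename is independent of that. A sketch, assuming a kernel context, of the new iteration shape (the lookup helper is hypothetical, modelled on get_kprobe() from this file):

#include <linux/kprobes.h>
#include <linux/rculist.h>

static struct kprobe *demo_lookup(struct hlist_head *head, void *addr)
{
	struct kprobe *p;

	/* pre-3.9 this needed an extra "struct hlist_node *node" cursor:
	 * hlist_for_each_entry_rcu(p, node, head, hlist) */
	hlist_for_each_entry_rcu(p, head, hlist) {
		if (p->addr == addr)
			return p;
	}
	return NULL;
}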
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350d..259db207b5d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3190#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3192 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH);
3194 printk("turning off the locking correctness validator.\n"); 3195 printk("turning off the locking correctness validator.\n");
3196
3197 lockdep_print_held_locks(current);
3198 debug_show_all_locks();
3195 dump_stack(); 3199 dump_stack();
3200
3196 return 0; 3201 return 0;
3197 } 3202 }
3198 3203
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3203} 3208}
3204 3209
3205static int 3210static int
3206print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, 3211print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3207 unsigned long ip) 3212 unsigned long ip)
3208{ 3213{
3209 if (!debug_locks_off()) 3214 if (!debug_locks_off())
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3246 return 0; 3251 return 0;
3247 3252
3248 if (curr->lockdep_depth <= 0) 3253 if (curr->lockdep_depth <= 0)
3249 return print_unlock_inbalance_bug(curr, lock, ip); 3254 return print_unlock_imbalance_bug(curr, lock, ip);
3250 3255
3251 return 1; 3256 return 1;
3252} 3257}
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3317 goto found_it; 3322 goto found_it;
3318 prev_hlock = hlock; 3323 prev_hlock = hlock;
3319 } 3324 }
3320 return print_unlock_inbalance_bug(curr, lock, ip); 3325 return print_unlock_imbalance_bug(curr, lock, ip);
3321 3326
3322found_it: 3327found_it:
3323 lockdep_init_map(lock, name, key, 0); 3328 lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,
3384 goto found_it; 3389 goto found_it;
3385 prev_hlock = hlock; 3390 prev_hlock = hlock;
3386 } 3391 }
3387 return print_unlock_inbalance_bug(curr, lock, ip); 3392 return print_unlock_imbalance_bug(curr, lock, ip);
3388 3393
3389found_it: 3394found_it:
3390 if (hlock->instance == lock) 3395 if (hlock->instance == lock)
@@ -4083,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4083} 4088}
4084EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4089EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4085 4090
4086static void print_held_locks_bug(struct task_struct *curr) 4091static void print_held_locks_bug(void)
4087{ 4092{
4088 if (!debug_locks_off()) 4093 if (!debug_locks_off())
4089 return; 4094 return;
@@ -4092,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr)
4092 4097
4093 printk("\n"); 4098 printk("\n");
4094 printk("=====================================\n"); 4099 printk("=====================================\n");
4095 printk("[ BUG: lock held at task exit time! ]\n"); 4100 printk("[ BUG: %s/%d still has locks held! ]\n",
4101 current->comm, task_pid_nr(current));
4096 print_kernel_ident(); 4102 print_kernel_ident();
4097 printk("-------------------------------------\n"); 4103 printk("-------------------------------------\n");
4098 printk("%s/%d is exiting with locks still held!\n", 4104 lockdep_print_held_locks(current);
4099 curr->comm, task_pid_nr(curr));
4100 lockdep_print_held_locks(curr);
4101
4102 printk("\nstack backtrace:\n"); 4105 printk("\nstack backtrace:\n");
4103 dump_stack(); 4106 dump_stack();
4104} 4107}
4105 4108
4106void debug_check_no_locks_held(struct task_struct *task) 4109void debug_check_no_locks_held(void)
4107{ 4110{
4108 if (unlikely(task->lockdep_depth > 0)) 4111 if (unlikely(current->lockdep_depth > 0))
4109 print_held_locks_bug(task); 4112 print_held_locks_bug();
4110} 4113}
4114EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4111 4115
4112void debug_show_all_locks(void) 4116void debug_show_all_locks(void)
4113{ 4117{
diff --git a/kernel/module.c b/kernel/module.c
index eab08274ec9b..0925c9a71975 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
197 return -ENOENT; 197 return -ENOENT;
198} 198}
199 199
200static inline void add_taint_module(struct module *mod, unsigned flag) 200static inline void add_taint_module(struct module *mod, unsigned flag,
201 enum lockdep_ok lockdep_ok)
201{ 202{
202 add_taint(flag); 203 add_taint(flag, lockdep_ok);
203 mod->taints |= (1U << flag); 204 mod->taints |= (1U << flag);
204} 205}
205 206
@@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
727{ 728{
728 int ret = (flags & O_TRUNC); 729 int ret = (flags & O_TRUNC);
729 if (ret) 730 if (ret)
730 add_taint(TAINT_FORCED_RMMOD); 731 add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
731 return ret; 732 return ret;
732} 733}
733#else 734#else
@@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1138 if (!test_taint(TAINT_FORCED_MODULE)) 1139 if (!test_taint(TAINT_FORCED_MODULE))
1139 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1140 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1140 mod->name, reason); 1141 mod->name, reason);
1141 add_taint_module(mod, TAINT_FORCED_MODULE); 1142 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1142 return 0; 1143 return 0;
1143#else 1144#else
1144 return -ENOEXEC; 1145 return -ENOEXEC;
@@ -2147,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)
2147 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2148 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2148 printk(KERN_WARNING "%s: module license '%s' taints " 2149 printk(KERN_WARNING "%s: module license '%s' taints "
2149 "kernel.\n", mod->name, license); 2150 "kernel.\n", mod->name, license);
2150 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2151 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2152 LOCKDEP_NOW_UNRELIABLE);
2151 } 2153 }
2152} 2154}
2153 2155
@@ -2539,7 +2541,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2539 if (err) 2541 if (err)
2540 goto out; 2542 goto out;
2541 2543
2542 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); 2544 err = vfs_getattr(&file->f_path, &stat);
2543 if (err) 2545 if (err)
2544 goto out; 2546 goto out;
2545 2547
@@ -2700,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2700 } 2702 }
2701 2703
2702 if (!get_modinfo(info, "intree")) 2704 if (!get_modinfo(info, "intree"))
2703 add_taint_module(mod, TAINT_OOT_MODULE); 2705 add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
2704 2706
2705 if (get_modinfo(info, "staging")) { 2707 if (get_modinfo(info, "staging")) {
2706 add_taint_module(mod, TAINT_CRAP); 2708 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2707 printk(KERN_WARNING "%s: module is from the staging directory," 2709 printk(KERN_WARNING "%s: module is from the staging directory,"
2708 " the quality is unknown, you have been warned.\n", 2710 " the quality is unknown, you have been warned.\n",
2709 mod->name); 2711 mod->name);
@@ -2869,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)
2869 * using GPL-only symbols it needs. 2871 * using GPL-only symbols it needs.
2870 */ 2872 */
2871 if (strcmp(mod->name, "ndiswrapper") == 0) 2873 if (strcmp(mod->name, "ndiswrapper") == 0)
2872 add_taint(TAINT_PROPRIETARY_MODULE); 2874 add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
2873 2875
2874 /* driverloader was caught wrongly pretending to be under GPL */ 2876 /* driverloader was caught wrongly pretending to be under GPL */
2875 if (strcmp(mod->name, "driverloader") == 0) 2877 if (strcmp(mod->name, "driverloader") == 0)
2876 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2878 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2879 LOCKDEP_NOW_UNRELIABLE);
2877 2880
2878 /* lve claims to be GPL but upstream won't provide source */ 2881 /* lve claims to be GPL but upstream won't provide source */
2879 if (strcmp(mod->name, "lve") == 0) 2882 if (strcmp(mod->name, "lve") == 0)
2880 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2883 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2884 LOCKDEP_NOW_UNRELIABLE);
2881 2885
2882#ifdef CONFIG_MODVERSIONS 2886#ifdef CONFIG_MODVERSIONS
2883 if ((mod->num_syms && !mod->crcs) 2887 if ((mod->num_syms && !mod->crcs)
@@ -3141,12 +3145,72 @@ static int may_init_module(void)
3141 return 0; 3145 return 0;
3142} 3146}
3143 3147
3148/*
3149 * We try to place it in the list now to make sure it's unique before
3150 * we dedicate too many resources. In particular, temporary percpu
3151 * memory exhaustion.
3152 */
3153static int add_unformed_module(struct module *mod)
3154{
3155 int err;
3156 struct module *old;
3157
3158 mod->state = MODULE_STATE_UNFORMED;
3159
3160again:
3161 mutex_lock(&module_mutex);
3162 if ((old = find_module_all(mod->name, true)) != NULL) {
3163 if (old->state == MODULE_STATE_COMING
3164 || old->state == MODULE_STATE_UNFORMED) {
3165 /* Wait in case it fails to load. */
3166 mutex_unlock(&module_mutex);
3167 err = wait_event_interruptible(module_wq,
3168 finished_loading(mod->name));
3169 if (err)
3170 goto out_unlocked;
3171 goto again;
3172 }
3173 err = -EEXIST;
3174 goto out;
3175 }
3176 list_add_rcu(&mod->list, &modules);
3177 err = 0;
3178
3179out:
3180 mutex_unlock(&module_mutex);
3181out_unlocked:
3182 return err;
3183}
3184
3185static int complete_formation(struct module *mod, struct load_info *info)
3186{
3187 int err;
3188
3189 mutex_lock(&module_mutex);
3190
3191 /* Find duplicate symbols (must be called under lock). */
3192 err = verify_export_symbols(mod);
3193 if (err < 0)
3194 goto out;
3195
3196 /* This relies on module_mutex for list integrity. */
3197 module_bug_finalize(info->hdr, info->sechdrs, mod);
3198
3199 /* Mark state as coming so strong_try_module_get() ignores us,
3200 * but kallsyms etc. can see us. */
3201 mod->state = MODULE_STATE_COMING;
3202
3203out:
3204 mutex_unlock(&module_mutex);
3205 return err;
3206}
3207
3144/* Allocate and load the module: note that size of section 0 is always 3208/* Allocate and load the module: note that size of section 0 is always
3145 zero, and we rely on this for optional sections. */ 3209 zero, and we rely on this for optional sections. */
3146static int load_module(struct load_info *info, const char __user *uargs, 3210static int load_module(struct load_info *info, const char __user *uargs,
3147 int flags) 3211 int flags)
3148{ 3212{
3149 struct module *mod, *old; 3213 struct module *mod;
3150 long err; 3214 long err;
3151 3215
3152 err = module_sig_check(info); 3216 err = module_sig_check(info);
@@ -3164,36 +3228,20 @@ static int load_module(struct load_info *info, const char __user *uargs,
3164 goto free_copy; 3228 goto free_copy;
3165 } 3229 }
3166 3230
3167 /* 3231 /* Reserve our place in the list. */
3168 * We try to place it in the list now to make sure it's unique 3232 err = add_unformed_module(mod);
3169 * before we dedicate too many resources. In particular, 3233 if (err)
3170 * temporary percpu memory exhaustion.
3171 */
3172 mod->state = MODULE_STATE_UNFORMED;
3173again:
3174 mutex_lock(&module_mutex);
3175 if ((old = find_module_all(mod->name, true)) != NULL) {
3176 if (old->state == MODULE_STATE_COMING
3177 || old->state == MODULE_STATE_UNFORMED) {
3178 /* Wait in case it fails to load. */
3179 mutex_unlock(&module_mutex);
3180 err = wait_event_interruptible(module_wq,
3181 finished_loading(mod->name));
3182 if (err)
3183 goto free_module;
3184 goto again;
3185 }
3186 err = -EEXIST;
3187 mutex_unlock(&module_mutex);
3188 goto free_module; 3234 goto free_module;
3189 }
3190 list_add_rcu(&mod->list, &modules);
3191 mutex_unlock(&module_mutex);
3192 3235
3193#ifdef CONFIG_MODULE_SIG 3236#ifdef CONFIG_MODULE_SIG
3194 mod->sig_ok = info->sig_ok; 3237 mod->sig_ok = info->sig_ok;
3195 if (!mod->sig_ok) 3238 if (!mod->sig_ok) {
3196 add_taint_module(mod, TAINT_FORCED_MODULE); 3239 printk_once(KERN_NOTICE
3240 "%s: module verification failed: signature and/or"
3241 " required key missing - tainting kernel\n",
3242 mod->name);
3243 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3244 }
3197#endif 3245#endif
3198 3246
3199 /* Now module is in final location, initialize linked lists, etc. */ 3247 /* Now module is in final location, initialize linked lists, etc. */
@@ -3236,21 +3284,11 @@ again:
3236 3284
3237 dynamic_debug_setup(info->debug, info->num_debug); 3285 dynamic_debug_setup(info->debug, info->num_debug);
3238 3286
3239 mutex_lock(&module_mutex); 3287 /* Finally it's fully formed, ready to start executing. */
3240 /* Find duplicate symbols (must be called under lock). */ 3288 err = complete_formation(mod, info);
3241 err = verify_export_symbols(mod); 3289 if (err)
3242 if (err < 0)
3243 goto ddebug_cleanup; 3290 goto ddebug_cleanup;
3244 3291
3245 /* This relies on module_mutex for list integrity. */
3246 module_bug_finalize(info->hdr, info->sechdrs, mod);
3247
3248 /* Mark state as coming so strong_try_module_get() ignores us,
3249 * but kallsyms etc. can see us. */
3250 mod->state = MODULE_STATE_COMING;
3251
3252 mutex_unlock(&module_mutex);
3253
3254 /* Module is ready to execute: parsing args may do that. */ 3292 /* Module is ready to execute: parsing args may do that. */
3255 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3293 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3256 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3294 -32768, 32767, &ddebug_dyndbg_module_param_cb);
@@ -3274,8 +3312,8 @@ again:
3274 /* module_bug_cleanup needs module_mutex protection */ 3312 /* module_bug_cleanup needs module_mutex protection */
3275 mutex_lock(&module_mutex); 3313 mutex_lock(&module_mutex);
3276 module_bug_cleanup(mod); 3314 module_bug_cleanup(mod);
3277 ddebug_cleanup:
3278 mutex_unlock(&module_mutex); 3315 mutex_unlock(&module_mutex);
3316 ddebug_cleanup:
3279 dynamic_debug_remove(info->debug); 3317 dynamic_debug_remove(info->debug);
3280 synchronize_sched(); 3318 synchronize_sched();
3281 kfree(mod->args); 3319 kfree(mod->args);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..52f23011b6e0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/sched/rt.h>
22#include <linux/export.h> 23#include <linux/export.h>
23#include <linux/spinlock.h> 24#include <linux/spinlock.h>
24#include <linux/interrupt.h> 25#include <linux/interrupt.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb20165..afc0456f227a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
153 goto out; 153 goto out;
154 } 154 }
155 155
156 new_ns = create_new_namespaces(flags, tsk, 156 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
157 task_cred_xxx(tsk, user_ns), tsk->fs);
158 if (IS_ERR(new_ns)) { 157 if (IS_ERR(new_ns)) {
159 err = PTR_ERR(new_ns); 158 err = PTR_ERR(new_ns);
160 goto out; 159 goto out;
@@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
251 return PTR_ERR(file); 250 return PTR_ERR(file);
252 251
253 err = -EINVAL; 252 err = -EINVAL;
254 ei = PROC_I(file->f_dentry->d_inode); 253 ei = PROC_I(file_inode(file));
255 ops = ei->ns_ops; 254 ops = ei->ns_ops;
256 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
257 goto out; 256 goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff97..7c57cc9eee2c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -259,26 +259,19 @@ unsigned long get_taint(void)
259 return tainted_mask; 259 return tainted_mask;
260} 260}
261 261
262void add_taint(unsigned flag) 262/**
263 * add_taint: add a taint flag if not already set.
264 * @flag: one of the TAINT_* constants.
265 * @lockdep_ok: whether lock debugging is still OK.
266 *
 267 * If something bad has gone wrong, you'll want @lockdep_ok = LOCKDEP_NOW_UNRELIABLE,
 268 * but for some noteworthy-but-not-corrupting cases it can be LOCKDEP_STILL_OK.
269 */
270void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
263{ 271{
264 /* 272 if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
265 * Can't trust the integrity of the kernel anymore. 273 printk(KERN_WARNING
266 * We don't call directly debug_locks_off() because the issue 274 "Disabling lock debugging due to kernel taint\n");
267 * is not necessarily serious enough to set oops_in_progress to 1
268 * Also we want to keep up lockdep for staging/out-of-tree
269 * development and post-warning case.
270 */
271 switch (flag) {
272 case TAINT_CRAP:
273 case TAINT_OOT_MODULE:
274 case TAINT_WARN:
275 case TAINT_FIRMWARE_WORKAROUND:
276 break;
277
278 default:
279 if (__debug_locks_off())
280 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
281 }
282 275
283 set_bit(flag, &tainted_mask); 276 set_bit(flag, &tainted_mask);
284} 277}
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
421 print_modules(); 414 print_modules();
422 dump_stack(); 415 dump_stack();
423 print_oops_end_marker(); 416 print_oops_end_marker();
424 add_taint(taint); 417 /* Just a warning, don't kill lockdep. */
418 add_taint(taint, LOCKDEP_STILL_OK);
425} 419}
426 420
427void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 421void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
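add_taint() now takes an explicit enum lockdep_ok argument, so each caller states whether lockdep output can still be trusted after the taint instead of the old switch over the flag value. A sketch, assuming a kernel context, with the two values used throughout this merge (the wrapper function is hypothetical):

#include <linux/kernel.h>

static void demo_taint(bool serious)
{
	if (serious)
		/* integrity is suspect: also turn lock debugging off */
		add_taint(TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
	else
		/* noteworthy but benign: keep lockdep running */
		add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}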
diff --git a/kernel/pid.c b/kernel/pid.c
index de9af600006f..047dc6264638 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -331,7 +331,7 @@ out:
331 return pid; 331 return pid;
332 332
333out_unlock: 333out_unlock:
334 spin_unlock(&pidmap_lock); 334 spin_unlock_irq(&pidmap_lock);
335out_free: 335out_free:
336 while (++i <= ns->level) 336 while (++i <= ns->level)
337 free_pidmap(pid->numbers + i); 337 free_pidmap(pid->numbers + i);
@@ -350,10 +350,9 @@ void disable_pid_allocation(struct pid_namespace *ns)
350 350
351struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 351struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
352{ 352{
353 struct hlist_node *elem;
354 struct upid *pnr; 353 struct upid *pnr;
355 354
356 hlist_for_each_entry_rcu(pnr, elem, 355 hlist_for_each_entry_rcu(pnr,
357 &pid_hash[pid_hashfn(nr, ns)], pid_chain) 356 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
358 if (pnr->nr == nr && pnr->ns == ns) 357 if (pnr->nr == nr && pnr->ns == ns)
359 return container_of(pnr, struct pid, 358 return container_of(pnr, struct pid,
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad1d5d6..8fd709c9bb58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,
155 155
156static inline cputime_t prof_ticks(struct task_struct *p) 156static inline cputime_t prof_ticks(struct task_struct *p)
157{ 157{
158 return p->utime + p->stime; 158 cputime_t utime, stime;
159
160 task_cputime(p, &utime, &stime);
161
162 return utime + stime;
159} 163}
160static inline cputime_t virt_ticks(struct task_struct *p) 164static inline cputime_t virt_ticks(struct task_struct *p)
161{ 165{
162 return p->utime; 166 cputime_t utime;
167
168 task_cputime(p, &utime, NULL);
169
170 return utime;
163} 171}
164 172
165static int 173static int
@@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head,
471 */ 479 */
472void posix_cpu_timers_exit(struct task_struct *tsk) 480void posix_cpu_timers_exit(struct task_struct *tsk)
473{ 481{
482 cputime_t utime, stime;
483
474 add_device_randomness((const void*) &tsk->se.sum_exec_runtime, 484 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
475 sizeof(unsigned long long)); 485 sizeof(unsigned long long));
486 task_cputime(tsk, &utime, &stime);
476 cleanup_timers(tsk->cpu_timers, 487 cleanup_timers(tsk->cpu_timers,
477 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 488 utime, stime, tsk->se.sum_exec_runtime);
478 489
479} 490}
480void posix_cpu_timers_exit_group(struct task_struct *tsk) 491void posix_cpu_timers_exit_group(struct task_struct *tsk)
481{ 492{
482 struct signal_struct *const sig = tsk->signal; 493 struct signal_struct *const sig = tsk->signal;
494 cputime_t utime, stime;
483 495
496 task_cputime(tsk, &utime, &stime);
484 cleanup_timers(tsk->signal->cpu_timers, 497 cleanup_timers(tsk->signal->cpu_timers,
485 tsk->utime + sig->utime, tsk->stime + sig->stime, 498 utime + sig->utime, stime + sig->stime,
486 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 499 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
487} 500}
488 501
@@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1226static inline int fastpath_timer_check(struct task_struct *tsk) 1239static inline int fastpath_timer_check(struct task_struct *tsk)
1227{ 1240{
1228 struct signal_struct *sig; 1241 struct signal_struct *sig;
1242 cputime_t utime, stime;
1243
1244 task_cputime(tsk, &utime, &stime);
1229 1245
1230 if (!task_cputime_zero(&tsk->cputime_expires)) { 1246 if (!task_cputime_zero(&tsk->cputime_expires)) {
1231 struct task_cputime task_sample = { 1247 struct task_cputime task_sample = {
1232 .utime = tsk->utime, 1248 .utime = utime,
1233 .stime = tsk->stime, 1249 .stime = stime,
1234 .sum_exec_runtime = tsk->se.sum_exec_runtime 1250 .sum_exec_runtime = tsk->se.sum_exec_runtime
1235 }; 1251 };
1236 1252
@@ -1401,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1401 while (!signal_pending(current)) { 1417 while (!signal_pending(current)) {
1402 if (timer.it.cpu.expires.sched == 0) { 1418 if (timer.it.cpu.expires.sched == 0) {
1403 /* 1419 /*
1404 * Our timer fired and was reset. 1420 * Our timer fired and was reset; the
 1421 * deletion below cannot fail.
1405 */ 1422 */
1423 posix_cpu_timer_del(&timer);
1406 spin_unlock_irq(&timer.it_lock); 1424 spin_unlock_irq(&timer.it_lock);
1407 return 0; 1425 return 0;
1408 } 1426 }
@@ -1420,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1420 * We were interrupted by a signal. 1438 * We were interrupted by a signal.
1421 */ 1439 */
1422 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1440 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1423 posix_cpu_timer_set(&timer, 0, &zero_it, it); 1441 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1442 if (!error) {
1443 /*
 1444 * Timer is now unarmed; deletion cannot fail.
1445 */
1446 posix_cpu_timer_del(&timer);
1447 }
1424 spin_unlock_irq(&timer.it_lock); 1448 spin_unlock_irq(&timer.it_lock);
1425 1449
1450 while (error == TIMER_RETRY) {
1451 /*
 1452 * We need to handle the case when the timer was or is in the
 1453 * middle of firing. In other cases we have already freed
1454 * resources.
1455 */
1456 spin_lock_irq(&timer.it_lock);
1457 error = posix_cpu_timer_del(&timer);
1458 spin_unlock_irq(&timer.it_lock);
1459 }
1460
1426 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { 1461 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1427 /* 1462 /*
1428 * It actually did fire already. 1463 * It actually did fire already.
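
The posix-cpu-timers hunks above replace direct reads of p->utime and p->stime with the task_cputime() accessor, so the code keeps working when full dynticks cputime accounting maintains those fields lazily. A minimal sketch of the pattern, assuming only the task_cputime() helper from <linux/sched.h>; sample_task_ticks() is an illustrative name and this is not a standalone translation unit:

    #include <linux/sched.h>

    /* Read a task's CPU time through the accessor rather than touching
     * p->utime/p->stime directly; NULL may be passed for a field the
     * caller does not need, as virt_ticks() does above. */
    static cputime_t sample_task_ticks(struct task_struct *p)
    {
            cputime_t utime, stime;

            task_cputime(p, &utime, &stime);
            return utime + stime;
    }
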
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..6edbb2c55c22 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 552 return -EAGAIN;
553 553
554 spin_lock_init(&new_timer->it_lock); 554 spin_lock_init(&new_timer->it_lock);
555 retry: 555
556 if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { 556 idr_preload(GFP_KERNEL);
557 error = -EAGAIN;
558 goto out;
559 }
560 spin_lock_irq(&idr_lock); 557 spin_lock_irq(&idr_lock);
561 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); 558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
562 spin_unlock_irq(&idr_lock); 559 spin_unlock_irq(&idr_lock);
563 if (error) { 560 idr_preload_end();
564 if (error == -EAGAIN) 561 if (error < 0) {
565 goto retry;
566 /* 562 /*
567 * Weird looking, but we return EAGAIN if the IDR is 563 * Weird looking, but we return EAGAIN if the IDR is
568 * full (proper POSIX return value for this) 564 * full (proper POSIX return value for this)
569 */ 565 */
570 error = -EAGAIN; 566 if (error == -ENOSPC)
567 error = -EAGAIN;
571 goto out; 568 goto out;
572 } 569 }
570 new_timer_id = error;
573 571
574 it_id_set = IT_ID_SET; 572 it_id_set = IT_ID_SET;
575 new_timer->it_id = (timer_t) new_timer_id; 573 new_timer->it_id = (timer_t) new_timer_id;
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
639{ 637{
640 struct k_itimer *timr; 638 struct k_itimer *timr;
641 639
640 /*
641 * timer_t could be any type >= int and we want to make sure any
642 * @timer_id outside positive int range fails lookup.
643 */
644 if ((unsigned long long)timer_id > INT_MAX)
645 return NULL;
646
642 rcu_read_lock(); 647 rcu_read_lock();
643 timr = idr_find(&posix_timers_id, (int)timer_id); 648 timr = idr_find(&posix_timers_id, (int)timer_id);
644 if (timr) { 649 if (timr) {
@@ -997,7 +1002,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
997 1002
998 err = kc->clock_adj(which_clock, &ktx); 1003 err = kc->clock_adj(which_clock, &ktx);
999 1004
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) 1005 if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT; 1006 return -EFAULT;
1002 1007
1003 return err; 1008 return err;
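
The timer_create() hunk switches from the old idr_pre_get()/idr_get_new() retry loop to the idr_preload()/idr_alloc() API. A minimal sketch of that allocation pattern under a spinlock; my_idr, my_lock, alloc_id() and obj are illustrative names, not part of the kernel:

    #include <linux/idr.h>
    #include <linux/spinlock.h>

    static DEFINE_IDR(my_idr);
    static DEFINE_SPINLOCK(my_lock);

    static int alloc_id(void *obj)
    {
            int id;

            idr_preload(GFP_KERNEL);        /* may sleep, done before locking */
            spin_lock_irq(&my_lock);
            id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
            spin_unlock_irq(&my_lock);
            idr_preload_end();

            /* idr_alloc() returns the new id or a negative errno; POSIX
             * wants EAGAIN when the id space is exhausted. */
            return (id == -ENOSPC) ? -EAGAIN : id;
    }

The retry label disappears because preloading guarantees the GFP_NOWAIT allocation only fails for a genuinely full IDR or for lack of memory, not for a missing preallocation.
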
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
66 66
67void queue_up_suspend_work(void) 67void queue_up_suspend_work(void)
68{ 68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) 69 if (autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work); 70 queue_work(autosleep_wq, &suspend_work);
71} 71}
72 72
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de1..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
313static suspend_state_t decode_state(const char *buf, size_t n) 313static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_STANDBY; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 const char * const *s;
318#endif 318#endif
319 char *p; 319 char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
553 553
554#endif /* CONFIG_PM_TRACE */ 554#endif /* CONFIG_PM_TRACE */
555 555
556#ifdef CONFIG_FREEZER
557static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
558 struct kobj_attribute *attr, char *buf)
559{
560 return sprintf(buf, "%u\n", freeze_timeout_msecs);
561}
562
563static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
564 struct kobj_attribute *attr,
565 const char *buf, size_t n)
566{
567 unsigned long val;
568
569 if (kstrtoul(buf, 10, &val))
570 return -EINVAL;
571
572 freeze_timeout_msecs = val;
573 return n;
574}
575
576power_attr(pm_freeze_timeout);
577
578#endif /* CONFIG_FREEZER*/
579
556static struct attribute * g[] = { 580static struct attribute * g[] = {
557 &state_attr.attr, 581 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 582#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
576 &pm_print_times_attr.attr, 600 &pm_print_times_attr.attr,
577#endif 601#endif
578#endif 602#endif
603#ifdef CONFIG_FREEZER
604 &pm_freeze_timeout_attr.attr,
605#endif
579 NULL, 606 NULL,
580}; 607};
581 608
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6f..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
25 25
26static int try_to_freeze_tasks(bool user_only) 26static int try_to_freeze_tasks(bool user_only)
27{ 27{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
36 36
37 do_gettimeofday(&start); 37 do_gettimeofday(&start);
38 38
39 end_time = jiffies + TIMEOUT; 39 end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
40 40
41 if (!user_only) 41 if (!user_only)
42 freeze_workqueues_begin(); 42 freeze_workqueues_begin();
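
The kernel/power/main.c and kernel/power/process.c hunks turn the fixed 20-second freezer timeout into the writable freeze_timeout_msecs knob exposed through a pm_freeze_timeout attribute. A small userspace sketch of tuning it, assuming the attribute shows up as /sys/power/pm_freeze_timeout on a kernel built with CONFIG_FREEZER:

    #include <stdio.h>

    int main(void)
    {
            /* Value is in milliseconds; raise the freezer timeout to 60 s. */
            FILE *f = fopen("/sys/power/pm_freeze_timeout", "w");

            if (!f) {
                    perror("pm_freeze_timeout");
                    return 1;
            }
            fprintf(f, "%u\n", 60 * 1000);
            return fclose(f) ? 1 : 0;
    }
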
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7eaad6..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
359 return; 359 return;
360 } 360 }
361 361
362 if (delayed_work_pending(&req->work)) 362 cancel_delayed_work_sync(&req->work);
363 cancel_delayed_work_sync(&req->work);
364 363
365 if (new_value != req->node.prio) 364 if (new_value != req->node.prio)
366 pm_qos_update_target( 365 pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
386 "%s called for unknown object.", __func__)) 385 "%s called for unknown object.", __func__))
387 return; 386 return;
388 387
389 if (delayed_work_pending(&req->work)) 388 cancel_delayed_work_sync(&req->work);
390 cancel_delayed_work_sync(&req->work);
391 389
392 if (new_value != req->node.prio) 390 if (new_value != req->node.prio)
393 pm_qos_update_target( 391 pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 return; 414 return;
417 } 415 }
418 416
419 if (delayed_work_pending(&req->work)) 417 cancel_delayed_work_sync(&req->work);
420 cancel_delayed_work_sync(&req->work);
421 418
422 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
423 &req->node, PM_QOS_REMOVE_REQ, 420 &req->node, PM_QOS_REMOVE_REQ,
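
The autosleep and pm_qos hunks drop the work_pending()/delayed_work_pending() guards because queue_work() and cancel_delayed_work_sync() already handle the not-queued case themselves, and testing first only adds a race window. A sketch of the resulting shape, with my_req and my_req_teardown() as illustrative names:

    #include <linux/workqueue.h>

    struct my_req {
            struct delayed_work work;
    };

    static void my_req_teardown(struct my_req *req)
    {
            /* No delayed_work_pending() check: this is a no-op if the work
             * was never queued, and it waits for the handler if it is
             * currently running. */
            cancel_delayed_work_sync(&req->work);
    }
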
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
30#include "power.h" 30#include "power.h"
31 31
32const char *const pm_states[PM_SUSPEND_MAX] = { 32const char *const pm_states[PM_SUSPEND_MAX] = {
33 [PM_SUSPEND_FREEZE] = "freeze",
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
36 37
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
40static bool need_suspend_ops(suspend_state_t state)
41{
42 return !!(state > PM_SUSPEND_FREEZE);
43}
44
45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
46static bool suspend_freeze_wake;
47
48static void freeze_begin(void)
49{
50 suspend_freeze_wake = false;
51}
52
53static void freeze_enter(void)
54{
55 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
56}
57
58void freeze_wake(void)
59{
60 suspend_freeze_wake = true;
61 wake_up(&suspend_freeze_wait_head);
62}
63EXPORT_SYMBOL_GPL(freeze_wake);
64
39/** 65/**
40 * suspend_set_ops - Set the global suspend method table. 66 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 67 * @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
50 76
51bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
52{ 78{
79 if (state == PM_SUSPEND_FREEZE)
80 return true;
53 /* 81 /*
54 * All states need lowlevel support and need to be valid to the lowlevel 82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel
55 * implementation, no valid callback implies that none are valid. 84 * implementation, no valid callback implies that none are valid.
56 */ 85 */
57 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); 86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 118 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes. 119 * freeze processes.
91 */ 120 */
92static int suspend_prepare(void) 121static int suspend_prepare(suspend_state_t state)
93{ 122{
94 int error; 123 int error;
95 124
96 if (!suspend_ops || !suspend_ops->enter) 125 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
97 return -EPERM; 126 return -EPERM;
98 127
99 pm_prepare_console(); 128 pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
137{ 166{
138 int error; 167 int error;
139 168
140 if (suspend_ops->prepare) { 169 if (need_suspend_ops(state) && suspend_ops->prepare) {
141 error = suspend_ops->prepare(); 170 error = suspend_ops->prepare();
142 if (error) 171 if (error)
143 goto Platform_finish; 172 goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
149 goto Platform_finish; 178 goto Platform_finish;
150 } 179 }
151 180
152 if (suspend_ops->prepare_late) { 181 if (need_suspend_ops(state) && suspend_ops->prepare_late) {
153 error = suspend_ops->prepare_late(); 182 error = suspend_ops->prepare_late();
154 if (error) 183 if (error)
155 goto Platform_wake; 184 goto Platform_wake;
156 } 185 }
157 186
187 /*
188 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors.
190 * Thus we should invoke freeze_enter() soon after
191 * all the devices are suspended.
192 */
193 if (state == PM_SUSPEND_FREEZE) {
194 freeze_enter();
195 goto Platform_wake;
196 }
197
158 if (suspend_test(TEST_PLATFORM)) 198 if (suspend_test(TEST_PLATFORM))
159 goto Platform_wake; 199 goto Platform_wake;
160 200
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
182 enable_nonboot_cpus(); 222 enable_nonboot_cpus();
183 223
184 Platform_wake: 224 Platform_wake:
185 if (suspend_ops->wake) 225 if (need_suspend_ops(state) && suspend_ops->wake)
186 suspend_ops->wake(); 226 suspend_ops->wake();
187 227
188 dpm_resume_start(PMSG_RESUME); 228 dpm_resume_start(PMSG_RESUME);
189 229
190 Platform_finish: 230 Platform_finish:
191 if (suspend_ops->finish) 231 if (need_suspend_ops(state) && suspend_ops->finish)
192 suspend_ops->finish(); 232 suspend_ops->finish();
193 233
194 return error; 234 return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
203 int error; 243 int error;
204 bool wakeup = false; 244 bool wakeup = false;
205 245
206 if (!suspend_ops) 246 if (need_suspend_ops(state) && !suspend_ops)
207 return -ENOSYS; 247 return -ENOSYS;
208 248
209 trace_machine_suspend(state); 249 trace_machine_suspend(state);
210 if (suspend_ops->begin) { 250 if (need_suspend_ops(state) && suspend_ops->begin) {
211 error = suspend_ops->begin(state); 251 error = suspend_ops->begin(state);
212 if (error) 252 if (error)
213 goto Close; 253 goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
226 266
227 do { 267 do {
228 error = suspend_enter(state, &wakeup); 268 error = suspend_enter(state, &wakeup);
229 } while (!error && !wakeup 269 } while (!error && !wakeup && need_suspend_ops(state)
230 && suspend_ops->suspend_again && suspend_ops->suspend_again()); 270 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 271
232 Resume_devices: 272 Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
236 ftrace_start(); 276 ftrace_start();
237 resume_console(); 277 resume_console();
238 Close: 278 Close:
239 if (suspend_ops->end) 279 if (need_suspend_ops(state) && suspend_ops->end)
240 suspend_ops->end(); 280 suspend_ops->end();
241 trace_machine_suspend(PWR_EVENT_EXIT); 281 trace_machine_suspend(PWR_EVENT_EXIT);
242 return error; 282 return error;
243 283
244 Recover_platform: 284 Recover_platform:
245 if (suspend_ops->recover) 285 if (need_suspend_ops(state) && suspend_ops->recover)
246 suspend_ops->recover(); 286 suspend_ops->recover();
247 goto Resume_devices; 287 goto Resume_devices;
248} 288}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
278 if (!mutex_trylock(&pm_mutex)) 318 if (!mutex_trylock(&pm_mutex))
279 return -EBUSY; 319 return -EBUSY;
280 320
321 if (state == PM_SUSPEND_FREEZE)
322 freeze_begin();
323
281 printk(KERN_INFO "PM: Syncing filesystems ... "); 324 printk(KERN_INFO "PM: Syncing filesystems ... ");
282 sys_sync(); 325 sys_sync();
283 printk("done.\n"); 326 printk("done.\n");
284 327
285 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 328 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
286 error = suspend_prepare(); 329 error = suspend_prepare(state);
287 if (error) 330 if (error)
288 goto Unlock; 331 goto Unlock;
289 332
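
The new PM_SUSPEND_FREEZE state parks the system in freeze_enter() until a wakeup source calls freeze_wake(), with no platform suspend_ops involved. A stripped-down model of that handshake, assuming only the generic wait-queue API; the my_* names are illustrative and the real code additionally re-checks wakeup conditions:

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_freeze_wq);
    static bool my_freeze_wake_flag;

    static void my_freeze_enter(void)
    {
            /* Sleeps with devices suspended and CPUs idle until woken. */
            wait_event(my_freeze_wq, my_freeze_wake_flag);
    }

    static void my_freeze_wake(void)        /* called from a wakeup IRQ */
    {
            my_freeze_wake_flag = true;
            wake_up(&my_freeze_wq);
    }
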
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac7..9b2a1d58558d 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
112 rtc_set_alarm(rtc, &alm); 112 rtc_set_alarm(rtc, &alm);
113} 113}
114 114
115static int __init has_wakealarm(struct device *dev, void *name_ptr) 115static int __init has_wakealarm(struct device *dev, const void *data)
116{ 116{
117 struct rtc_device *candidate = to_rtc_device(dev); 117 struct rtc_device *candidate = to_rtc_device(dev);
118 118
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
121 if (!device_may_wakeup(candidate->dev.parent)) 121 if (!device_may_wakeup(candidate->dev.parent))
122 return 0; 122 return 0;
123 123
124 *(const char **)name_ptr = dev_name(dev);
125 return 1; 124 return 1;
126} 125}
127 126
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
159 static char warn_no_rtc[] __initdata = 158 static char warn_no_rtc[] __initdata =
160 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; 159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
161 160
162 char *pony = NULL;
163 struct rtc_device *rtc = NULL; 161 struct rtc_device *rtc = NULL;
162 struct device *dev;
164 163
165 /* PM is initialized by now; is that state testable? */ 164 /* PM is initialized by now; is that state testable? */
166 if (test_state == PM_SUSPEND_ON) 165 if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
171 } 170 }
172 171
173 /* RTCs have initialized by now too ... can we use one? */ 172 /* RTCs have initialized by now too ... can we use one? */
174 class_find_device(rtc_class, NULL, &pony, has_wakealarm); 173 dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
175 if (pony) 174 if (dev)
176 rtc = rtc_class_open(pony); 175 rtc = rtc_class_open(dev_name(dev));
177 if (!rtc) { 176 if (!rtc) {
178 printk(warn_no_rtc); 177 printk(warn_no_rtc);
179 goto done; 178 goto done;
diff --git a/kernel/printk.c b/kernel/printk.c
index 357f714ddd49..0b31715f335a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h>
45 46
46#include <asm/uaccess.h> 47#include <asm/uaccess.h>
47 48
@@ -1967,30 +1968,32 @@ int is_console_locked(void)
1967static DEFINE_PER_CPU(int, printk_pending); 1968static DEFINE_PER_CPU(int, printk_pending);
1968static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); 1969static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1969 1970
1970void printk_tick(void) 1971static void wake_up_klogd_work_func(struct irq_work *irq_work)
1971{ 1972{
1972 if (__this_cpu_read(printk_pending)) { 1973 int pending = __this_cpu_xchg(printk_pending, 0);
1973 int pending = __this_cpu_xchg(printk_pending, 0); 1974
1974 if (pending & PRINTK_PENDING_SCHED) { 1975 if (pending & PRINTK_PENDING_SCHED) {
1975 char *buf = __get_cpu_var(printk_sched_buf); 1976 char *buf = __get_cpu_var(printk_sched_buf);
1976 printk(KERN_WARNING "[sched_delayed] %s", buf); 1977 printk(KERN_WARNING "[sched_delayed] %s", buf);
1977 }
1978 if (pending & PRINTK_PENDING_WAKEUP)
1979 wake_up_interruptible(&log_wait);
1980 } 1978 }
1981}
1982 1979
1983int printk_needs_cpu(int cpu) 1980 if (pending & PRINTK_PENDING_WAKEUP)
1984{ 1981 wake_up_interruptible(&log_wait);
1985 if (cpu_is_offline(cpu))
1986 printk_tick();
1987 return __this_cpu_read(printk_pending);
1988} 1982}
1989 1983
1984static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
1985 .func = wake_up_klogd_work_func,
1986 .flags = IRQ_WORK_LAZY,
1987};
1988
1990void wake_up_klogd(void) 1989void wake_up_klogd(void)
1991{ 1990{
1992 if (waitqueue_active(&log_wait)) 1991 preempt_disable();
1992 if (waitqueue_active(&log_wait)) {
1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1994 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
1995 }
1996 preempt_enable();
1994} 1997}
1995 1998
1996static void console_cont_flush(char *text, size_t size) 1999static void console_cont_flush(char *text, size_t size)
@@ -2471,6 +2474,7 @@ int printk_sched(const char *fmt, ...)
2471 va_end(args); 2474 va_end(args);
2472 2475
2473 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2476 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
2477 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2474 local_irq_restore(flags); 2478 local_irq_restore(flags);
2475 2479
2476 return r; 2480 return r;
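
printk's self-wakeup moves from the timer tick (printk_tick()/printk_needs_cpu()) to a lazy irq_work, which runs from a safe context on the next tick without keeping the tick alive just for it. A sketch of that deferral pattern, assuming the 3.9 irq_work API with IRQ_WORK_LAZY; klogd_poke, poke_work and poke_klogd_later() are illustrative names:

    #include <linux/irq_work.h>
    #include <linux/percpu.h>

    static void klogd_poke(struct irq_work *work)
    {
            /* Runs from irq-work context, where taking the wait-queue
             * lock to wake log readers is safe. */
    }

    static DEFINE_PER_CPU(struct irq_work, poke_work) = {
            .func  = klogd_poke,
            .flags = IRQ_WORK_LAZY,
    };

    static void poke_klogd_later(void)
    {
            preempt_disable();
            irq_work_queue(&__get_cpu_var(poke_work));
            preempt_enable();
    }
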
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42f..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@ struct profile_hit {
37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) 37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) 38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
39 39
40/* Oprofile timer tick hook */
41static int (*timer_hook)(struct pt_regs *) __read_mostly;
42
43static atomic_t *prof_buffer; 40static atomic_t *prof_buffer;
44static unsigned long prof_len, prof_shift; 41static unsigned long prof_len, prof_shift;
45 42
@@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
208} 205}
209EXPORT_SYMBOL_GPL(profile_event_unregister); 206EXPORT_SYMBOL_GPL(profile_event_unregister);
210 207
211int register_timer_hook(int (*hook)(struct pt_regs *))
212{
213 if (timer_hook)
214 return -EBUSY;
215 timer_hook = hook;
216 return 0;
217}
218EXPORT_SYMBOL_GPL(register_timer_hook);
219
220void unregister_timer_hook(int (*hook)(struct pt_regs *))
221{
222 WARN_ON(hook != timer_hook);
223 timer_hook = NULL;
224 /* make sure all CPUs see the NULL hook */
225 synchronize_sched(); /* Allow ongoing interrupts to complete. */
226}
227EXPORT_SYMBOL_GPL(unregister_timer_hook);
228
229
230#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
231/* 209/*
232 * Each cpu has a pair of open-addressed hashtables for pending 210 * Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@ void profile_tick(int type)
436{ 414{
437 struct pt_regs *regs = get_irq_regs(); 415 struct pt_regs *regs = get_irq_regs();
438 416
439 if (type == CPU_PROFILING && timer_hook)
440 timer_hook(regs);
441 if (!user_mode(regs) && prof_cpu_mask != NULL && 417 if (!user_mode(regs) && prof_cpu_mask != NULL &&
442 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) 418 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
443 profile_hit(type, (void *)profile_pc(regs)); 419 profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6cbeaae4406d..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
712 kiov->iov_len, kiov->iov_base); 712 kiov->iov_len, kiov->iov_base);
713} 713}
714 714
715/*
716 * This is declared in linux/regset.h and defined in machine-dependent
717 * code. We put the export here, near the primary machine-neutral use,
718 * to ensure no machine forgets it.
719 */
720EXPORT_SYMBOL_GPL(task_user_regset_view);
715#endif 721#endif
716 722
717int ptrace_request(struct task_struct *child, long request, 723int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 20dfba576c2b..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
111 111
112extern int rcu_expedited; 112extern int rcu_expedited;
113 113
114#ifdef CONFIG_RCU_STALL_COMMON
115
116extern int rcu_cpu_stall_suppress;
117int rcu_jiffies_till_stall_check(void);
118
119#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
120
114#endif /* __LINUX_RCU_H */ 121#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b44..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
405 405
406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) 407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
408 unsigned long secs,
409 unsigned long c_old, unsigned long c)
408{ 410{
409 trace_rcu_torture_read(rcutorturename, rhp); 411 trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
410} 412}
411EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); 413EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
412#else 414#else
413#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) 415#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
416 do { } while (0)
414#endif 417#endif
418
419#ifdef CONFIG_RCU_STALL_COMMON
420
421#ifdef CONFIG_PROVE_RCU
422#define RCU_STALL_DELAY_DELTA (5 * HZ)
423#else
424#define RCU_STALL_DELAY_DELTA 0
425#endif
426
427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
428int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
429
430module_param(rcu_cpu_stall_suppress, int, 0644);
431module_param(rcu_cpu_stall_timeout, int, 0644);
432
433int rcu_jiffies_till_stall_check(void)
434{
435 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
436
437 /*
438 * Limit check must be consistent with the Kconfig limits
439 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
440 */
441 if (till_stall_check < 3) {
442 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
443 till_stall_check = 3;
444 } else if (till_stall_check > 300) {
445 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
446 till_stall_check = 300;
447 }
448 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
449}
450
451static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
452{
453 rcu_cpu_stall_suppress = 1;
454 return NOTIFY_DONE;
455}
456
457static struct notifier_block rcu_panic_block = {
458 .notifier_call = rcu_panic,
459};
460
461static int __init check_cpu_stall_init(void)
462{
463 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
464 return 0;
465}
466early_initcall(check_cpu_stall_init);
467
468#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
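
rcu_jiffies_till_stall_check() clamps the rcu_cpu_stall_timeout module parameter to the 3..300 second range allowed for CONFIG_RCU_CPU_STALL_TIMEOUT and converts it to jiffies. A runnable userspace model of that arithmetic, with HZ assumed to be 1000 and RCU_STALL_DELAY_DELTA taken as 0 (its value without CONFIG_PROVE_RCU):

    #include <stdio.h>

    #define HZ 1000

    static int till_stall_jiffies(int timeout_s)
    {
            if (timeout_s < 3)
                    timeout_s = 3;
            else if (timeout_s > 300)
                    timeout_s = 300;
            return timeout_s * HZ;
    }

    int main(void)
    {
            printf("%d\n", till_stall_jiffies(21));  /* default: 21000 jiffies */
            printf("%d\n", till_stall_jiffies(0));   /* clamped up to 3000 */
            printf("%d\n", till_stall_jiffies(999)); /* clamped down to 300000 */
            return 0;
    }
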
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2a..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h"
55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 55
56#include "rcutiny_plugin.h"
57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
60{ 60{
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
193 * interrupts don't count, we must be running at the first interrupt 193 * interrupts don't count, we must be running at the first interrupt
194 * level. 194 * level.
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196static int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 1; 198 return rcu_dynticks_nesting <= 1;
199} 199}
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
205 */ 205 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 207{
208 reset_cpu_stall_ticks(rcp);
208 if (rcp->rcucblist != NULL && 209 if (rcp->rcucblist != NULL &&
209 rcp->donetail != rcp->curtail) { 210 rcp->donetail != rcp->curtail) {
210 rcp->donetail = rcp->curtail; 211 rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
251 */ 252 */
252void rcu_check_callbacks(int cpu, int user) 253void rcu_check_callbacks(int cpu, int user)
253{ 254{
255 check_cpu_stalls();
254 if (user || rcu_is_cpu_rrupt_from_idle()) 256 if (user || rcu_is_cpu_rrupt_from_idle())
255 rcu_sched_qs(cpu); 257 rcu_sched_qs(cpu);
256 else if (!in_softirq()) 258 else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309b..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
36 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(char *name); /* Name of RCU type. */
37}; 40};
38 41
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
54EXPORT_SYMBOL_GPL(rcu_scheduler_active); 57EXPORT_SYMBOL_GPL(rcu_scheduler_active);
55#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
56 59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
57#ifdef CONFIG_TINY_PREEMPT_RCU 105#ifdef CONFIG_TINY_PREEMPT_RCU
58 106
59#include <linux/delay.h> 107#include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
448 /* Official start of GP. */ 496 /* Official start of GP. */
449 rcu_preempt_ctrlblk.gpnum++; 497 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); 498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
451 500
452 /* Any blocked RCU readers block new GP. */ 501 /* Any blocked RCU readers block new GP. */
453 if (rcu_preempt_blocked_readers_any()) 502 if (rcu_preempt_blocked_readers_any())
@@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
1054MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1055MODULE_LICENSE("GPL"); 1104MODULE_LICENSE("GPL");
1056 1105
1106static void check_cpu_stall_preempt(void)
1107{
1108#ifdef CONFIG_TINY_PREEMPT_RCU
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
1111}
1112
1057#endif /* #ifdef CONFIG_RCU_TRACE */ 1113#endif /* #ifdef CONFIG_RCU_TRACE */
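
The tiny-RCU stall detector compares jiffies against rcp->jiffies_stall with ULONG_CMP_GE() rather than a plain >=, so the test stays correct when the jiffies counter wraps. A runnable userspace model, mirroring the ULONG_CMP_GE() definition in include/linux/rcupdate.h:

    #include <limits.h>
    #include <stdio.h>

    #define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

    int main(void)
    {
            unsigned long js = (unsigned long)-10;  /* deadline just before wrap */
            unsigned long j  = js + 20;             /* "now", after wrapping */

            printf("plain >=     : %d\n", j >= js);             /* 0: fooled */
            printf("ULONG_CMP_GE : %d\n", ULONG_CMP_GE(j, js)); /* 1: correct */
            return 0;
    }
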
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85fd..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h>
49#include <asm/byteorder.h> 50#include <asm/byteorder.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
207#define rcu_can_boost() 0 208#define rcu_can_boost() 0
208#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 209#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
209 210
211#ifdef CONFIG_RCU_TRACE
212static u64 notrace rcu_trace_clock_local(void)
213{
214 u64 ts = trace_clock_local();
215 unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
216 return ts;
217}
218#else /* #ifdef CONFIG_RCU_TRACE */
219static u64 notrace rcu_trace_clock_local(void)
220{
221 return 0ULL;
222}
223#endif /* #else #ifdef CONFIG_RCU_TRACE */
224
210static unsigned long shutdown_time; /* jiffies to system shutdown. */ 225static unsigned long shutdown_time; /* jiffies to system shutdown. */
211static unsigned long boost_starttime; /* jiffies of next boost test start. */ 226static unsigned long boost_starttime; /* jiffies of next boost test start. */
212DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 227DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg)
845 /* Wait for the next test interval. */ 860 /* Wait for the next test interval. */
846 oldstarttime = boost_starttime; 861 oldstarttime = boost_starttime;
847 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 862 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
848 schedule_timeout_uninterruptible(1); 863 schedule_timeout_interruptible(oldstarttime - jiffies);
849 rcu_stutter_wait("rcu_torture_boost"); 864 rcu_stutter_wait("rcu_torture_boost");
850 if (kthread_should_stop() || 865 if (kthread_should_stop() ||
851 fullstop != FULLSTOP_DONTSTOP) 866 fullstop != FULLSTOP_DONTSTOP)
@@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void)
1028 return; 1043 return;
1029 if (atomic_xchg(&beenhere, 1) != 0) 1044 if (atomic_xchg(&beenhere, 1) != 0)
1030 return; 1045 return;
1031 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
1032 ftrace_dump(DUMP_ALL); 1046 ftrace_dump(DUMP_ALL);
1033} 1047}
1034 1048
@@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
1042{ 1056{
1043 int idx; 1057 int idx;
1044 int completed; 1058 int completed;
1059 int completed_end;
1045 static DEFINE_RCU_RANDOM(rand); 1060 static DEFINE_RCU_RANDOM(rand);
1046 static DEFINE_SPINLOCK(rand_lock); 1061 static DEFINE_SPINLOCK(rand_lock);
1047 struct rcu_torture *p; 1062 struct rcu_torture *p;
1048 int pipe_count; 1063 int pipe_count;
1064 unsigned long long ts;
1049 1065
1050 idx = cur_ops->readlock(); 1066 idx = cur_ops->readlock();
1051 completed = cur_ops->completed(); 1067 completed = cur_ops->completed();
1068 ts = rcu_trace_clock_local();
1052 p = rcu_dereference_check(rcu_torture_current, 1069 p = rcu_dereference_check(rcu_torture_current,
1053 rcu_read_lock_bh_held() || 1070 rcu_read_lock_bh_held() ||
1054 rcu_read_lock_sched_held() || 1071 rcu_read_lock_sched_held() ||
@@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
1058 cur_ops->readunlock(idx); 1075 cur_ops->readunlock(idx);
1059 return; 1076 return;
1060 } 1077 }
1061 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1062 if (p->rtort_mbtest == 0) 1078 if (p->rtort_mbtest == 0)
1063 atomic_inc(&n_rcu_torture_mberror); 1079 atomic_inc(&n_rcu_torture_mberror);
1064 spin_lock(&rand_lock); 1080 spin_lock(&rand_lock);
@@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
1071 /* Should not happen, but... */ 1087 /* Should not happen, but... */
1072 pipe_count = RCU_TORTURE_PIPE_LEN; 1088 pipe_count = RCU_TORTURE_PIPE_LEN;
1073 } 1089 }
1074 if (pipe_count > 1) 1090 completed_end = cur_ops->completed();
1091 if (pipe_count > 1) {
1092 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1093 completed, completed_end);
1075 rcutorture_trace_dump(); 1094 rcutorture_trace_dump();
1095 }
1076 __this_cpu_inc(rcu_torture_count[pipe_count]); 1096 __this_cpu_inc(rcu_torture_count[pipe_count]);
1077 completed = cur_ops->completed() - completed; 1097 completed = completed_end - completed;
1078 if (completed > RCU_TORTURE_PIPE_LEN) { 1098 if (completed > RCU_TORTURE_PIPE_LEN) {
1079 /* Should not happen, but... */ 1099 /* Should not happen, but... */
1080 completed = RCU_TORTURE_PIPE_LEN; 1100 completed = RCU_TORTURE_PIPE_LEN;
@@ -1094,11 +1114,13 @@ static int
1094rcu_torture_reader(void *arg) 1114rcu_torture_reader(void *arg)
1095{ 1115{
1096 int completed; 1116 int completed;
1117 int completed_end;
1097 int idx; 1118 int idx;
1098 DEFINE_RCU_RANDOM(rand); 1119 DEFINE_RCU_RANDOM(rand);
1099 struct rcu_torture *p; 1120 struct rcu_torture *p;
1100 int pipe_count; 1121 int pipe_count;
1101 struct timer_list t; 1122 struct timer_list t;
1123 unsigned long long ts;
1102 1124
1103 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 1125 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
1104 set_user_nice(current, 19); 1126 set_user_nice(current, 19);
@@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg)
1112 } 1134 }
1113 idx = cur_ops->readlock(); 1135 idx = cur_ops->readlock();
1114 completed = cur_ops->completed(); 1136 completed = cur_ops->completed();
1137 ts = rcu_trace_clock_local();
1115 p = rcu_dereference_check(rcu_torture_current, 1138 p = rcu_dereference_check(rcu_torture_current,
1116 rcu_read_lock_bh_held() || 1139 rcu_read_lock_bh_held() ||
1117 rcu_read_lock_sched_held() || 1140 rcu_read_lock_sched_held() ||
@@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg)
1122 schedule_timeout_interruptible(HZ); 1145 schedule_timeout_interruptible(HZ);
1123 continue; 1146 continue;
1124 } 1147 }
1125 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1126 if (p->rtort_mbtest == 0) 1148 if (p->rtort_mbtest == 0)
1127 atomic_inc(&n_rcu_torture_mberror); 1149 atomic_inc(&n_rcu_torture_mberror);
1128 cur_ops->read_delay(&rand); 1150 cur_ops->read_delay(&rand);
@@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg)
1132 /* Should not happen, but... */ 1154 /* Should not happen, but... */
1133 pipe_count = RCU_TORTURE_PIPE_LEN; 1155 pipe_count = RCU_TORTURE_PIPE_LEN;
1134 } 1156 }
1135 if (pipe_count > 1) 1157 completed_end = cur_ops->completed();
1158 if (pipe_count > 1) {
1159 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1160 ts, completed, completed_end);
1136 rcutorture_trace_dump(); 1161 rcutorture_trace_dump();
1162 }
1137 __this_cpu_inc(rcu_torture_count[pipe_count]); 1163 __this_cpu_inc(rcu_torture_count[pipe_count]);
1138 completed = cur_ops->completed() - completed; 1164 completed = completed_end - completed;
1139 if (completed > RCU_TORTURE_PIPE_LEN) { 1165 if (completed > RCU_TORTURE_PIPE_LEN) {
1140 /* Should not happen, but... */ 1166 /* Should not happen, but... */
1141 completed = RCU_TORTURE_PIPE_LEN; 1167 completed = RCU_TORTURE_PIPE_LEN;
@@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
1301 set_cpus_allowed_ptr(reader_tasks[i], 1327 set_cpus_allowed_ptr(reader_tasks[i],
1302 shuffle_tmp_mask); 1328 shuffle_tmp_mask);
1303 } 1329 }
1304
1305 if (fakewriter_tasks) { 1330 if (fakewriter_tasks) {
1306 for (i = 0; i < nfakewriters; i++) 1331 for (i = 0; i < nfakewriters; i++)
1307 if (fakewriter_tasks[i]) 1332 if (fakewriter_tasks[i])
1308 set_cpus_allowed_ptr(fakewriter_tasks[i], 1333 set_cpus_allowed_ptr(fakewriter_tasks[i],
1309 shuffle_tmp_mask); 1334 shuffle_tmp_mask);
1310 } 1335 }
1311
1312 if (writer_task) 1336 if (writer_task)
1313 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); 1337 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1314
1315 if (stats_task) 1338 if (stats_task)
1316 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); 1339 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1340 if (stutter_task)
1341 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1342 if (fqs_task)
1343 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1344 if (shutdown_task)
1345 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1346#ifdef CONFIG_HOTPLUG_CPU
1347 if (onoff_task)
1348 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1349#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1350 if (stall_task)
1351 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1352 if (barrier_cbs_tasks)
1353 for (i = 0; i < n_barrier_cbs; i++)
1354 if (barrier_cbs_tasks[i])
1355 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1356 shuffle_tmp_mask);
1357 if (barrier_task)
1358 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1317 1359
1318 if (rcu_idle_cpu == -1) 1360 if (rcu_idle_cpu == -1)
1319 rcu_idle_cpu = num_online_cpus() - 1; 1361 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
1749 barrier_cbs_wq = 1791 barrier_cbs_wq =
1750 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), 1792 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1751 GFP_KERNEL); 1793 GFP_KERNEL);
1752 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) 1794 if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
1753 return -ENOMEM; 1795 return -ENOMEM;
1754 for (i = 0; i < n_barrier_cbs; i++) { 1796 for (i = 0; i < n_barrier_cbs; i++) {
1755 init_waitqueue_head(&barrier_cbs_wq[i]); 1797 init_waitqueue_head(&barrier_cbs_wq[i]);
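
rcu_trace_clock_local() feeds a nanosecond trace_clock_local() timestamp through do_div() to obtain microseconds, keeping the remainder only to satisfy do_div()'s interface. A runnable userspace model of those semantics; do_div_model() is illustrative, standing in for the kernel macro that divides a 64-bit value in place by a 32-bit base and returns the remainder:

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_USEC 1000u

    static uint64_t do_div_model(uint64_t *n, uint32_t base)
    {
            uint64_t rem = *n % base;

            *n /= base;
            return rem;
    }

    int main(void)
    {
            uint64_t ts = 1234567899ULL;                 /* ns from the clock */
            uint64_t rem = do_div_model(&ts, NSEC_PER_USEC);

            printf("%llu us (+%llu ns dropped)\n",
                   (unsigned long long)ts, (unsigned long long)rem);
            return 0;
    }
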
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614e..5b8ad827fd86 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
105 * The rcu_scheduler_active variable transitions from zero to one just 105 * The rcu_scheduler_active variable transitions from zero to one just
106 * before the first task is spawned. So when this variable is zero, RCU 106 * before the first task is spawned. So when this variable is zero, RCU
107 * can assume that there is but one task, allowing RCU to (for example) 107 * can assume that there is but one task, allowing RCU to (for example)
108 * optimized synchronize_sched() to a simple barrier(). When this variable 108 * optimize synchronize_sched() to a simple barrier(). When this variable
109 * is one, RCU must actually do all the hard work required to detect real 109 * is one, RCU must actually do all the hard work required to detect real
110 * grace periods. This variable is also used to suppress boot-time false 110 * grace periods. This variable is also used to suppress boot-time false
111 * positives from lockdep-RCU error checking. 111 * positives from lockdep-RCU error checking.
@@ -217,12 +217,6 @@ module_param(blimit, long, 0444);
217module_param(qhimark, long, 0444); 217module_param(qhimark, long, 0444);
218module_param(qlowmark, long, 0444); 218module_param(qlowmark, long, 0444);
219 219
220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
222
223module_param(rcu_cpu_stall_suppress, int, 0644);
224module_param(rcu_cpu_stall_timeout, int, 0644);
225
226static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 220static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
227static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
228 222
@@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305} 299}
306 300
307/* 301/*
308 * Does the current CPU require a yet-as-unscheduled grace period? 302 * Does the current CPU require a not-yet-started grace period?
303 * The caller must have disabled interrupts to prevent races with
304 * normal callback registry.
309 */ 305 */
310static int 306static int
311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 307cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
312{ 308{
313 struct rcu_head **ntp; 309 int i;
314 310
315 ntp = rdp->nxttail[RCU_DONE_TAIL + 311 if (rcu_gp_in_progress(rsp))
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)]; 312 return 0; /* No, a grace period is already in progress. */
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && 313 if (!rdp->nxttail[RCU_NEXT_TAIL])
318 !rcu_gp_in_progress(rsp); 314 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
316 return 1; /* Yes, this CPU has newly registered callbacks. */
317 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
318 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
319 ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
320 rdp->nxtcompleted[i]))
321 return 1; /* Yes, CBs for future grace period. */
322 return 0; /* No grace period needed. */
319} 323}
320 324
321/* 325/*
@@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
336static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 340static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
337 bool user) 341 bool user)
338{ 342{
339 trace_rcu_dyntick("Start", oldval, 0); 343 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
340 if (!user && !is_idle_task(current)) { 344 if (!user && !is_idle_task(current)) {
341 struct task_struct *idle = idle_task(smp_processor_id()); 345 struct task_struct *idle = idle_task(smp_processor_id());
342 346
@@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
727 * interrupt from idle, return true. The caller must have at least 731 * interrupt from idle, return true. The caller must have at least
728 * disabled preemption. 732 * disabled preemption.
729 */ 733 */
730int rcu_is_cpu_rrupt_from_idle(void) 734static int rcu_is_cpu_rrupt_from_idle(void)
731{ 735{
732 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 736 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
733} 737}
@@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
793 return 0; 797 return 0;
794} 798}
795 799
796static int jiffies_till_stall_check(void)
797{
798 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
799
800 /*
801 * Limit check must be consistent with the Kconfig limits
802 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
803 */
804 if (till_stall_check < 3) {
805 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
806 till_stall_check = 3;
807 } else if (till_stall_check > 300) {
808 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
809 till_stall_check = 300;
810 }
811 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
812}
813
814static void record_gp_stall_check_time(struct rcu_state *rsp) 800static void record_gp_stall_check_time(struct rcu_state *rsp)
815{ 801{
816 rsp->gp_start = jiffies; 802 rsp->gp_start = jiffies;
817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 803 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
818} 804}
819 805
820/* 806/*
@@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
857 raw_spin_unlock_irqrestore(&rnp->lock, flags); 843 raw_spin_unlock_irqrestore(&rnp->lock, flags);
858 return; 844 return;
859 } 845 }
860 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; 846 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
861 raw_spin_unlock_irqrestore(&rnp->lock, flags); 847 raw_spin_unlock_irqrestore(&rnp->lock, flags);
862 848
863 /* 849 /*
@@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
935 raw_spin_lock_irqsave(&rnp->lock, flags); 921 raw_spin_lock_irqsave(&rnp->lock, flags);
936 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 922 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
937 rsp->jiffies_stall = jiffies + 923 rsp->jiffies_stall = jiffies +
938 3 * jiffies_till_stall_check() + 3; 924 3 * rcu_jiffies_till_stall_check() + 3;
939 raw_spin_unlock_irqrestore(&rnp->lock, flags); 925 raw_spin_unlock_irqrestore(&rnp->lock, flags);
940 926
941 set_need_resched(); /* kick ourselves to get things going. */ 927 set_need_resched(); /* kick ourselves to get things going. */
@@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
966 } 952 }
967} 953}
968 954
969static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
970{
971 rcu_cpu_stall_suppress = 1;
972 return NOTIFY_DONE;
973}
974
975/** 955/**
976 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 956 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
977 * 957 *
@@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void)
989 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 969 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
990} 970}
991 971
992static struct notifier_block rcu_panic_block = {
993 .notifier_call = rcu_panic,
994};
995
996static void __init check_cpu_stall_init(void)
997{
998 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
999}
1000
1001/* 972/*
1002 * Update CPU-local rcu_data state to record the newly noticed grace period. 973 * Update CPU-local rcu_data state to record the newly noticed grace period.
1003 * This is used both when we started the grace period and when we notice 974 * This is used both when we started the grace period and when we notice
@@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp)
1071} 1042}
1072 1043
1073/* 1044/*
1045 * Determine the value that ->completed will have at the end of the
1046 * next subsequent grace period. This is used to tag callbacks so that
1047 * a CPU can invoke callbacks in a timely fashion even if that CPU has
1048 * been dyntick-idle for an extended period with callbacks under the
1049 * influence of RCU_FAST_NO_HZ.
1050 *
1051 * The caller must hold rnp->lock with interrupts disabled.
1052 */
1053static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1054 struct rcu_node *rnp)
1055{
1056 /*
1057 * If RCU is idle, we just wait for the next grace period.
1058 * But we can only be sure that RCU is idle if we are looking
1059 * at the root rcu_node structure -- otherwise, a new grace
1060 * period might have started, but just not yet gotten around
1061 * to initializing the current non-root rcu_node structure.
1062 */
1063 if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
1064 return rnp->completed + 1;
1065
1066 /*
1067 * Otherwise, wait for a possible partial grace period and
1068 * then the subsequent full grace period.
1069 */
1070 return rnp->completed + 2;
1071}
1072
1073/*
1074 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has
1077 * since proven to be too conservative, which can happen if callbacks get
1078 * assigned a ->completed number while RCU is idle, but with reference to
1079 * a non-root rcu_node structure. This function is idempotent, so it does
1080 * not hurt to call it repeatedly.
1081 *
1082 * The caller must hold rnp->lock with interrupts disabled.
1083 */
1084static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1085 struct rcu_data *rdp)
1086{
1087 unsigned long c;
1088 int i;
1089
1090 /* If the CPU has no callbacks, nothing to do. */
1091 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1092 return;
1093
1094 /*
1095 * Starting from the sublist containing the callbacks most
1096 * recently assigned a ->completed number and working down, find the
1097 * first sublist that is not assignable to an upcoming grace period.
1098 * Such a sublist has something in it (first two tests) and has
1099 * a ->completed number assigned that will complete sooner than
1100 * the ->completed number for newly arrived callbacks (last test).
1101 *
1102 * The key point is that any later sublist can be assigned the
1103 * same ->completed number as the newly arrived callbacks, which
 1104 * means that the callbacks in any of these later sublists can be
1105 * grouped into a single sublist, whether or not they have already
1106 * been assigned a ->completed number.
1107 */
1108 c = rcu_cbs_completed(rsp, rnp);
1109 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1110 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1111 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1112 break;
1113
1114 /*
 1115 * If there is no sublist for unassigned callbacks, leave.
 1116 * At the same time, advance "i" one sublist, so that "i" will
 1117 * index into the sublist into which all the remaining callbacks
 1118 * should be grouped.
1119 */
1120 if (++i >= RCU_NEXT_TAIL)
1121 return;
1122
1123 /*
1124 * Assign all subsequent callbacks' ->completed number to the next
1125 * full grace period and group them all in the sublist initially
1126 * indexed by "i".
1127 */
1128 for (; i <= RCU_NEXT_TAIL; i++) {
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c;
1131 }
1132
1133 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1135 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
1136 else
1137 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
1138}
1139
1140/*
1141 * Move any callbacks whose grace period has completed to the
1142 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1143 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1144 * sublist. This function is idempotent, so it does not hurt to
1145 * invoke it repeatedly. As long as it is not invoked -too- often...
1146 *
1147 * The caller must hold rnp->lock with interrupts disabled.
1148 */
1149static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1150 struct rcu_data *rdp)
1151{
1152 int i, j;
1153
1154 /* If the CPU has no callbacks, nothing to do. */
1155 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1156 return;
1157
1158 /*
1159 * Find all callbacks whose ->completed numbers indicate that they
1160 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1161 */
1162 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
1163 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1164 break;
1165 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1166 }
1167 /* Clean up any sublist tail pointers that were misordered above. */
1168 for (j = RCU_WAIT_TAIL; j < i; j++)
1169 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1170
1171 /* Copy down callbacks to fill in empty sublists. */
1172 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1173 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1174 break;
1175 rdp->nxttail[j] = rdp->nxttail[i];
1176 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1177 }
1178
1179 /* Classify any remaining callbacks. */
1180 rcu_accelerate_cbs(rsp, rnp, rdp);
1181}
1182
1183/*
1074 * Advance this CPU's callbacks, but only if the current grace period 1184 * Advance this CPU's callbacks, but only if the current grace period
1075 * has ended. This may be called only from the CPU to whom the rdp 1185 * has ended. This may be called only from the CPU to whom the rdp
1076 * belongs. In addition, the corresponding leaf rcu_node structure's 1186 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1080,12 +1190,15 @@ static void
1080__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1190__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1081{ 1191{
1082 /* Did another grace period end? */ 1192 /* Did another grace period end? */
1083 if (rdp->completed != rnp->completed) { 1193 if (rdp->completed == rnp->completed) {
1084 1194
1085 /* Advance callbacks. No harm if list empty. */ 1195 /* No, so just accelerate recent callbacks. */
1086 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; 1196 rcu_accelerate_cbs(rsp, rnp, rdp);
1087 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; 1197
1088 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1198 } else {
1199
1200 /* Advance callbacks. */
1201 rcu_advance_cbs(rsp, rnp, rdp);
1089 1202
1090 /* Remember that we saw this grace-period completion. */ 1203 /* Remember that we saw this grace-period completion. */
1091 rdp->completed = rnp->completed; 1204 rdp->completed = rnp->completed;
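
rcu_cbs_completed() above decides which future ->completed value newly queued callbacks can be tagged with: one grace period ahead if the root rcu_node shows RCU idle, two otherwise to allow for a partial grace period already under way. A runnable userspace model of just that decision; is_root and the counters stand in for the rcu_node fields:

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned long cbs_completed(bool is_root, unsigned long gpnum,
                                       unsigned long completed)
    {
            /* Root view with gpnum == completed means RCU is idle, so the
             * very next grace period is enough. */
            if (is_root && gpnum == completed)
                    return completed + 1;

            /* Otherwise allow for a possible partial grace period plus one
             * full grace period. */
            return completed + 2;
    }

    int main(void)
    {
            printf("%lu\n", cbs_completed(true, 4, 4));   /* idle root: 5 */
            printf("%lu\n", cbs_completed(false, 5, 4));  /* non-root: 6 */
            return 0;
    }
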
@@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1392 /* 1505 /*
1393 * Because there is no grace period in progress right now, 1506 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied 1507 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be 1508 * by the next grace period. So this is a good place to
1396 * handled after the end of the next grace period. If the 1509 * assign a grace period number to recently posted callbacks.
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */ 1510 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406 1512
1407 rsp->gp_flags = RCU_GP_FLAG_INIT; 1513 rsp->gp_flags = RCU_GP_FLAG_INIT;
1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ 1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
@@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1527 * This GP can't end until cpu checks in, so all of our 1633 * This GP can't end until cpu checks in, so all of our
1528 * callbacks can be processed during the next GP. 1634 * callbacks can be processed during the next GP.
1529 */ 1635 */
1530 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1636 rcu_accelerate_cbs(rsp, rnp, rdp);
1531 1637
1532 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1638 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1533 } 1639 }
@@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1779 long bl, count, count_lazy; 1885 long bl, count, count_lazy;
1780 int i; 1886 int i;
1781 1887
1782 /* If no callbacks are ready, just return.*/ 1888 /* If no callbacks are ready, just return. */
1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1889 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1784 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 1890 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1785 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1891 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2008 2114
2009 WARN_ON_ONCE(rdp->beenonline == 0); 2115 WARN_ON_ONCE(rdp->beenonline == 0);
2010 2116
2011 /* 2117 /* Handle the end of a grace period that some other CPU ended. */
2012 * Advance callbacks in response to end of earlier grace
2013 * period that some other CPU ended.
2014 */
2015 rcu_process_gp_end(rsp, rdp); 2118 rcu_process_gp_end(rsp, rdp);
2016 2119
2017 /* Update RCU state based on any recent quiescent states. */ 2120 /* Update RCU state based on any recent quiescent states. */
2018 rcu_check_quiescent_state(rsp, rdp); 2121 rcu_check_quiescent_state(rsp, rdp);
2019 2122
2020 /* Does this CPU require a not-yet-started grace period? */ 2123 /* Does this CPU require a not-yet-started grace period? */
2124 local_irq_save(flags);
2021 if (cpu_needs_another_gp(rsp, rdp)) { 2125 if (cpu_needs_another_gp(rsp, rdp)) {
2022 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2023 rcu_start_gp(rsp, flags); /* releases above lock */ 2127 rcu_start_gp(rsp, flags); /* releases above lock */
2128 } else {
2129 local_irq_restore(flags);
2024 } 2130 }
2025 2131
2026 /* If there are callbacks ready, invoke them. */ 2132 /* If there are callbacks ready, invoke them. */
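The hunk above also changes the locking discipline: interrupts are disabled up front, the cheap cpu_needs_another_gp() test runs with them off, and the global root rcu_node lock is only taken when a new grace period really has to be started. In isolation the pattern looks roughly like this (hedged sketch; big_lock, work_pending and start_work() are invented stand-ins):

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(big_lock);
static bool work_pending;

/* Assumed to drop big_lock and restore 'flags', like rcu_start_gp() does
 * with the root rcu_node lock.
 */
static void start_work(unsigned long flags)
{
        work_pending = false;
        raw_spin_unlock_irqrestore(&big_lock, flags);
}

static void maybe_start_work(void)
{
        unsigned long flags;

        local_irq_save(flags);
        if (work_pending) {                     /* cheap, lock-free check     */
                raw_spin_lock(&big_lock);       /* irqs are already disabled  */
                start_work(flags);              /* drops lock, restores irqs  */
        } else {
                local_irq_restore(flags);
        }
}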
@@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2719 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2825 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2720 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2826 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2721 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2827 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2722#ifdef CONFIG_RCU_USER_QS
2723 WARN_ON_ONCE(rdp->dynticks->in_user);
2724#endif
2725 rdp->cpu = cpu; 2828 rdp->cpu = cpu;
2726 rdp->rsp = rsp; 2829 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp); 2830 rcu_boot_init_nocb_percpu_data(rdp);
@@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2938 3041
2939 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 3042 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
2940 3043
3044 /* Silence gcc 4.8 warning about array index out of range. */
3045 if (rcu_num_lvls > RCU_NUM_LVLS)
3046 panic("rcu_init_one: rcu_num_lvls overflow");
3047
2941 /* Initialize the level-tracking arrays. */ 3048 /* Initialize the level-tracking arrays. */
2942 3049
2943 for (i = 0; i < rcu_num_lvls; i++) 3050 for (i = 0; i < rcu_num_lvls; i++)
@@ -3074,7 +3181,6 @@ void __init rcu_init(void)
3074 cpu_notifier(rcu_cpu_notify, 0); 3181 cpu_notifier(rcu_cpu_notify, 0);
3075 for_each_online_cpu(cpu) 3182 for_each_online_cpu(cpu)
3076 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3183 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3077 check_cpu_stall_init();
3078} 3184}
3079 3185
3080#include "rcutree_plugin.h" 3186#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093d..c896b5045d9d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@ struct rcu_dynticks {
102 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
109}; 105};
110 106
111/* RCU's kthread states for tracing. */ 107/* RCU's kthread states for tracing. */
@@ -282,6 +278,8 @@ struct rcu_data {
282 */ 278 */
283 struct rcu_head *nxtlist; 279 struct rcu_head *nxtlist;
284 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 280 struct rcu_head **nxttail[RCU_NEXT_SIZE];
281 unsigned long nxtcompleted[RCU_NEXT_SIZE];
282 /* grace periods for sublists. */
285 long qlen_lazy; /* # of lazy queued callbacks */ 283 long qlen_lazy; /* # of lazy queued callbacks */
286 long qlen; /* # of queued callbacks, incl lazy */ 284 long qlen; /* # of queued callbacks, incl lazy */
287 long qlen_last_fqs_check; 285 long qlen_last_fqs_check;
@@ -343,11 +341,6 @@ struct rcu_data {
343 341
344#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 342#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
345 343
346#ifdef CONFIG_PROVE_RCU
347#define RCU_STALL_DELAY_DELTA (5 * HZ)
348#else
349#define RCU_STALL_DELAY_DELTA 0
350#endif
351#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 344#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
352 /* to take at least one */ 345 /* to take at least one */
353 /* scheduling clock irq */ 346 /* scheduling clock irq */
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abbd..01ab081ac53a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1139 if (!desc->count) 1139 if (!desc->count)
1140 return 0; 1140 return 0;
1141 1141
1142 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 1142 mutex_lock(&file_inode(filp)->i_mutex);
1143 do { 1143 do {
1144 if (!relay_file_read_avail(buf, *ppos)) 1144 if (!relay_file_read_avail(buf, *ppos))
1145 break; 1145 break;
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1159 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1159 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1160 } 1160 }
1161 } while (desc->count && ret); 1161 } while (desc->count && ret);
1162 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 1162 mutex_unlock(&file_inode(filp)->i_mutex);
1163 1163
1164 return desc->written; 1164 return desc->written;
1165} 1165}
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
17 * See rt.c in preempt-rt for proper credits and further information 17 * See rt.c in preempt-rt for proper credits and further information
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sched/rt.h>
20#include <linux/delay.h> 21#include <linux/delay.h>
21#include <linux/export.h> 22#include <linux/export.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..7890b10084a7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/sched/rt.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14#include <linux/timer.h> 15#include <linux/timer.h>
15#include <linux/freezer.h> 16#include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h>
16#include <linux/timer.h> 17#include <linux/timer.h>
17 18
18#include "rtmutex_common.h" 19#include "rtmutex_common.h"
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
35 ag->tg->rt_se = NULL; 35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL; 36 ag->tg->rt_rq = NULL;
37#endif 37#endif
38 sched_offline_group(ag->tg);
38 sched_destroy_group(ag->tg); 39 sched_destroy_group(ag->tg);
39} 40}
40 41
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
76 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
77 goto out_free; 78 goto out_free;
78 79
80 sched_online_group(tg, &root_task_group);
81
79 kref_init(&ag->kref); 82 kref_init(&ag->kref);
80 init_rwsem(&ag->lock); 83 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr); 84 ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26058d0bebba..7f12624a393c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
83#endif 83#endif
84 84
85#include "sched.h" 85#include "sched.h"
86#include "../workqueue_sched.h" 86#include "../workqueue_internal.h"
87#include "../smpboot.h" 87#include "../smpboot.h"
88 88
89#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
1132 */ 1132 */
1133static int select_fallback_rq(int cpu, struct task_struct *p) 1133static int select_fallback_rq(int cpu, struct task_struct *p)
1134{ 1134{
1135 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1135 int nid = cpu_to_node(cpu);
1136 const struct cpumask *nodemask = NULL;
1136 enum { cpuset, possible, fail } state = cpuset; 1137 enum { cpuset, possible, fail } state = cpuset;
1137 int dest_cpu; 1138 int dest_cpu;
1138 1139
1139 /* Look for allowed, online CPU in same node. */ 1140 /*
1140 for_each_cpu(dest_cpu, nodemask) { 1141 * If the node that the cpu is on has been offlined, cpu_to_node()
1141 if (!cpu_online(dest_cpu)) 1142 * will return -1. There is no cpu on the node, and we should
1142 continue; 1143 * select the cpu on the other node.
1143 if (!cpu_active(dest_cpu)) 1144 */
1144 continue; 1145 if (nid != -1) {
1145 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1146 nodemask = cpumask_of_node(nid);
1146 return dest_cpu; 1147
1148 /* Look for allowed, online CPU in same node. */
1149 for_each_cpu(dest_cpu, nodemask) {
1150 if (!cpu_online(dest_cpu))
1151 continue;
1152 if (!cpu_active(dest_cpu))
1153 continue;
1154 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1155 return dest_cpu;
1156 }
1147 } 1157 }
1148 1158
1149 for (;;) { 1159 for (;;) {
@@ -1742,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1742static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1752static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1743{ 1753{
1744 struct preempt_notifier *notifier; 1754 struct preempt_notifier *notifier;
1745 struct hlist_node *node;
1746 1755
1747 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1756 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1748 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1757 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1749} 1758}
1750 1759
@@ -1753,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
1753 struct task_struct *next) 1762 struct task_struct *next)
1754{ 1763{
1755 struct preempt_notifier *notifier; 1764 struct preempt_notifier *notifier;
1756 struct hlist_node *node;
1757 1765
1758 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1766 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1759 notifier->ops->sched_out(notifier, next); 1767 notifier->ops->sched_out(notifier, next);
1760} 1768}
1761 1769
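The two preempt-notifier hunks above are fallout from the 3.9 tree-wide change that dropped the separate struct hlist_node cursor from hlist_for_each_entry(): the loop variable is now the containing object itself. A hypothetical user of the new three-argument form (names invented for illustration):

#include <linux/list.h>
#include <linux/printk.h>

struct waiter {
        int id;
        struct hlist_node link;
};

static void print_waiters(struct hlist_head *head)
{
        struct waiter *w;

        /* old form was: hlist_for_each_entry(w, node, head, link) */
        hlist_for_each_entry(w, head, link)
                pr_info("waiter %d is queued\n", w->id);
}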
@@ -1969,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
1969} 1977}
1970 1978
1971/* 1979/*
1972 * nr_running, nr_uninterruptible and nr_context_switches: 1980 * nr_running and nr_context_switches:
1973 * 1981 *
1974 * externally visible scheduler statistics: current number of runnable 1982 * externally visible scheduler statistics: current number of runnable
1975 * threads, current number of uninterruptible-sleeping threads, total 1983 * threads, total number of context switches performed since bootup.
1976 * number of context switches performed since bootup.
1977 */ 1984 */
1978unsigned long nr_running(void) 1985unsigned long nr_running(void)
1979{ 1986{
@@ -1985,23 +1992,6 @@ unsigned long nr_running(void)
1985 return sum; 1992 return sum;
1986} 1993}
1987 1994
1988unsigned long nr_uninterruptible(void)
1989{
1990 unsigned long i, sum = 0;
1991
1992 for_each_possible_cpu(i)
1993 sum += cpu_rq(i)->nr_uninterruptible;
1994
1995 /*
1996 * Since we read the counters lockless, it might be slightly
1997 * inaccurate. Do not allow it to go below zero though:
1998 */
1999 if (unlikely((long)sum < 0))
2000 sum = 0;
2001
2002 return sum;
2003}
2004
2005unsigned long long nr_context_switches(void) 1995unsigned long long nr_context_switches(void)
2006{ 1996{
2007 int i; 1997 int i;
@@ -2786,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
2786 if (irqs_disabled()) 2776 if (irqs_disabled())
2787 print_irqtrace_events(prev); 2777 print_irqtrace_events(prev);
2788 dump_stack(); 2778 dump_stack();
2789 add_taint(TAINT_WARN); 2779 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2790} 2780}
2791 2781
2792/* 2782/*
@@ -3268,7 +3258,8 @@ void complete_all(struct completion *x)
3268EXPORT_SYMBOL(complete_all); 3258EXPORT_SYMBOL(complete_all);
3269 3259
3270static inline long __sched 3260static inline long __sched
3271do_wait_for_common(struct completion *x, long timeout, int state) 3261do_wait_for_common(struct completion *x,
3262 long (*action)(long), long timeout, int state)
3272{ 3263{
3273 if (!x->done) { 3264 if (!x->done) {
3274 DECLARE_WAITQUEUE(wait, current); 3265 DECLARE_WAITQUEUE(wait, current);
@@ -3281,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3281 } 3272 }
3282 __set_current_state(state); 3273 __set_current_state(state);
3283 spin_unlock_irq(&x->wait.lock); 3274 spin_unlock_irq(&x->wait.lock);
3284 timeout = schedule_timeout(timeout); 3275 timeout = action(timeout);
3285 spin_lock_irq(&x->wait.lock); 3276 spin_lock_irq(&x->wait.lock);
3286 } while (!x->done && timeout); 3277 } while (!x->done && timeout);
3287 __remove_wait_queue(&x->wait, &wait); 3278 __remove_wait_queue(&x->wait, &wait);
@@ -3292,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3292 return timeout ?: 1; 3283 return timeout ?: 1;
3293} 3284}
3294 3285
3295static long __sched 3286static inline long __sched
3296wait_for_common(struct completion *x, long timeout, int state) 3287__wait_for_common(struct completion *x,
3288 long (*action)(long), long timeout, int state)
3297{ 3289{
3298 might_sleep(); 3290 might_sleep();
3299 3291
3300 spin_lock_irq(&x->wait.lock); 3292 spin_lock_irq(&x->wait.lock);
3301 timeout = do_wait_for_common(x, timeout, state); 3293 timeout = do_wait_for_common(x, action, timeout, state);
3302 spin_unlock_irq(&x->wait.lock); 3294 spin_unlock_irq(&x->wait.lock);
3303 return timeout; 3295 return timeout;
3304} 3296}
3305 3297
3298static long __sched
3299wait_for_common(struct completion *x, long timeout, int state)
3300{
3301 return __wait_for_common(x, schedule_timeout, timeout, state);
3302}
3303
3304static long __sched
3305wait_for_common_io(struct completion *x, long timeout, int state)
3306{
3307 return __wait_for_common(x, io_schedule_timeout, timeout, state);
3308}
3309
3306/** 3310/**
3307 * wait_for_completion: - waits for completion of a task 3311 * wait_for_completion: - waits for completion of a task
3308 * @x: holds the state of this particular completion 3312 * @x: holds the state of this particular completion
@@ -3339,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3339EXPORT_SYMBOL(wait_for_completion_timeout); 3343EXPORT_SYMBOL(wait_for_completion_timeout);
3340 3344
3341/** 3345/**
3346 * wait_for_completion_io: - waits for completion of a task
3347 * @x: holds the state of this particular completion
3348 *
3349 * This waits to be signaled for completion of a specific task. It is NOT
3350 * interruptible and there is no timeout. The caller is accounted as waiting
3351 * for IO.
3352 */
3353void __sched wait_for_completion_io(struct completion *x)
3354{
3355 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3356}
3357EXPORT_SYMBOL(wait_for_completion_io);
3358
3359/**
3360 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
3361 * @x: holds the state of this particular completion
3362 * @timeout: timeout value in jiffies
3363 *
3364 * This waits for either a completion of a specific task to be signaled or for a
3365 * specified timeout to expire. The timeout is in jiffies. It is not
3366 * interruptible. The caller is accounted as waiting for IO.
3367 *
3368 * The return value is 0 if timed out, and positive (at least 1, or number of
3369 * jiffies left till timeout) if completed.
3370 */
3371unsigned long __sched
3372wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
3373{
3374 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
3375}
3376EXPORT_SYMBOL(wait_for_completion_io_timeout);
3377
3378/**
3342 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3379 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3343 * @x: holds the state of this particular completion 3380 * @x: holds the state of this particular completion
3344 * 3381 *
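wait_for_completion_io() and wait_for_completion_io_timeout() behave exactly like their plain counterparts but sleep via io_schedule_timeout(), so the blocked time is accounted as iowait. A hypothetical caller (the driver structure and the 5 second timeout are invented, not part of this patch):

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

struct my_request {
        struct completion done;         /* completed from the IRQ handler */
        int status;
};

static int my_submit_and_wait(struct my_request *req)
{
        init_completion(&req->done);
        /* ...hand the request to the hardware here... */

        if (!wait_for_completion_io_timeout(&req->done, msecs_to_jiffies(5000)))
                return -ETIMEDOUT;      /* a zero return means the timeout expired */

        return req->status;             /* set by the IRQ path before complete() */
}

Passing the sleep primitive to __wait_for_common() as the action callback keeps a single copy of the wait loop for both the regular and the iowait-accounted variants.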
@@ -4364,20 +4401,32 @@ EXPORT_SYMBOL(yield);
4364 * It's the caller's job to ensure that the target task struct 4401 * It's the caller's job to ensure that the target task struct
4365 * can't go away on us before we can do any checks. 4402 * can't go away on us before we can do any checks.
4366 * 4403 *
4367 * Returns true if we indeed boosted the target task. 4404 * Returns:
4405 * true (>0) if we indeed boosted the target task.
4406 * false (0) if we failed to boost the target.
4407 * -ESRCH if there's no task to yield to.
4368 */ 4408 */
4369bool __sched yield_to(struct task_struct *p, bool preempt) 4409bool __sched yield_to(struct task_struct *p, bool preempt)
4370{ 4410{
4371 struct task_struct *curr = current; 4411 struct task_struct *curr = current;
4372 struct rq *rq, *p_rq; 4412 struct rq *rq, *p_rq;
4373 unsigned long flags; 4413 unsigned long flags;
4374 bool yielded = 0; 4414 int yielded = 0;
4375 4415
4376 local_irq_save(flags); 4416 local_irq_save(flags);
4377 rq = this_rq(); 4417 rq = this_rq();
4378 4418
4379again: 4419again:
4380 p_rq = task_rq(p); 4420 p_rq = task_rq(p);
4421 /*
4422 * If we're the only runnable task on the rq and target rq also
4423 * has only one task, there's absolutely no point in yielding.
4424 */
4425 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4426 yielded = -ESRCH;
4427 goto out_irq;
4428 }
4429
4381 double_rq_lock(rq, p_rq); 4430 double_rq_lock(rq, p_rq);
4382 while (task_rq(p) != p_rq) { 4431 while (task_rq(p) != p_rq) {
4383 double_rq_unlock(rq, p_rq); 4432 double_rq_unlock(rq, p_rq);
@@ -4385,13 +4434,13 @@ again:
4385 } 4434 }
4386 4435
4387 if (!curr->sched_class->yield_to_task) 4436 if (!curr->sched_class->yield_to_task)
4388 goto out; 4437 goto out_unlock;
4389 4438
4390 if (curr->sched_class != p->sched_class) 4439 if (curr->sched_class != p->sched_class)
4391 goto out; 4440 goto out_unlock;
4392 4441
4393 if (task_running(p_rq, p) || p->state) 4442 if (task_running(p_rq, p) || p->state)
4394 goto out; 4443 goto out_unlock;
4395 4444
4396 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4445 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4397 if (yielded) { 4446 if (yielded) {
@@ -4404,11 +4453,12 @@ again:
4404 resched_task(p_rq->curr); 4453 resched_task(p_rq->curr);
4405 } 4454 }
4406 4455
4407out: 4456out_unlock:
4408 double_rq_unlock(rq, p_rq); 4457 double_rq_unlock(rq, p_rq);
4458out_irq:
4409 local_irq_restore(flags); 4459 local_irq_restore(flags);
4410 4460
4411 if (yielded) 4461 if (yielded > 0)
4412 schedule(); 4462 schedule();
4413 4463
4414 return yielded; 4464 return yielded;
@@ -4667,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4667 */ 4717 */
4668 idle->sched_class = &idle_sched_class; 4718 idle->sched_class = &idle_sched_class;
4669 ftrace_graph_init_idle_task(idle, cpu); 4719 ftrace_graph_init_idle_task(idle, cpu);
4720 vtime_init_idle(idle);
4670#if defined(CONFIG_SMP) 4721#if defined(CONFIG_SMP)
4671 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4722 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4672#endif 4723#endif
@@ -7160,7 +7211,6 @@ static void free_sched_group(struct task_group *tg)
7160struct task_group *sched_create_group(struct task_group *parent) 7211struct task_group *sched_create_group(struct task_group *parent)
7161{ 7212{
7162 struct task_group *tg; 7213 struct task_group *tg;
7163 unsigned long flags;
7164 7214
7165 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7215 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7166 if (!tg) 7216 if (!tg)
@@ -7172,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent)
7172 if (!alloc_rt_sched_group(tg, parent)) 7222 if (!alloc_rt_sched_group(tg, parent))
7173 goto err; 7223 goto err;
7174 7224
7225 return tg;
7226
7227err:
7228 free_sched_group(tg);
7229 return ERR_PTR(-ENOMEM);
7230}
7231
7232void sched_online_group(struct task_group *tg, struct task_group *parent)
7233{
7234 unsigned long flags;
7235
7175 spin_lock_irqsave(&task_group_lock, flags); 7236 spin_lock_irqsave(&task_group_lock, flags);
7176 list_add_rcu(&tg->list, &task_groups); 7237 list_add_rcu(&tg->list, &task_groups);
7177 7238
@@ -7181,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
7181 INIT_LIST_HEAD(&tg->children); 7242 INIT_LIST_HEAD(&tg->children);
7182 list_add_rcu(&tg->siblings, &parent->children); 7243 list_add_rcu(&tg->siblings, &parent->children);
7183 spin_unlock_irqrestore(&task_group_lock, flags); 7244 spin_unlock_irqrestore(&task_group_lock, flags);
7184
7185 return tg;
7186
7187err:
7188 free_sched_group(tg);
7189 return ERR_PTR(-ENOMEM);
7190} 7245}
7191 7246
7192/* rcu callback to free various structures associated with a task group */ 7247/* rcu callback to free various structures associated with a task group */
@@ -7199,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7199/* Destroy runqueue etc associated with a task group */ 7254/* Destroy runqueue etc associated with a task group */
7200void sched_destroy_group(struct task_group *tg) 7255void sched_destroy_group(struct task_group *tg)
7201{ 7256{
7257 /* wait for possible concurrent references to cfs_rqs complete */
7258 call_rcu(&tg->rcu, free_sched_group_rcu);
7259}
7260
7261void sched_offline_group(struct task_group *tg)
7262{
7202 unsigned long flags; 7263 unsigned long flags;
7203 int i; 7264 int i;
7204 7265
@@ -7210,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg)
7210 list_del_rcu(&tg->list); 7271 list_del_rcu(&tg->list);
7211 list_del_rcu(&tg->siblings); 7272 list_del_rcu(&tg->siblings);
7212 spin_unlock_irqrestore(&task_group_lock, flags); 7273 spin_unlock_irqrestore(&task_group_lock, flags);
7213
7214 /* wait for possible concurrent references to cfs_rqs complete */
7215 call_rcu(&tg->rcu, free_sched_group_rcu);
7216} 7274}
7217 7275
7218/* change task's runqueue when it moves between groups. 7276/* change task's runqueue when it moves between groups.
@@ -7508,6 +7566,25 @@ static int sched_rt_global_constraints(void)
7508} 7566}
7509#endif /* CONFIG_RT_GROUP_SCHED */ 7567#endif /* CONFIG_RT_GROUP_SCHED */
7510 7568
7569int sched_rr_handler(struct ctl_table *table, int write,
7570 void __user *buffer, size_t *lenp,
7571 loff_t *ppos)
7572{
7573 int ret;
7574 static DEFINE_MUTEX(mutex);
7575
7576 mutex_lock(&mutex);
7577 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7578 /* make sure that internally we keep jiffies */
7579 /* also, writing zero resets timeslice to default */
7580 if (!ret && write) {
7581 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7582 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7583 }
7584 mutex_unlock(&mutex);
7585 return ret;
7586}
7587
7511int sched_rt_handler(struct ctl_table *table, int write, 7588int sched_rt_handler(struct ctl_table *table, int write,
7512 void __user *buffer, size_t *lenp, 7589 void __user *buffer, size_t *lenp,
7513 loff_t *ppos) 7590 loff_t *ppos)
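sched_rr_handler() pairs with the sched_rr_timeslice knob added in kernel/sched/rt.c: the value is kept in jiffies internally while the sysctl is expressed in milliseconds, and writing zero falls back to the RR_TIMESLICE default. The companion kernel/sysctl.c hunk is not shown in this part of the diff; a sketch of how such a handler is typically wired up (treat the entry below as an illustration, not a quotation of that hunk):

#include <linux/sysctl.h>

extern int sched_rr_timeslice;
extern int sched_rr_handler(struct ctl_table *table, int write,
                            void __user *buffer, size_t *lenp, loff_t *ppos);

static struct ctl_table example_sched_table[] = {
        {
                .procname       = "sched_rr_timeslice_ms",
                .data           = &sched_rr_timeslice,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = sched_rr_handler,
        },
        { }
};

Once registered under the kernel/ sysctl directory, writing 0 to the file restores the default timeslice, per the handler's comment.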
@@ -7564,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7564 return &tg->css; 7641 return &tg->css;
7565} 7642}
7566 7643
7644static int cpu_cgroup_css_online(struct cgroup *cgrp)
7645{
7646 struct task_group *tg = cgroup_tg(cgrp);
7647 struct task_group *parent;
7648
7649 if (!cgrp->parent)
7650 return 0;
7651
7652 parent = cgroup_tg(cgrp->parent);
7653 sched_online_group(tg, parent);
7654 return 0;
7655}
7656
7567static void cpu_cgroup_css_free(struct cgroup *cgrp) 7657static void cpu_cgroup_css_free(struct cgroup *cgrp)
7568{ 7658{
7569 struct task_group *tg = cgroup_tg(cgrp); 7659 struct task_group *tg = cgroup_tg(cgrp);
@@ -7571,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
7571 sched_destroy_group(tg); 7661 sched_destroy_group(tg);
7572} 7662}
7573 7663
7664static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7665{
7666 struct task_group *tg = cgroup_tg(cgrp);
7667
7668 sched_offline_group(tg);
7669}
7670
7574static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7671static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7575 struct cgroup_taskset *tset) 7672 struct cgroup_taskset *tset)
7576{ 7673{
@@ -7926,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7926 .name = "cpu", 8023 .name = "cpu",
7927 .css_alloc = cpu_cgroup_css_alloc, 8024 .css_alloc = cpu_cgroup_css_alloc,
7928 .css_free = cpu_cgroup_css_free, 8025 .css_free = cpu_cgroup_css_free,
8026 .css_online = cpu_cgroup_css_online,
8027 .css_offline = cpu_cgroup_css_offline,
7929 .can_attach = cpu_cgroup_can_attach, 8028 .can_attach = cpu_cgroup_can_attach,
7930 .attach = cpu_cgroup_attach, 8029 .attach = cpu_cgroup_attach,
7931 .exit = cpu_cgroup_exit, 8030 .exit = cpu_cgroup_exit,
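Taken together, the core.c and auto_group.c hunks split task-group setup and teardown into two stages: sched_create_group()/sched_destroy_group() now only allocate and free (the free deferred through RCU), while the new sched_online_group()/sched_offline_group() link the group into and out of the scheduler's lists, driven by the cgroup css_online/css_offline callbacks. For a group created outside the cgroup core, the ordering is as in this sketch (example_* names invented; root_task_group is scheduler-internal, and autogroup above follows exactly this sequence):

#include <linux/err.h>

static struct task_group *example_group_create(void)
{
        struct task_group *tg = sched_create_group(&root_task_group);

        if (IS_ERR(tg))
                return tg;
        sched_online_group(tg, &root_task_group);  /* link into task_groups */
        return tg;
}

static void example_group_destroy(struct task_group *tg)
{
        sched_offline_group(tg);        /* unlink from task_groups/siblings */
        sched_destroy_group(tg);        /* free via call_rcu()              */
}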
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
31#include "cpupri.h" 33#include "cpupri.h"
32 34
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 35/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..ed12cbb135f4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
3#include <linux/tsacct_kern.h> 3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h> 4#include <linux/kernel_stat.h>
5#include <linux/static_key.h> 5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
6#include "sched.h" 7#include "sched.h"
7 8
8 9
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
163 task_group_account_field(p, index, (__force u64) cputime); 164 task_group_account_field(p, index, (__force u64) cputime);
164 165
165 /* Account for user time used */ 166 /* Account for user time used */
166 acct_update_integrals(p); 167 acct_account_cputime(p);
167} 168}
168 169
169/* 170/*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
213 task_group_account_field(p, index, (__force u64) cputime); 214 task_group_account_field(p, index, (__force u64) cputime);
214 215
215 /* Account for system time used */ 216 /* Account for system time used */
216 acct_update_integrals(p); 217 acct_account_cputime(p);
217} 218}
218 219
219/* 220/*
@@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 296void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{ 297{
297 struct signal_struct *sig = tsk->signal; 298 struct signal_struct *sig = tsk->signal;
299 cputime_t utime, stime;
298 struct task_struct *t; 300 struct task_struct *t;
299 301
300 times->utime = sig->utime; 302 times->utime = sig->utime;
@@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
308 310
309 t = tsk; 311 t = tsk;
310 do { 312 do {
311 times->utime += t->utime; 313 task_cputime(tsk, &utime, &stime);
312 times->stime += t->stime; 314 times->utime += utime;
315 times->stime += stime;
313 times->sum_exec_runtime += task_sched_runtime(t); 316 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t); 317 } while_each_thread(tsk, t);
315out: 318out:
316 rcu_read_unlock(); 319 rcu_read_unlock();
317} 320}
318 321
319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
320
321#ifdef CONFIG_IRQ_TIME_ACCOUNTING 322#ifdef CONFIG_IRQ_TIME_ACCOUNTING
322/* 323/*
323 * Account a tick to a process and cpustat 324 * Account a tick to a process and cpustat
@@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
382 irqtime_account_process_tick(current, 0, rq); 383 irqtime_account_process_tick(current, 0, rq);
383} 384}
384#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 385#else /* CONFIG_IRQ_TIME_ACCOUNTING */
385static void irqtime_account_idle_ticks(int ticks) {} 386static inline void irqtime_account_idle_ticks(int ticks) {}
386static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 387static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
387 struct rq *rq) {} 388 struct rq *rq) {}
388#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389 390
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
390/* 392/*
391 * Account a single tick of cpu time. 393 * Account a single tick of cpu time.
392 * @p: the process that the cpu time gets accounted to 394 * @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
398 struct rq *rq = this_rq(); 400 struct rq *rq = this_rq();
399 401
402 if (vtime_accounting_enabled())
403 return;
404
400 if (sched_clock_irqtime) { 405 if (sched_clock_irqtime) {
401 irqtime_account_process_tick(p, user_tick, rq); 406 irqtime_account_process_tick(p, user_tick, rq);
402 return; 407 return;
@@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)
438 443
439 account_idle_time(jiffies_to_cputime(ticks)); 444 account_idle_time(jiffies_to_cputime(ticks));
440} 445}
441 446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
442#endif
443 447
444/* 448/*
445 * Use precise platform statistics if available: 449 * Use precise platform statistics if available:
@@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
461 *st = cputime.stime; 465 *st = cputime.stime;
462} 466}
463 467
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev) 469void vtime_task_switch(struct task_struct *prev)
476{ 470{
471 if (!vtime_accounting_enabled())
472 return;
473
477 if (is_idle_task(prev)) 474 if (is_idle_task(prev))
478 vtime_account_idle(prev); 475 vtime_account_idle(prev);
479 else 476 else
480 vtime_account_system(prev); 477 vtime_account_system(prev);
481 478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
482 vtime_account_user(prev); 480 vtime_account_user(prev);
481#endif
483 arch_vtime_task_switch(prev); 482 arch_vtime_task_switch(prev);
484} 483}
485#endif 484#endif
@@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)
493 * vtime_account(). 492 * vtime_account().
494 */ 493 */
495#ifndef __ARCH_HAS_VTIME_ACCOUNT 494#ifndef __ARCH_HAS_VTIME_ACCOUNT
496void vtime_account(struct task_struct *tsk) 495void vtime_account_irq_enter(struct task_struct *tsk)
497{ 496{
498 if (in_interrupt() || !is_idle_task(tsk)) 497 if (!vtime_accounting_enabled())
499 vtime_account_system(tsk); 498 return;
500 else 499
501 vtime_account_idle(tsk); 500 if (!in_interrupt()) {
501 /*
502 * If we interrupted user, context_tracking_in_user()
503 * is 1 because the context tracking don't hook
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 }
511
512 if (is_idle_task(tsk)) {
513 vtime_account_idle(tsk);
514 return;
515 }
516 }
517 vtime_account_system(tsk);
502} 518}
503EXPORT_SYMBOL_GPL(vtime_account); 519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
504#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
505 521
506#else 522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
507
508#ifndef nsecs_to_cputime
509# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
510#endif
511 523
512static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) 524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
513{ 525{
514 u64 temp = (__force u64) rtime; 526 u64 temp = (__force u64) rtime;
515 527
516 temp *= (__force u64) utime; 528 temp *= (__force u64) stime;
517 529
518 if (sizeof(cputime_t) == 4) 530 if (sizeof(cputime_t) == 4)
519 temp = div_u64(temp, (__force u32) total); 531 temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev, 543 struct cputime *prev,
532 cputime_t *ut, cputime_t *st) 544 cputime_t *ut, cputime_t *st)
533{ 545{
534 cputime_t rtime, utime, total; 546 cputime_t rtime, stime, total;
535 547
536 utime = curr->utime; 548 stime = curr->stime;
537 total = utime + curr->stime; 549 total = stime + curr->utime;
538 550
539 /* 551 /*
540 * Tick based cputime accounting depend on random scheduling 552 * Tick based cputime accounting depend on random scheduling
@@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,
549 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 561 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
550 562
551 if (total) 563 if (total)
552 utime = scale_utime(utime, rtime, total); 564 stime = scale_stime(stime, rtime, total);
553 else 565 else
554 utime = rtime; 566 stime = rtime;
555 567
556 /* 568 /*
557 * If the tick based count grows faster than the scheduler one, 569 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward. 570 * the result of the scaling may go backward.
559 * Let's enforce monotonicity. 571 * Let's enforce monotonicity.
560 */ 572 */
561 prev->utime = max(prev->utime, utime); 573 prev->stime = max(prev->stime, stime);
562 prev->stime = max(prev->stime, rtime - prev->utime); 574 prev->utime = max(prev->utime, rtime - prev->stime);
563 575
564 *ut = prev->utime; 576 *ut = prev->utime;
565 *st = prev->stime; 577 *st = prev->stime;
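The rename from scale_utime() to scale_stime() reflects a change of policy: the precise rtime reported by the scheduler is apportioned by the tick-based stime/(stime+utime) ratio, monotonicity is enforced on stime first, and utime takes the remainder. Numerically (a userspace toy with made-up numbers, nothing kernel-specific):

#include <stdio.h>
#include <stdint.h>

static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
        return total ? stime * rtime / total : rtime;
}

int main(void)
{
        uint64_t stime = 30, utime = 70;  /* tick samples: 30% system time      */
        uint64_t rtime = 200;             /* precise runtime from the scheduler */
        uint64_t st = scale_stime(stime, rtime, stime + utime);

        printf("stime=%llu utime=%llu\n",
               (unsigned long long)st,
               (unsigned long long)(rtime - st));   /* prints 60 and 140 */
        return 0;
}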
@@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 580void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{ 581{
570 struct task_cputime cputime = { 582 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime, 583 .sum_exec_runtime = p->se.sum_exec_runtime,
574 }; 584 };
575 585
586 task_cputime(p, &cputime.utime, &cputime.stime);
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 587 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
577} 588}
578 589
@@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
586 thread_group_cputime(p, &cputime); 597 thread_group_cputime(p, &cputime);
587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
588} 599}
589#endif 600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
601
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk)
604{
605 unsigned long long clock;
606
607 clock = local_clock();
608 if (clock < tsk->vtime_snap)
609 return 0;
610
611 return clock - tsk->vtime_snap;
612}
613
614static cputime_t get_vtime_delta(struct task_struct *tsk)
615{
616 unsigned long long delta = vtime_delta(tsk);
617
618 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
619 tsk->vtime_snap += delta;
620
621 /* CHECKME: always safe to convert nsecs to cputime? */
622 return nsecs_to_cputime(delta);
623}
624
625static void __vtime_account_system(struct task_struct *tsk)
626{
627 cputime_t delta_cpu = get_vtime_delta(tsk);
628
629 account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
630}
631
632void vtime_account_system(struct task_struct *tsk)
633{
634 if (!vtime_accounting_enabled())
635 return;
636
637 write_seqlock(&tsk->vtime_seqlock);
638 __vtime_account_system(tsk);
639 write_sequnlock(&tsk->vtime_seqlock);
640}
641
642void vtime_account_irq_exit(struct task_struct *tsk)
643{
644 if (!vtime_accounting_enabled())
645 return;
646
647 write_seqlock(&tsk->vtime_seqlock);
648 if (context_tracking_in_user())
649 tsk->vtime_snap_whence = VTIME_USER;
650 __vtime_account_system(tsk);
651 write_sequnlock(&tsk->vtime_seqlock);
652}
653
654void vtime_account_user(struct task_struct *tsk)
655{
656 cputime_t delta_cpu;
657
658 if (!vtime_accounting_enabled())
659 return;
660
661 delta_cpu = get_vtime_delta(tsk);
662
663 write_seqlock(&tsk->vtime_seqlock);
664 tsk->vtime_snap_whence = VTIME_SYS;
665 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
666 write_sequnlock(&tsk->vtime_seqlock);
667}
668
669void vtime_user_enter(struct task_struct *tsk)
670{
671 if (!vtime_accounting_enabled())
672 return;
673
674 write_seqlock(&tsk->vtime_seqlock);
675 tsk->vtime_snap_whence = VTIME_USER;
676 __vtime_account_system(tsk);
677 write_sequnlock(&tsk->vtime_seqlock);
678}
679
680void vtime_guest_enter(struct task_struct *tsk)
681{
682 write_seqlock(&tsk->vtime_seqlock);
683 __vtime_account_system(tsk);
684 current->flags |= PF_VCPU;
685 write_sequnlock(&tsk->vtime_seqlock);
686}
687
688void vtime_guest_exit(struct task_struct *tsk)
689{
690 write_seqlock(&tsk->vtime_seqlock);
691 __vtime_account_system(tsk);
692 current->flags &= ~PF_VCPU;
693 write_sequnlock(&tsk->vtime_seqlock);
694}
695
696void vtime_account_idle(struct task_struct *tsk)
697{
698 cputime_t delta_cpu = get_vtime_delta(tsk);
699
700 account_idle_time(delta_cpu);
701}
702
703bool vtime_accounting_enabled(void)
704{
705 return context_tracking_active();
706}
707
708void arch_vtime_task_switch(struct task_struct *prev)
709{
710 write_seqlock(&prev->vtime_seqlock);
711 prev->vtime_snap_whence = VTIME_SLEEPING;
712 write_sequnlock(&prev->vtime_seqlock);
713
714 write_seqlock(&current->vtime_seqlock);
715 current->vtime_snap_whence = VTIME_SYS;
716 current->vtime_snap = sched_clock();
717 write_sequnlock(&current->vtime_seqlock);
718}
719
720void vtime_init_idle(struct task_struct *t)
721{
722 unsigned long flags;
723
724 write_seqlock_irqsave(&t->vtime_seqlock, flags);
725 t->vtime_snap_whence = VTIME_SYS;
726 t->vtime_snap = sched_clock();
727 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
728}
729
730cputime_t task_gtime(struct task_struct *t)
731{
732 unsigned int seq;
733 cputime_t gtime;
734
735 do {
736 seq = read_seqbegin(&t->vtime_seqlock);
737
738 gtime = t->gtime;
739 if (t->flags & PF_VCPU)
740 gtime += vtime_delta(t);
741
742 } while (read_seqretry(&t->vtime_seqlock, seq));
743
744 return gtime;
745}
746
747/*
748 * Fetch cputime raw values from fields of task_struct and
749 * add up the pending nohz execution time since the last
750 * cputime snapshot.
751 */
752static void
753fetch_task_cputime(struct task_struct *t,
754 cputime_t *u_dst, cputime_t *s_dst,
755 cputime_t *u_src, cputime_t *s_src,
756 cputime_t *udelta, cputime_t *sdelta)
757{
758 unsigned int seq;
759 unsigned long long delta;
760
761 do {
762 *udelta = 0;
763 *sdelta = 0;
764
765 seq = read_seqbegin(&t->vtime_seqlock);
766
767 if (u_dst)
768 *u_dst = *u_src;
769 if (s_dst)
770 *s_dst = *s_src;
771
772 /* Task is sleeping, nothing to add */
773 if (t->vtime_snap_whence == VTIME_SLEEPING ||
774 is_idle_task(t))
775 continue;
776
777 delta = vtime_delta(t);
778
779 /*
780 * Task runs either in user or kernel space, add pending nohz time to
781 * the right place.
782 */
783 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
784 *udelta = delta;
785 } else {
786 if (t->vtime_snap_whence == VTIME_SYS)
787 *sdelta = delta;
788 }
789 } while (read_seqretry(&t->vtime_seqlock, seq));
790}
791
792
793void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
794{
795 cputime_t udelta, sdelta;
796
797 fetch_task_cputime(t, utime, stime, &t->utime,
798 &t->stime, &udelta, &sdelta);
799 if (utime)
800 *utime += udelta;
801 if (stime)
802 *stime += sdelta;
803}
804
805void task_cputime_scaled(struct task_struct *t,
806 cputime_t *utimescaled, cputime_t *stimescaled)
807{
808 cputime_t udelta, sdelta;
809
810 fetch_task_cputime(t, utimescaled, stimescaled,
811 &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
812 if (utimescaled)
813 *utimescaled += cputime_to_scaled(udelta);
814 if (stimescaled)
815 *stimescaled += cputime_to_scaled(sdelta);
816}
817#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
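All of the new CONFIG_VIRT_CPU_ACCOUNTING_GEN readers above (task_gtime(), fetch_task_cputime()) use the same seqlock discipline: sample the sequence count, copy the fields, and retry if a writer such as arch_vtime_task_switch() raced with the read. Stripped of the vtime details, the read side is just this (hypothetical fields, not from this patch):

#include <linux/seqlock.h>
#include <linux/types.h>

struct cpu_times {
        seqlock_t lock;         /* writers take this with write_seqlock() */
        u64 user_ns;
        u64 sys_ns;
};

static void cpu_times_read(struct cpu_times *t, u64 *user, u64 *sys)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(&t->lock);
                *user = t->user_ns;
                *sys  = t->sys_ns;
        } while (read_seqretry(&t->lock, seq));
}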
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c5420e..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
110 if (autogroup_path(tg, group_path, PATH_MAX)) 110 if (autogroup_path(tg, group_path, PATH_MAX))
111 return group_path; 111 return group_path;
112 112
113 /*
114 * May be NULL if the underlying cgroup isn't fully-created yet
115 */
116 if (!tg->css.cgroup) {
117 group_path[0] = '\0';
118 return group_path;
119 }
120 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 113 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
121 return group_path; 114 return group_path;
122} 115}
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
269 { 262 {
270 unsigned int freq = cpu_khz ? : 1; 263 unsigned int freq = cpu_khz ? : 1;
271 264
272 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", 265 SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
273 cpu, freq / 1000, (freq % 1000)); 266 cpu, freq / 1000, (freq % 1000));
274 } 267 }
275#else 268#else
276 SEQ_printf(m, "\ncpu#%d\n", cpu); 269 SEQ_printf(m, "cpu#%d\n", cpu);
277#endif 270#endif
278 271
279#define P(x) \ 272#define P(x) \
@@ -330,6 +323,7 @@ do { \
330 print_rq(m, rq, cpu); 323 print_rq(m, rq, cpu);
331 rcu_read_unlock(); 324 rcu_read_unlock();
332 spin_unlock_irqrestore(&sched_debug_lock, flags); 325 spin_unlock_irqrestore(&sched_debug_lock, flags);
326 SEQ_printf(m, "\n");
333} 327}
334 328
335static const char *sched_tunable_scaling_names[] = { 329static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
338 "linear" 332 "linear"
339}; 333};
340 334
341static int sched_debug_show(struct seq_file *m, void *v) 335static void sched_debug_header(struct seq_file *m)
342{ 336{
343 u64 ktime, sched_clk, cpu_clk; 337 u64 ktime, sched_clk, cpu_clk;
344 unsigned long flags; 338 unsigned long flags;
345 int cpu;
346 339
347 local_irq_save(flags); 340 local_irq_save(flags);
348 ktime = ktime_to_ns(ktime_get()); 341 ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
384#undef PN 377#undef PN
385#undef P 378#undef P
386 379
387 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", 380 SEQ_printf(m, " .%-40s: %d (%s)\n",
381 "sysctl_sched_tunable_scaling",
388 sysctl_sched_tunable_scaling, 382 sysctl_sched_tunable_scaling,
389 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); 383 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
384 SEQ_printf(m, "\n");
385}
390 386
391 for_each_online_cpu(cpu) 387static int sched_debug_show(struct seq_file *m, void *v)
392 print_cpu(m, cpu); 388{
389 int cpu = (unsigned long)(v - 2);
393 390
394 SEQ_printf(m, "\n"); 391 if (cpu != -1)
392 print_cpu(m, cpu);
393 else
394 sched_debug_header(m);
395 395
396 return 0; 396 return 0;
397} 397}
398 398
399void sysrq_sched_debug_show(void) 399void sysrq_sched_debug_show(void)
400{ 400{
401 sched_debug_show(NULL, NULL); 401 int cpu;
402
403 sched_debug_header(NULL);
404 for_each_online_cpu(cpu)
405 print_cpu(NULL, cpu);
406
407}
408
409/*
410 * This iterator needs some explanation.
411 * It returns 1 for the header position.
412 * This means 2 is cpu 0.
413 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
414 * to use cpumask_* to iterate over the cpus.
415 */
416static void *sched_debug_start(struct seq_file *file, loff_t *offset)
417{
418 unsigned long n = *offset;
419
420 if (n == 0)
421 return (void *) 1;
422
423 n--;
424
425 if (n > 0)
426 n = cpumask_next(n - 1, cpu_online_mask);
427 else
428 n = cpumask_first(cpu_online_mask);
429
430 *offset = n + 1;
431
432 if (n < nr_cpu_ids)
433 return (void *)(unsigned long)(n + 2);
434 return NULL;
435}
436
437static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
438{
439 (*offset)++;
440 return sched_debug_start(file, offset);
441}
442
443static void sched_debug_stop(struct seq_file *file, void *data)
444{
445}
446
447static const struct seq_operations sched_debug_sops = {
448 .start = sched_debug_start,
449 .next = sched_debug_next,
450 .stop = sched_debug_stop,
451 .show = sched_debug_show,
452};
453
454static int sched_debug_release(struct inode *inode, struct file *file)
455{
456 seq_release(inode, file);
457
458 return 0;
402} 459}
403 460
404static int sched_debug_open(struct inode *inode, struct file *filp) 461static int sched_debug_open(struct inode *inode, struct file *filp)
405{ 462{
406 return single_open(filp, sched_debug_show, NULL); 463 int ret = 0;
464
465 ret = seq_open(filp, &sched_debug_sops);
466
467 return ret;
407} 468}
408 469
409static const struct file_operations sched_debug_fops = { 470static const struct file_operations sched_debug_fops = {
410 .open = sched_debug_open, 471 .open = sched_debug_open,
411 .read = seq_read, 472 .read = seq_read,
412 .llseek = seq_lseek, 473 .llseek = seq_lseek,
413 .release = single_release, 474 .release = sched_debug_release,
414}; 475};
415 476
416static int __init init_sched_debug_procfs(void) 477static int __init init_sched_debug_procfs(void)
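The iterator comment above is terse, so here is the offset encoding worked through on a fake online mask (plain userspace C, not kernel code): position 0 is the header, position n+1 stands for CPU n, and the cookie handed to show() is cpu+2 so it can never collide with NULL or the header token (void *)1.

#include <stdio.h>

#define NR_CPUS 8
static const int cpu_is_online[NR_CPUS] = { 1, 0, 1, 1, 0, 0, 1, 0 };

static int next_online(int cpu)     /* first online cpu >= cpu, or NR_CPUS */
{
        while (cpu < NR_CPUS && !cpu_is_online[cpu])
                cpu++;
        return cpu;
}

/* Mirrors sched_debug_start(): -1 means "header", NR_CPUS means "done". */
static int debug_start(unsigned long *offset)
{
        unsigned long n = *offset;
        int cpu;

        if (n == 0)
                return -1;
        cpu = next_online((int)(n - 1));   /* offset 1 maps to cpu 0 and up */
        *offset = cpu + 1;
        return cpu;
}

int main(void)
{
        unsigned long pos = 0;
        int cpu;

        for (cpu = debug_start(&pos); cpu < NR_CPUS; pos++, cpu = debug_start(&pos)) {
                if (cpu < 0)
                        printf("header\n");
                else
                        printf("cpu%d\n", cpu);
        }
        /* prints: header, cpu0, cpu2, cpu3, cpu6 */
        return 0;
}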
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81fa53643409..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1680 } 1680 }
1681 1681
1682 /* ensure we never gain time by being placed backwards. */ 1682 /* ensure we never gain time by being placed backwards. */
1683 vruntime = max_vruntime(se->vruntime, vruntime); 1683 se->vruntime = max_vruntime(se->vruntime, vruntime);
1684
1685 se->vruntime = vruntime;
1686} 1684}
1687 1685
1688static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 1686static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3254 */ 3252 */
3255static int select_idle_sibling(struct task_struct *p, int target) 3253static int select_idle_sibling(struct task_struct *p, int target)
3256{ 3254{
3257 int cpu = smp_processor_id();
3258 int prev_cpu = task_cpu(p);
3259 struct sched_domain *sd; 3255 struct sched_domain *sd;
3260 struct sched_group *sg; 3256 struct sched_group *sg;
3261 int i; 3257 int i = task_cpu(p);
3262 3258
3263 /* 3259 if (idle_cpu(target))
3264 * If the task is going to be woken-up on this cpu and if it is 3260 return target;
3265 * already idle, then it is the right target.
3266 */
3267 if (target == cpu && idle_cpu(cpu))
3268 return cpu;
3269 3261
3270 /* 3262 /*
3271 * If the task is going to be woken-up on the cpu where it previously 3263 * If the previous cpu is cache affine and idle, don't be stupid.
3272 * ran and if it is currently idle, then it the right target.
3273 */ 3264 */
3274 if (target == prev_cpu && idle_cpu(prev_cpu)) 3265 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3275 return prev_cpu; 3266 return i;
3276 3267
3277 /* 3268 /*
3278 * Otherwise, iterate the domains and find an eligible idle cpu. 3269 * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
3286 goto next; 3277 goto next;
3287 3278
3288 for_each_cpu(i, sched_group_cpus(sg)) { 3279 for_each_cpu(i, sched_group_cpus(sg)) {
3289 if (!idle_cpu(i)) 3280 if (i == target || !idle_cpu(i))
3290 goto next; 3281 goto next;
3291 } 3282 }
3292 3283
@@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
6101 * idle runqueue: 6092 * idle runqueue:
6102 */ 6093 */
6103 if (rq->cfs.load.weight) 6094 if (rq->cfs.load.weight)
6104 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 6095 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
6105 6096
6106 return rr_interval; 6097 return rr_interval;
6107} 6098}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f02b2847357..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10int sched_rr_timeslice = RR_TIMESLICE;
11
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 12static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11 13
12struct rt_bandwidth def_rt_bandwidth; 14struct rt_bandwidth def_rt_bandwidth;
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
925 return; 927 return;
926 928
927 delta_exec = rq->clock_task - curr->se.exec_start; 929 delta_exec = rq->clock_task - curr->se.exec_start;
928 if (unlikely((s64)delta_exec < 0)) 930 if (unlikely((s64)delta_exec <= 0))
929 delta_exec = 0; 931 return;
930 932
931 schedstat_set(curr->se.statistics.exec_max, 933 schedstat_set(curr->se.statistics.exec_max,
932 max(curr->se.statistics.exec_max, delta_exec)); 934 max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1429static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{ 1430{
1429 if (!task_running(rq, p) && 1431 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1432 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1431 (p->nr_cpus_allowed > 1))
1432 return 1; 1433 return 1;
1433 return 0; 1434 return 0;
1434} 1435}
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1889 * we may need to handle the pulling of RT tasks 1890 * we may need to handle the pulling of RT tasks
1890 * now. 1891 * now.
1891 */ 1892 */
1892 if (p->on_rq && !rq->rt.rt_nr_running) 1893 if (!p->on_rq || rq->rt.rt_nr_running)
1893 pull_rt_task(rq); 1894 return;
1895
1896 if (pull_rt_task(rq))
1897 resched_task(rq->curr);
1894} 1898}
1895 1899
1896void init_sched_rt_class(void) 1900void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1985 if (soft != RLIM_INFINITY) { 1989 if (soft != RLIM_INFINITY) {
1986 unsigned long next; 1990 unsigned long next;
1987 1991
1988 p->rt.timeout++; 1992 if (p->rt.watchdog_stamp != jiffies) {
1993 p->rt.timeout++;
1994 p->rt.watchdog_stamp = jiffies;
1995 }
1996
1989 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1997 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1990 if (p->rt.timeout > next) 1998 if (p->rt.timeout > next)
1991 p->cputime_expires.sched_exp = p->se.sum_exec_runtime; 1999 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2010 if (--p->rt.time_slice) 2018 if (--p->rt.time_slice)
2011 return; 2019 return;
2012 2020
2013 p->rt.time_slice = RR_TIMESLICE; 2021 p->rt.time_slice = sched_rr_timeslice;
2014 2022
2015 /* 2023 /*
2016 * Requeue to the end of queue if we (and all of our ancestors) are the 2024 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2041 * Time slice is 0 for SCHED_FIFO tasks 2049 * Time slice is 0 for SCHED_FIFO tasks
2042 */ 2050 */
2043 if (task->policy == SCHED_RR) 2051 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE; 2052 return sched_rr_timeslice;
2045 else 2053 else
2046 return 0; 2054 return 0;
2047} 2055}
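The watchdog() change makes the RLIMIT_RTTIME bookkeeping advance at most once per jiffy, however many times the accounting path runs within the same tick, by stamping the jiffy of the last update. The guard is generic enough to show on its own (hypothetical structure, not from this patch):

#include <linux/jiffies.h>

struct ratelimited_count {
        unsigned long stamp;    /* jiffy of the last increment */
        unsigned long count;
};

static void count_at_most_once_per_jiffy(struct ratelimited_count *rc)
{
        if (rc->stamp != jiffies) {
                rc->count++;
                rc->stamp = jiffies;
        }
}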
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc886441436a..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
1 1
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
4#include <linux/spinlock.h> 6#include <linux/spinlock.h>
5#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
21 if (mask_str == NULL) 21 if (mask_str == NULL)
22 return -ENOMEM; 22 return -ENOMEM;
23 23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 24 if (v == (void *)1) {
25 seq_printf(seq, "timestamp %lu\n", jiffies); 25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
26 for_each_online_cpu(cpu) { 26 seq_printf(seq, "timestamp %lu\n", jiffies);
27 struct rq *rq = cpu_rq(cpu); 27 } else {
28 struct rq *rq;
28#ifdef CONFIG_SMP 29#ifdef CONFIG_SMP
29 struct sched_domain *sd; 30 struct sched_domain *sd;
30 int dcount = 0; 31 int dcount = 0;
31#endif 32#endif
33 cpu = (unsigned long)(v - 2);
34 rq = cpu_rq(cpu);
32 35
33 /* runqueue-specific stats */ 36 /* runqueue-specific stats */
34 seq_printf(seq, 37 seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
77 return 0; 80 return 0;
78} 81}
79 82
80static int schedstat_open(struct inode *inode, struct file *file) 83/*
84 * This iterator needs some explanation.
85 * It returns 1 for the header position.
86 * This means 2 is cpu 0.
87 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
88 * to use cpumask_* to iterate over the cpus.
89 */
90static void *schedstat_start(struct seq_file *file, loff_t *offset)
81{ 91{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 92 unsigned long n = *offset;
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86 93
87 if (!buf) 94 if (n == 0)
88 return -ENOMEM; 95 return (void *) 1;
89 res = single_open(file, show_schedstat, NULL); 96
90 if (!res) { 97 n--;
91 m = file->private_data; 98
92 m->buf = buf; 99 if (n > 0)
93 m->size = size; 100 n = cpumask_next(n - 1, cpu_online_mask);
94 } else 101 else
95 kfree(buf); 102 n = cpumask_first(cpu_online_mask);
96 return res; 103
104 *offset = n + 1;
105
106 if (n < nr_cpu_ids)
107 return (void *)(unsigned long)(n + 2);
108 return NULL;
109}
110
111static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
112{
113 (*offset)++;
114 return schedstat_start(file, offset);
115}
116
117static void schedstat_stop(struct seq_file *file, void *data)
118{
119}
120
121static const struct seq_operations schedstat_sops = {
122 .start = schedstat_start,
123 .next = schedstat_next,
124 .stop = schedstat_stop,
125 .show = show_schedstat,
126};
127
128static int schedstat_open(struct inode *inode, struct file *file)
129{
130 return seq_open(file, &schedstat_sops);
97} 131}
98 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
99static const struct file_operations proc_schedstat_operations = { 138static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open, 139 .open = schedstat_open,
101 .read = seq_read, 140 .read = seq_read,
102 .llseek = seq_lseek, 141 .llseek = seq_lseek,
103 .release = single_release, 142 .release = schedstat_release,
104}; 143};
105 144
106static int __init proc_schedstat_init(void) 145static int __init proc_schedstat_init(void)
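
The comment in the new schedstat_start() explains the offset mapping: position 0 is the version/timestamp header and position n+1 resolves to the next online CPU, so hot-unplugged CPUs (including CPU 0) are simply skipped. The following userspace simulation walks the same mapping; online[], next_online() and record_for() are stand-ins for cpu_online_mask, cpumask_next() and the seq_file callbacks, assumed here only for illustration.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Stand-in for cpu_online_mask: cpus 0 and 3 are "hot-unplugged". */
static const bool online[NR_CPUS] = {
    false, true, true, false, true, true, true, true
};

/* Smallest online cpu >= from, or NR_CPUS if none (cpumask_next analogue). */
static unsigned long next_online(unsigned long from)
{
    while (from < NR_CPUS && !online[from])
        from++;
    return from;
}

/*
 * Mirrors schedstat_start(): offset 0 is the header record; any larger
 * offset is translated to the next online cpu, and the offset is bumped
 * past it so the following lookup resumes the walk from there.
 */
static long record_for(unsigned long *offset)
{
    unsigned long n = *offset;

    if (n == 0)
        return -1;                          /* header record */

    n--;
    n = n > 0 ? next_online(n) : next_online(0);
    *offset = n + 1;

    return n < NR_CPUS ? (long)n : -2;      /* -2 == end of file */
}

int main(void)
{
    unsigned long offset = 0;
    long rec = record_for(&offset);          /* ->start */

    while (rec != -2) {
        if (rec == -1)
            printf("header (version/timestamp)\n");
        else
            printf("cpu%ld stats\n", rec);
        offset++;                            /* ->next bumps the position */
        rec = record_for(&offset);           /* and re-runs the lookup */
    }
    return 0;
}

With the sample mask above the program prints the header and then cpu1, cpu2, cpu4, cpu5, cpu6, cpu7, skipping the two offline CPUs, which mirrors the walk the new /proc/schedstat iterator performs.
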
diff --git a/kernel/signal.c b/kernel/signal.c
index 3d09cf6cde75..dd72567767d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
485 if (force_default || ka->sa.sa_handler != SIG_IGN) 485 if (force_default || ka->sa.sa_handler != SIG_IGN)
486 ka->sa.sa_handler = SIG_DFL; 486 ka->sa.sa_handler = SIG_DFL;
487 ka->sa.sa_flags = 0; 487 ka->sa.sa_flags = 0;
488#ifdef __ARCH_HAS_SA_RESTORER
489 ka->sa.sa_restorer = NULL;
490#endif
488 sigemptyset(&ka->sa.sa_mask); 491 sigemptyset(&ka->sa.sa_mask);
489 ka++; 492 ka++;
490 } 493 }
@@ -1157,11 +1160,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1157static void print_fatal_signal(int signr) 1160static void print_fatal_signal(int signr)
1158{ 1161{
1159 struct pt_regs *regs = signal_pt_regs(); 1162 struct pt_regs *regs = signal_pt_regs();
1160 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1163 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
1161 current->comm, task_pid_nr(current), signr); 1164 current->comm, task_pid_nr(current), signr);
1162 1165
1163#if defined(__i386__) && !defined(__arch_um__) 1166#if defined(__i386__) && !defined(__arch_um__)
1164 printk("code at %08lx: ", regs->ip); 1167 printk(KERN_INFO "code at %08lx: ", regs->ip);
1165 { 1168 {
1166 int i; 1169 int i;
1167 for (i = 0; i < 16; i++) { 1170 for (i = 0; i < 16; i++) {
@@ -1169,11 +1172,11 @@ static void print_fatal_signal(int signr)
1169 1172
1170 if (get_user(insn, (unsigned char *)(regs->ip + i))) 1173 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1171 break; 1174 break;
1172 printk("%02x ", insn); 1175 printk(KERN_CONT "%02x ", insn);
1173 } 1176 }
1174 } 1177 }
1178 printk(KERN_CONT "\n");
1175#endif 1179#endif
1176 printk("\n");
1177 preempt_disable(); 1180 preempt_disable();
1178 show_regs(regs); 1181 show_regs(regs);
1179 preempt_enable(); 1182 preempt_enable();
@@ -1632,6 +1635,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1632 unsigned long flags; 1635 unsigned long flags;
1633 struct sighand_struct *psig; 1636 struct sighand_struct *psig;
1634 bool autoreap = false; 1637 bool autoreap = false;
1638 cputime_t utime, stime;
1635 1639
1636 BUG_ON(sig == -1); 1640 BUG_ON(sig == -1);
1637 1641
@@ -1669,8 +1673,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1669 task_uid(tsk)); 1673 task_uid(tsk));
1670 rcu_read_unlock(); 1674 rcu_read_unlock();
1671 1675
1672 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1676 task_cputime(tsk, &utime, &stime);
1673 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); 1677 info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
1678 info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
1674 1679
1675 info.si_status = tsk->exit_code & 0x7f; 1680 info.si_status = tsk->exit_code & 0x7f;
1676 if (tsk->exit_code & 0x80) 1681 if (tsk->exit_code & 0x80)
@@ -1734,6 +1739,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1734 unsigned long flags; 1739 unsigned long flags;
1735 struct task_struct *parent; 1740 struct task_struct *parent;
1736 struct sighand_struct *sighand; 1741 struct sighand_struct *sighand;
1742 cputime_t utime, stime;
1737 1743
1738 if (for_ptracer) { 1744 if (for_ptracer) {
1739 parent = tsk->parent; 1745 parent = tsk->parent;
@@ -1752,8 +1758,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1752 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1753 rcu_read_unlock(); 1759 rcu_read_unlock();
1754 1760
1755 info.si_utime = cputime_to_clock_t(tsk->utime); 1761 task_cputime(tsk, &utime, &stime);
1756 info.si_stime = cputime_to_clock_t(tsk->stime); 1762 info.si_utime = cputime_to_clock_t(utime);
1763 info.si_stime = cputime_to_clock_t(stime);
1757 1764
1758 info.si_code = why; 1765 info.si_code = why;
1759 switch (why) { 1766 switch (why) {
@@ -2395,6 +2402,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2395 tracehook_signal_handler(sig, info, ka, regs, stepping); 2402 tracehook_signal_handler(sig, info, ka, regs, stepping);
2396} 2403}
2397 2404
2405void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2406{
2407 if (failed)
2408 force_sigsegv(ksig->sig, current);
2409 else
2410 signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
2411 signal_pt_regs(), stepping);
2412}
2413
2398/* 2414/*
2399 * It could be that complete_signal() picked us to notify about the 2415 * It could be that complete_signal() picked us to notify about the
2400 * group-wide signal. Other threads should be notified now to take 2416 * group-wide signal. Other threads should be notified now to take
@@ -2612,40 +2628,95 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2612 return 0; 2628 return 0;
2613} 2629}
2614 2630
2615long do_sigpending(void __user *set, unsigned long sigsetsize) 2631#ifdef CONFIG_COMPAT
2632COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
2633 compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
2616{ 2634{
2617 long error = -EINVAL; 2635#ifdef __BIG_ENDIAN
2618 sigset_t pending; 2636 sigset_t old_set = current->blocked;
2637
2638 /* XXX: Don't preclude handling different sized sigset_t's. */
2639 if (sigsetsize != sizeof(sigset_t))
2640 return -EINVAL;
2641
2642 if (nset) {
2643 compat_sigset_t new32;
2644 sigset_t new_set;
2645 int error;
2646 if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
2647 return -EFAULT;
2648
2649 sigset_from_compat(&new_set, &new32);
2650 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2651
2652 error = sigprocmask(how, &new_set, NULL);
2653 if (error)
2654 return error;
2655 }
2656 if (oset) {
2657 compat_sigset_t old32;
2658 sigset_to_compat(&old32, &old_set);
2659 if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
2660 return -EFAULT;
2661 }
2662 return 0;
2663#else
2664 return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
2665 (sigset_t __user *)oset, sigsetsize);
2666#endif
2667}
2668#endif
2619 2669
2670static int do_sigpending(void *set, unsigned long sigsetsize)
2671{
2620 if (sigsetsize > sizeof(sigset_t)) 2672 if (sigsetsize > sizeof(sigset_t))
2621 goto out; 2673 return -EINVAL;
2622 2674
2623 spin_lock_irq(&current->sighand->siglock); 2675 spin_lock_irq(&current->sighand->siglock);
2624 sigorsets(&pending, &current->pending.signal, 2676 sigorsets(set, &current->pending.signal,
2625 &current->signal->shared_pending.signal); 2677 &current->signal->shared_pending.signal);
2626 spin_unlock_irq(&current->sighand->siglock); 2678 spin_unlock_irq(&current->sighand->siglock);
2627 2679
2628 /* Outside the lock because only this thread touches it. */ 2680 /* Outside the lock because only this thread touches it. */
2629 sigandsets(&pending, &current->blocked, &pending); 2681 sigandsets(set, &current->blocked, set);
2630 2682 return 0;
2631 error = -EFAULT;
2632 if (!copy_to_user(set, &pending, sigsetsize))
2633 error = 0;
2634
2635out:
2636 return error;
2637} 2683}
2638 2684
2639/** 2685/**
2640 * sys_rt_sigpending - examine a pending signal that has been raised 2686 * sys_rt_sigpending - examine a pending signal that has been raised
2641 * while blocked 2687 * while blocked
2642 * @set: stores pending signals 2688 * @uset: stores pending signals
2643 * @sigsetsize: size of sigset_t type or larger 2689 * @sigsetsize: size of sigset_t type or larger
2644 */ 2690 */
2645SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2691SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
2646{ 2692{
2647 return do_sigpending(set, sigsetsize); 2693 sigset_t set;
2694 int err = do_sigpending(&set, sigsetsize);
2695 if (!err && copy_to_user(uset, &set, sigsetsize))
2696 err = -EFAULT;
2697 return err;
2698}
2699
2700#ifdef CONFIG_COMPAT
2701COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2702 compat_size_t, sigsetsize)
2703{
2704#ifdef __BIG_ENDIAN
2705 sigset_t set;
2706 int err = do_sigpending(&set, sigsetsize);
2707 if (!err) {
2708 compat_sigset_t set32;
2709 sigset_to_compat(&set32, &set);
2710 /* we can get here only if sigsetsize <= sizeof(set) */
2711 if (copy_to_user(uset, &set32, sigsetsize))
2712 err = -EFAULT;
2713 }
2714 return err;
2715#else
2716 return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
2717#endif
2648} 2718}
2719#endif
2649 2720
2650#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2721#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2651 2722
@@ -2923,6 +2994,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2923 return do_tkill(0, pid, sig); 2994 return do_tkill(0, pid, sig);
2924} 2995}
2925 2996
2997static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
2998{
2999 /* Not even root can pretend to send signals from the kernel.
3000 * Nor can they impersonate a kill()/tgkill(), which adds source info.
3001 */
3002 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
3003 (task_pid_vnr(current) != pid)) {
3004 /* We used to allow any < 0 si_code */
3005 WARN_ON_ONCE(info->si_code < 0);
3006 return -EPERM;
3007 }
3008 info->si_signo = sig;
3009
3010 /* POSIX.1b doesn't mention process groups. */
3011 return kill_proc_info(sig, info, pid);
3012}
3013
2926/** 3014/**
2927 * sys_rt_sigqueueinfo - send signal information to a signal 3015 * sys_rt_sigqueueinfo - send signal information to a signal
2928 * @pid: the PID of the thread 3016 * @pid: the PID of the thread
@@ -2933,25 +3021,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2933 siginfo_t __user *, uinfo) 3021 siginfo_t __user *, uinfo)
2934{ 3022{
2935 siginfo_t info; 3023 siginfo_t info;
2936
2937 if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) 3024 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2938 return -EFAULT; 3025 return -EFAULT;
3026 return do_rt_sigqueueinfo(pid, sig, &info);
3027}
2939 3028
2940 /* Not even root can pretend to send signals from the kernel. 3029#ifdef CONFIG_COMPAT
2941 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3030COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
2942 */ 3031 compat_pid_t, pid,
2943 if (info.si_code >= 0 || info.si_code == SI_TKILL) { 3032 int, sig,
2944 /* We used to allow any < 0 si_code */ 3033 struct compat_siginfo __user *, uinfo)
2945 WARN_ON_ONCE(info.si_code < 0); 3034{
2946 return -EPERM; 3035 siginfo_t info;
2947 } 3036 int ret = copy_siginfo_from_user32(&info, uinfo);
2948 info.si_signo = sig; 3037 if (unlikely(ret))
2949 3038 return ret;
2950 /* POSIX.1b doesn't mention process groups. */ 3039 return do_rt_sigqueueinfo(pid, sig, &info);
2951 return kill_proc_info(sig, &info, pid);
2952} 3040}
3041#endif
2953 3042
2954long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) 3043static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2955{ 3044{
2956 /* This is only valid for single tasks */ 3045 /* This is only valid for single tasks */
2957 if (pid <= 0 || tgid <= 0) 3046 if (pid <= 0 || tgid <= 0)
@@ -2960,7 +3049,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2960 /* Not even root can pretend to send signals from the kernel. 3049 /* Not even root can pretend to send signals from the kernel.
2961 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3050 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2962 */ 3051 */
2963 if (info->si_code >= 0 || info->si_code == SI_TKILL) { 3052 if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
3053 (task_pid_vnr(current) != pid)) {
2964 /* We used to allow any < 0 si_code */ 3054 /* We used to allow any < 0 si_code */
2965 WARN_ON_ONCE(info->si_code < 0); 3055 WARN_ON_ONCE(info->si_code < 0);
2966 return -EPERM; 3056 return -EPERM;
@@ -2981,6 +3071,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2981 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); 3071 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2982} 3072}
2983 3073
3074#ifdef CONFIG_COMPAT
3075COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3076 compat_pid_t, tgid,
3077 compat_pid_t, pid,
3078 int, sig,
3079 struct compat_siginfo __user *, uinfo)
3080{
3081 siginfo_t info;
3082
3083 if (copy_siginfo_from_user32(&info, uinfo))
3084 return -EFAULT;
3085 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
3086}
3087#endif
3088
2984int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3089int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2985{ 3090{
2986 struct task_struct *t = current; 3091 struct task_struct *t = current;
@@ -3026,7 +3131,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3026 return 0; 3131 return 0;
3027} 3132}
3028 3133
3029int 3134static int
3030do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3135do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3031{ 3136{
3032 stack_t oss; 3137 stack_t oss;
@@ -3091,12 +3196,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3091out: 3196out:
3092 return error; 3197 return error;
3093} 3198}
3094#ifdef CONFIG_GENERIC_SIGALTSTACK
3095SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) 3199SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3096{ 3200{
3097 return do_sigaltstack(uss, uoss, current_user_stack_pointer()); 3201 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3098} 3202}
3099#endif
3100 3203
3101int restore_altstack(const stack_t __user *uss) 3204int restore_altstack(const stack_t __user *uss)
3102{ 3205{
@@ -3114,7 +3217,6 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
3114} 3217}
3115 3218
3116#ifdef CONFIG_COMPAT 3219#ifdef CONFIG_COMPAT
3117#ifdef CONFIG_GENERIC_SIGALTSTACK
3118COMPAT_SYSCALL_DEFINE2(sigaltstack, 3220COMPAT_SYSCALL_DEFINE2(sigaltstack,
3119 const compat_stack_t __user *, uss_ptr, 3221 const compat_stack_t __user *, uss_ptr,
3120 compat_stack_t __user *, uoss_ptr) 3222 compat_stack_t __user *, uoss_ptr)
@@ -3164,7 +3266,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3164 __put_user(t->sas_ss_size, &uss->ss_size); 3266 __put_user(t->sas_ss_size, &uss->ss_size);
3165} 3267}
3166#endif 3268#endif
3167#endif
3168 3269
3169#ifdef __ARCH_WANT_SYS_SIGPENDING 3270#ifdef __ARCH_WANT_SYS_SIGPENDING
3170 3271
@@ -3174,7 +3275,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3174 */ 3275 */
3175SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 3276SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
3176{ 3277{
3177 return do_sigpending(set, sizeof(*set)); 3278 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
3178} 3279}
3179 3280
3180#endif 3281#endif
@@ -3230,7 +3331,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3230} 3331}
3231#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 3332#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
3232 3333
3233#ifdef __ARCH_WANT_SYS_RT_SIGACTION 3334#ifndef CONFIG_ODD_RT_SIGACTION
3234/** 3335/**
3235 * sys_rt_sigaction - alter an action taken by a process 3336 * sys_rt_sigaction - alter an action taken by a process
3236 * @sig: signal to be sent 3337 * @sig: signal to be sent
@@ -3264,7 +3365,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
3264out: 3365out:
3265 return ret; 3366 return ret;
3266} 3367}
3267#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ 3368#ifdef CONFIG_COMPAT
3369COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3370 const struct compat_sigaction __user *, act,
3371 struct compat_sigaction __user *, oact,
3372 compat_size_t, sigsetsize)
3373{
3374 struct k_sigaction new_ka, old_ka;
3375 compat_sigset_t mask;
3376#ifdef __ARCH_HAS_SA_RESTORER
3377 compat_uptr_t restorer;
3378#endif
3379 int ret;
3380
3381 /* XXX: Don't preclude handling different sized sigset_t's. */
3382 if (sigsetsize != sizeof(compat_sigset_t))
3383 return -EINVAL;
3384
3385 if (act) {
3386 compat_uptr_t handler;
3387 ret = get_user(handler, &act->sa_handler);
3388 new_ka.sa.sa_handler = compat_ptr(handler);
3389#ifdef __ARCH_HAS_SA_RESTORER
3390 ret |= get_user(restorer, &act->sa_restorer);
3391 new_ka.sa.sa_restorer = compat_ptr(restorer);
3392#endif
3393 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3394 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
3395 if (ret)
3396 return -EFAULT;
3397 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
3398 }
3399
3400 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3401 if (!ret && oact) {
3402 sigset_to_compat(&mask, &old_ka.sa.sa_mask);
3403 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3404 &oact->sa_handler);
3405 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3406 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3407#ifdef __ARCH_HAS_SA_RESTORER
3408 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3409 &oact->sa_restorer);
3410#endif
3411 }
3412 return ret;
3413}
3414#endif
3415#endif /* !CONFIG_ODD_RT_SIGACTION */
3416
3417#ifdef CONFIG_OLD_SIGACTION
3418SYSCALL_DEFINE3(sigaction, int, sig,
3419 const struct old_sigaction __user *, act,
3420 struct old_sigaction __user *, oact)
3421{
3422 struct k_sigaction new_ka, old_ka;
3423 int ret;
3424
3425 if (act) {
3426 old_sigset_t mask;
3427 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3428 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
3429 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
3430 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3431 __get_user(mask, &act->sa_mask))
3432 return -EFAULT;
3433#ifdef __ARCH_HAS_KA_RESTORER
3434 new_ka.ka_restorer = NULL;
3435#endif
3436 siginitset(&new_ka.sa.sa_mask, mask);
3437 }
3438
3439 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3440
3441 if (!ret && oact) {
3442 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3443 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
3444 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
3445 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3446 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3447 return -EFAULT;
3448 }
3449
3450 return ret;
3451}
3452#endif
3453#ifdef CONFIG_COMPAT_OLD_SIGACTION
3454COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3455 const struct compat_old_sigaction __user *, act,
3456 struct compat_old_sigaction __user *, oact)
3457{
3458 struct k_sigaction new_ka, old_ka;
3459 int ret;
3460 compat_old_sigset_t mask;
3461 compat_uptr_t handler, restorer;
3462
3463 if (act) {
3464 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3465 __get_user(handler, &act->sa_handler) ||
3466 __get_user(restorer, &act->sa_restorer) ||
3467 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3468 __get_user(mask, &act->sa_mask))
3469 return -EFAULT;
3470
3471#ifdef __ARCH_HAS_KA_RESTORER
3472 new_ka.ka_restorer = NULL;
3473#endif
3474 new_ka.sa.sa_handler = compat_ptr(handler);
3475 new_ka.sa.sa_restorer = compat_ptr(restorer);
3476 siginitset(&new_ka.sa.sa_mask, mask);
3477 }
3478
3479 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3480
3481 if (!ret && oact) {
3482 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3483 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
3484 &oact->sa_handler) ||
3485 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3486 &oact->sa_restorer) ||
3487 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3488 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3489 return -EFAULT;
3490 }
3491 return ret;
3492}
3493#endif
3268 3494
3269#ifdef __ARCH_WANT_SYS_SGETMASK 3495#ifdef __ARCH_WANT_SYS_SGETMASK
3270 3496
@@ -3332,7 +3558,6 @@ int sigsuspend(sigset_t *set)
3332 return -ERESTARTNOHAND; 3558 return -ERESTARTNOHAND;
3333} 3559}
3334 3560
3335#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3336/** 3561/**
3337 * sys_rt_sigsuspend - replace the signal mask for a value with the 3562 * sys_rt_sigsuspend - replace the signal mask for a value with the
3338 * @unewset value until a signal is received 3563 * @unewset value until a signal is received
@@ -3351,7 +3576,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3351 return -EFAULT; 3576 return -EFAULT;
3352 return sigsuspend(&newset); 3577 return sigsuspend(&newset);
3353} 3578}
3354#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3579
3580#ifdef CONFIG_COMPAT
3581COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
3582{
3583#ifdef __BIG_ENDIAN
3584 sigset_t newset;
3585 compat_sigset_t newset32;
3586
3587 /* XXX: Don't preclude handling different sized sigset_t's. */
3588 if (sigsetsize != sizeof(sigset_t))
3589 return -EINVAL;
3590
3591 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
3592 return -EFAULT;
3593 sigset_from_compat(&newset, &newset32);
3594 return sigsuspend(&newset);
3595#else
3596 /* on little-endian bitmaps don't care about granularity */
3597 return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
3598#endif
3599}
3600#endif
3601
3602#ifdef CONFIG_OLD_SIGSUSPEND
3603SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
3604{
3605 sigset_t blocked;
3606 siginitset(&blocked, mask);
3607 return sigsuspend(&blocked);
3608}
3609#endif
3610#ifdef CONFIG_OLD_SIGSUSPEND3
3611SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
3612{
3613 sigset_t blocked;
3614 siginitset(&blocked, mask);
3615 return sigsuspend(&blocked);
3616}
3617#endif
3355 3618
3356__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) 3619__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
3357{ 3620{
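
Several compat paths added above (rt_sigprocmask, rt_sigpending, rt_sigsuspend) only convert through compat_sigset_t under __BIG_ENDIAN and otherwise pass the user pointer straight to the native syscall. The reason is layout: on a little-endian machine a 64-bit signal mask already sits in memory as two consecutive 32-bit compat words in the right order, while on big-endian the word order is swapped. A small self-contained illustration of that difference (plain userspace C, nothing here is a kernel API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* A 64-bit "sigset" with only bit 0 set, i.e. the lowest signal. */
    uint64_t sigset64 = 1;
    uint32_t words[2];

    /* View the same storage as two 32-bit compat words. */
    memcpy(words, &sigset64, sizeof(sigset64));

    if (words[0] == 1 && words[1] == 0)
        printf("little-endian: words[0] holds the low bits -> "
               "compat layout already matches, no conversion needed\n");
    else if (words[0] == 0 && words[1] == 1)
        printf("big-endian: words[1] holds the low bits -> "
               "an explicit sigset_to_compat()-style swap is required\n");
    else
        printf("unexpected layout\n");

    return 0;
}
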
diff --git a/kernel/smp.c b/kernel/smp.c
index 69f38bd98b42..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,22 +16,12 @@
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19static struct {
20 struct list_head queue;
21 raw_spinlock_t lock;
22} call_function __cacheline_aligned_in_smp =
23 {
24 .queue = LIST_HEAD_INIT(call_function.queue),
25 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
26 };
27
28enum { 19enum {
29 CSD_FLAG_LOCK = 0x01, 20 CSD_FLAG_LOCK = 0x01,
30}; 21};
31 22
32struct call_function_data { 23struct call_function_data {
33 struct call_single_data csd; 24 struct call_single_data __percpu *csd;
34 atomic_t refs;
35 cpumask_var_t cpumask; 25 cpumask_var_t cpumask;
36 cpumask_var_t cpumask_ipi; 26 cpumask_var_t cpumask_ipi;
37}; 27};
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
60 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, 50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
61 cpu_to_node(cpu))) 51 cpu_to_node(cpu)))
62 return notifier_from_errno(-ENOMEM); 52 return notifier_from_errno(-ENOMEM);
53 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) {
55 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM);
57 }
63 break; 58 break;
64 59
65#ifdef CONFIG_HOTPLUG_CPU 60#ifdef CONFIG_HOTPLUG_CPU
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
70 case CPU_DEAD_FROZEN: 65 case CPU_DEAD_FROZEN:
71 free_cpumask_var(cfd->cpumask); 66 free_cpumask_var(cfd->cpumask);
72 free_cpumask_var(cfd->cpumask_ipi); 67 free_cpumask_var(cfd->cpumask_ipi);
68 free_percpu(cfd->csd);
73 break; 69 break;
74#endif 70#endif
75 }; 71 };
@@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171} 167}
172 168
173/* 169/*
174 * Invoked by arch to handle an IPI for call function. Must be called with
175 * interrupts disabled.
176 */
177void generic_smp_call_function_interrupt(void)
178{
179 struct call_function_data *data;
180 int cpu = smp_processor_id();
181
182 /*
183 * Shouldn't receive this interrupt on a cpu that is not yet online.
184 */
185 WARN_ON_ONCE(!cpu_online(cpu));
186
187 /*
188 * Ensure entry is visible on call_function_queue after we have
189 * entered the IPI. See comment in smp_call_function_many.
190 * If we don't have this, then we may miss an entry on the list
191 * and never get another IPI to process it.
192 */
193 smp_mb();
194
195 /*
196 * It's ok to use list_for_each_rcu() here even though we may
197 * delete 'pos', since list_del_rcu() doesn't clear ->next
198 */
199 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
200 int refs;
201 smp_call_func_t func;
202
203 /*
204 * Since we walk the list without any locks, we might
205 * see an entry that was completed, removed from the
206 * list and is in the process of being reused.
207 *
208 * We must check that the cpu is in the cpumask before
209 * checking the refs, and both must be set before
210 * executing the callback on this cpu.
211 */
212
213 if (!cpumask_test_cpu(cpu, data->cpumask))
214 continue;
215
216 smp_rmb();
217
218 if (atomic_read(&data->refs) == 0)
219 continue;
220
221 func = data->csd.func; /* save for later warn */
222 func(data->csd.info);
223
224 /*
225 * If the cpu mask is not still set then func enabled
226 * interrupts (BUG), and this cpu took another smp call
227 * function interrupt and executed func(info) twice
228 * on this cpu. That nested execution decremented refs.
229 */
230 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
231 WARN(1, "%pf enabled interrupts and double executed\n", func);
232 continue;
233 }
234
235 refs = atomic_dec_return(&data->refs);
236 WARN_ON(refs < 0);
237
238 if (refs)
239 continue;
240
241 WARN_ON(!cpumask_empty(data->cpumask));
242
243 raw_spin_lock(&call_function.lock);
244 list_del_rcu(&data->csd.list);
245 raw_spin_unlock(&call_function.lock);
246
247 csd_unlock(&data->csd);
248 }
249
250}
251
252/*
253 * Invoked by arch to handle an IPI for call function single. Must be 170 * Invoked by arch to handle an IPI for call function single. Must be
254 * called from the arch with interrupts disabled. 171 * called from the arch with interrupts disabled.
255 */ 172 */
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
453 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
454{ 371{
455 struct call_function_data *data; 372 struct call_function_data *data;
456 unsigned long flags; 373 int cpu, next_cpu, this_cpu = smp_processor_id();
457 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
458 374
459 /* 375 /*
460 * Can deadlock when called with interrupts disabled. 376 * Can deadlock when called with interrupts disabled.
@@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask,
486 } 402 }
487 403
488 data = &__get_cpu_var(cfd_data); 404 data = &__get_cpu_var(cfd_data);
489 csd_lock(&data->csd);
490
491 /* This BUG_ON verifies our reuse assertions and can be removed */
492 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
493
494 /*
495 * The global call function queue list add and delete are protected
496 * by a lock, but the list is traversed without any lock, relying
497 * on the rcu list add and delete to allow safe concurrent traversal.
498 * We reuse the call function data without waiting for any grace
499 * period after some other cpu removes it from the global queue.
500 * This means a cpu might find our data block as it is being
501 * filled out.
502 *
503 * We hold off the interrupt handler on the other cpu by
504 * ordering our writes to the cpu mask vs our setting of the
505 * refs counter. We assert only the cpu owning the data block
506 * will set a bit in cpumask, and each bit will only be cleared
507 * by the subject cpu. Each cpu must first find its bit is
508 * set and then check that refs is set indicating the element is
509 * ready to be processed, otherwise it must skip the entry.
510 *
511 * On the previous iteration refs was set to 0 by another cpu.
512 * To avoid the use of transitivity, set the counter to 0 here
513 * so the wmb will pair with the rmb in the interrupt handler.
514 */
515 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
516
517 data->csd.func = func;
518 data->csd.info = info;
519 405
520 /* Ensure 0 refs is visible before mask. Also orders func and info */
521 smp_wmb();
522
523 /* We rely on the "and" being processed before the store */
524 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(data->cpumask, mask, cpu_online_mask);
525 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, data->cpumask);
526 refs = cpumask_weight(data->cpumask);
527 408
528 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
529 if (unlikely(!refs)) { 410 if (unlikely(!cpumask_weight(data->cpumask)))
530 csd_unlock(&data->csd);
531 return; 411 return;
532 }
533 412
534 /* 413 /*
535 * After we put an entry into the list, data->cpumask 414 * After we put an entry into the list, data->cpumask
@@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask,
537 * a SMP function call, so data->cpumask will be zero. 416 * a SMP function call, so data->cpumask will be zero.
538 */ 417 */
539 cpumask_copy(data->cpumask_ipi, data->cpumask); 418 cpumask_copy(data->cpumask_ipi, data->cpumask);
540 raw_spin_lock_irqsave(&call_function.lock, flags);
541 /*
542 * Place entry at the _HEAD_ of the list, so that any cpu still
543 * observing the entry in generic_smp_call_function_interrupt()
544 * will not miss any other list entries:
545 */
546 list_add_rcu(&data->csd.list, &call_function.queue);
547 /*
548 * We rely on the wmb() in list_add_rcu to complete our writes
549 * to the cpumask before this write to refs, which indicates
550 * data is on the list and is ready to be processed.
551 */
552 atomic_set(&data->refs, refs);
553 raw_spin_unlock_irqrestore(&call_function.lock, flags);
554 419
555 /* 420 for_each_cpu(cpu, data->cpumask) {
556 * Make the list addition visible before sending the ipi. 421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
557 * (IPIs must obey or appear to obey normal Linux cache 422 struct call_single_queue *dst =
558 * coherency rules -- see comment in generic_exec_single). 423 &per_cpu(call_single_queue, cpu);
559 */ 424 unsigned long flags;
560 smp_mb(); 425
426 csd_lock(csd);
427 csd->func = func;
428 csd->info = info;
429
430 raw_spin_lock_irqsave(&dst->lock, flags);
431 list_add_tail(&csd->list, &dst->list);
432 raw_spin_unlock_irqrestore(&dst->lock, flags);
433 }
561 434
562 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
563 arch_send_call_function_ipi_mask(data->cpumask_ipi); 436 arch_send_call_function_ipi_mask(data->cpumask_ipi);
564 437
565 /* Optionally wait for the CPUs to complete */ 438 if (wait) {
566 if (wait) 439 for_each_cpu(cpu, data->cpumask) {
567 csd_lock_wait(&data->csd); 440 struct call_single_data *csd =
441 per_cpu_ptr(data->csd, cpu);
442 csd_lock_wait(csd);
443 }
444 }
568} 445}
569EXPORT_SYMBOL(smp_call_function_many); 446EXPORT_SYMBOL(smp_call_function_many);
570 447
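
The smp.c rewrite above drops the shared, reference-counted call_function descriptor on a global RCU list and instead fills one call_single_data slot per destination CPU, queued on that CPU's own call_single_queue before a single IPI mask is sent. A single-threaded sketch of that one-mailbox-per-destination shape follows; struct mailbox, deliver_ipi() and the fixed NR_CPUS are invented names for the illustration, and real delivery of course happens in the IPI handler, not in a loop in main().

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

typedef void (*smp_func_t)(void *info);

/* One slot per destination, like the per-cpu call_single_data. */
struct mailbox {
    bool pending;
    smp_func_t func;
    void *info;
};

static struct mailbox mailboxes[NR_CPUS];

/* Queue work for every cpu in the mask (bit i == cpu i), then "send IPIs". */
static void call_function_many(unsigned int mask, smp_func_t func, void *info)
{
    for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++) {
        if (!(mask & (1u << cpu)))
            continue;
        mailboxes[cpu].func = func;     /* fill this cpu's own slot */
        mailboxes[cpu].info = info;
        mailboxes[cpu].pending = true;
    }
    /* arch_send_call_function_ipi_mask(mask) would go here. */
}

/* What the IPI handler on each cpu would do: run its own slot only. */
static void deliver_ipi(unsigned int cpu)
{
    struct mailbox *m = &mailboxes[cpu];

    if (m->pending) {
        m->func(m->info);
        m->pending = false;             /* csd_unlock() analogue */
    }
}

static void say_hello(void *info)
{
    printf("hello from handler, info=%s\n", (const char *)info);
}

int main(void)
{
    call_function_many(0x0b /* cpus 0, 1 and 3 */, say_hello, "demo");

    for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
        deliver_ipi(cpu);
    return 0;
}
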
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc054242..8eaed9aa9cf0 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
183 kfree(td); 183 kfree(td);
184 return PTR_ERR(tsk); 184 return PTR_ERR(tsk);
185 } 185 }
186
187 get_task_struct(tsk); 186 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create)
189 ht->create(cpu);
189 return 0; 190 return 0;
190} 191}
191 192
@@ -208,6 +209,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
208{ 209{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 210 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210 211
212 if (ht->pre_unpark)
213 ht->pre_unpark(cpu);
211 kthread_unpark(tsk); 214 kthread_unpark(tsk);
212} 215}
213 216
@@ -225,7 +228,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{ 228{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 229 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227 230
228 if (tsk) 231 if (tsk && !ht->selfparking)
229 kthread_park(tsk); 232 kthread_park(tsk);
230} 233}
231 234
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567babe789..14d7758074aa 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 198 * We restart softirq processing for at most 2 ms,
199 * and we fall back to softirqd after that. 199 * and if need_resched() is not set.
200 * 200 *
201 * This number has been established via experimentation. 201 * These limits have been established via experimentation.
202 * The two things to balance are latency against fairness - 202 * The two things to balance are latency against fairness -
203 * we want to handle softirqs as soon as possible, but they 203 * we want to handle softirqs as soon as possible, but they
204 * should not be able to lock up the box. 204 * should not be able to lock up the box.
205 */ 205 */
206#define MAX_SOFTIRQ_RESTART 10 206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
207 207
208asmlinkage void __do_softirq(void) 208asmlinkage void __do_softirq(void)
209{ 209{
210 struct softirq_action *h; 210 struct softirq_action *h;
211 __u32 pending; 211 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 213 int cpu;
214 unsigned long old_flags = current->flags; 214 unsigned long old_flags = current->flags;
215 215
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 vtime_account_irq_enter(current); 224 account_irq_enter_time(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -264,15 +264,16 @@ restart:
264 local_irq_disable(); 264 local_irq_disable();
265 265
266 pending = local_softirq_pending(); 266 pending = local_softirq_pending();
267 if (pending && --max_restart) 267 if (pending) {
268 goto restart; 268 if (time_before(jiffies, end) && !need_resched())
269 goto restart;
269 270
270 if (pending)
271 wakeup_softirqd(); 271 wakeup_softirqd();
272 }
272 273
273 lockdep_softirq_exit(); 274 lockdep_softirq_exit();
274 275
275 vtime_account_irq_exit(current); 276 account_irq_exit_time(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 277 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 278 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 279}
@@ -322,18 +323,10 @@ void irq_enter(void)
322 323
323static inline void invoke_softirq(void) 324static inline void invoke_softirq(void)
324{ 325{
325 if (!force_irqthreads) { 326 if (!force_irqthreads)
326#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
327 __do_softirq(); 327 __do_softirq();
328#else 328 else
329 do_softirq();
330#endif
331 } else {
332 __local_bh_disable((unsigned long)__builtin_return_address(0),
333 SOFTIRQ_OFFSET);
334 wakeup_softirqd(); 329 wakeup_softirqd();
335 __local_bh_enable(SOFTIRQ_OFFSET);
336 }
337} 330}
338 331
339/* 332/*
@@ -341,9 +334,15 @@ static inline void invoke_softirq(void)
341 */ 334 */
342void irq_exit(void) 335void irq_exit(void)
343{ 336{
344 vtime_account_irq_exit(current); 337#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
338 local_irq_disable();
339#else
340 WARN_ON_ONCE(!irqs_disabled());
341#endif
342
343 account_irq_exit_time(current);
345 trace_hardirq_exit(); 344 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 345 sub_preempt_count(HARDIRQ_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 346 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 347 invoke_softirq();
349 348
@@ -353,7 +352,6 @@ void irq_exit(void)
353 tick_nohz_irq_exit(); 352 tick_nohz_irq_exit();
354#endif 353#endif
355 rcu_irq_exit(); 354 rcu_irq_exit();
356 sched_preempt_enable_no_resched();
357} 355}
358 356
359/* 357/*
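
__do_softirq() above trades the fixed MAX_SOFTIRQ_RESTART loop count for a 2 ms wall-clock budget plus a need_resched() check, punting whatever is still pending to ksoftirqd once either limit trips. A rough userspace sketch of that time-budgeted drain-then-defer pattern, assuming a synthetic backlog and a simulated need_resched(); only the 2 ms figure is taken from the patch.

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define BUDGET_NS (2 * 1000 * 1000LL)   /* roughly MAX_SOFTIRQ_TIME: 2 ms */

static long long now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Stand-in for need_resched(): pretend a reschedule is wanted eventually. */
static bool need_resched_sim(int iterations)
{
    return iterations > 100000;
}

int main(void)
{
    long long end = now_ns() + BUDGET_NS;
    long pending = 1000000;             /* backlog of "softirq" work items */
    long handled = 0;
    int iterations = 0;

restart:
    while (pending > 0) {
        pending--;                      /* handle one item */
        handled++;
        if (++iterations % 1024 == 0)   /* check the limits occasionally */
            break;
    }

    if (pending > 0) {
        if (now_ns() < end && !need_resched_sim(iterations))
            goto restart;               /* keep draining inside the budget */
        printf("deferring %ld items to a background worker (the ksoftirqd role)\n",
               pending);
    }

    printf("handled %ld items inline\n", handled);
    return 0;
}
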
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
282 */ 282 */
283void cleanup_srcu_struct(struct srcu_struct *sp) 283void cleanup_srcu_struct(struct srcu_struct *sp)
284{ 284{
285 int sum; 285 if (WARN_ON(srcu_readers_active(sp)))
286 286 return; /* Leakage unless caller handles error. */
287 sum = srcu_readers_active(sp);
288 WARN_ON(sum); /* Leakage unless caller handles error. */
289 if (sum != 0)
290 return;
291 free_percpu(sp->per_cpu_ref); 287 free_percpu(sp->per_cpu_ref);
292 sp->per_cpu_ref = NULL; 288 sp->per_cpu_ref = NULL;
293} 289}
@@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
302{ 298{
303 int idx; 299 int idx;
304 300
301 idx = ACCESS_ONCE(sp->completed) & 0x1;
305 preempt_disable(); 302 preempt_disable();
306 idx = rcu_dereference_index_check(sp->completed,
307 rcu_read_lock_sched_held()) & 0x1;
308 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
309 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 304 smp_mb(); /* B */ /* Avoid leaking the critical section. */
310 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
321 */ 316 */
322void __srcu_read_unlock(struct srcu_struct *sp, int idx) 317void __srcu_read_unlock(struct srcu_struct *sp, int idx)
323{ 318{
324 preempt_disable();
325 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 319 smp_mb(); /* C */ /* Avoid leaking the critical section. */
326 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; 320 this_cpu_dec(sp->per_cpu_ref->c[idx]);
327 preempt_enable();
328} 321}
329EXPORT_SYMBOL_GPL(__srcu_read_unlock); 322EXPORT_SYMBOL_GPL(__srcu_read_unlock);
330 323
@@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
423 !lock_is_held(&rcu_sched_lock_map), 416 !lock_is_held(&rcu_sched_lock_map),
424 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 417 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
425 418
419 might_sleep();
426 init_completion(&rcu.completion); 420 init_completion(&rcu.completion);
427 421
428 head->next = NULL; 422 head->next = NULL;
@@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
455 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 449 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
456 * @sp: srcu_struct with which to synchronize. 450 * @sp: srcu_struct with which to synchronize.
457 * 451 *
458 * Flip the completed counter, and wait for the old count to drain to zero. 452 * Wait for the count to drain to zero of both indexes. To avoid the
459 * As with classic RCU, the updater must use some separate means of 453 * possible starvation of synchronize_srcu(), it waits for the count of
460 * synchronizing concurrent updates. Can block; must be called from 454 * the index=((->completed & 1) ^ 1) to drain to zero at first,
461 * process context. 455 * and then flip the completed and wait for the count of the other index.
456 *
457 * Can block; must be called from process context.
462 * 458 *
463 * Note that it is illegal to call synchronize_srcu() from the corresponding 459 * Note that it is illegal to call synchronize_srcu() from the corresponding
464 * SRCU read-side critical section; doing so will result in deadlock. 460 * SRCU read-side critical section; doing so will result in deadlock.
@@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
480 * Wait for an SRCU grace period to elapse, but be more aggressive about 476 * Wait for an SRCU grace period to elapse, but be more aggressive about
481 * spinning rather than blocking when waiting. 477 * spinning rather than blocking when waiting.
482 * 478 *
483 * Note that it is illegal to call this function while holding any lock 479 * Note that it is also illegal to call synchronize_srcu_expedited()
484 * that is acquired by a CPU-hotplug notifier. It is also illegal to call 480 * from the corresponding SRCU read-side critical section;
485 * synchronize_srcu_expedited() from the corresponding SRCU read-side 481 * doing so will result in deadlock. However, it is perfectly legal
486 * critical section; doing so will result in deadlock. However, it is 482 * to call synchronize_srcu_expedited() on one srcu_struct from some
487 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 483 * other srcu_struct's read-side critical section, as long as
488 * from some other srcu_struct's read-side critical section, as long as
489 * the resulting graph of srcu_structs is acyclic. 484 * the resulting graph of srcu_structs is acyclic.
490 */ 485 */
491void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
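
The reworked synchronize_srcu() comment above describes the grace period as two drain phases: wait for the currently inactive index ((->completed & 1) ^ 1) to reach zero, flip ->completed, then wait for the other index. The toy below shows only that counter bookkeeping; it is single-threaded, ignores all memory ordering and per-CPU state, and asserts where real SRCU would block, so it illustrates the comment rather than the implementation.

#include <assert.h>
#include <stdio.h>

/* Toy two-index reader accounting; no concurrency, no memory ordering. */
static unsigned long completed;   /* low bit selects the active index */
static long readers[2];

static int toy_read_lock(void)
{
    int idx = completed & 0x1;
    readers[idx]++;
    return idx;
}

static void toy_read_unlock(int idx)
{
    readers[idx]--;
}

/* Mirrors the comment: drain the inactive index, flip, drain the other. */
static void toy_synchronize(void)
{
    int idx = completed & 0x1;        /* index current readers are using */

    assert(readers[idx ^ 1] == 0);    /* 1) old, inactive index is drained */
    completed++;                      /* 2) flip: new readers use idx ^ 1 */
    assert(readers[idx] == 0);        /* 3) and the now-old index drains too */
    /* A real synchronize_srcu() blocks at steps 1 and 3 instead of asserting. */
}

int main(void)
{
    int idx = toy_read_lock();
    printf("reader entered under index %d\n", idx);
    toy_read_unlock(idx);

    toy_synchronize();
    printf("grace period done; active index is now %lu\n", completed & 0x1);
    return 0;
}
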
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e965715..c09f2955ae30 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
@@ -37,10 +37,10 @@ struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */ 38 bool enabled; /* is this stopper enabled? */
39 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
40 struct task_struct *thread; /* stopper thread */
41}; 40};
42 41
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 42static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 44static bool stop_machine_initialized = false;
45 45
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
62} 62}
63 63
64/* queue @work to @stopper. if offline, @work is completed immediately */ 64/* queue @work to @stopper. if offline, @work is completed immediately */
65static void cpu_stop_queue_work(struct cpu_stopper *stopper, 65static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
66 struct cpu_stop_work *work)
67{ 66{
67 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
68 struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
69
68 unsigned long flags; 70 unsigned long flags;
69 71
70 spin_lock_irqsave(&stopper->lock, flags); 72 spin_lock_irqsave(&stopper->lock, flags);
71 73
72 if (stopper->enabled) { 74 if (stopper->enabled) {
73 list_add_tail(&work->list, &stopper->works); 75 list_add_tail(&work->list, &stopper->works);
74 wake_up_process(stopper->thread); 76 wake_up_process(p);
75 } else 77 } else
76 cpu_stop_signal_done(work->done, false); 78 cpu_stop_signal_done(work->done, false);
77 79
@@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
108 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 110 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
109 111
110 cpu_stop_init_done(&done, 1); 112 cpu_stop_init_done(&done, 1);
111 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); 113 cpu_stop_queue_work(cpu, &work);
112 wait_for_completion(&done.completion); 114 wait_for_completion(&done.completion);
113 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
114} 116}
@@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
130 struct cpu_stop_work *work_buf) 132 struct cpu_stop_work *work_buf)
131{ 133{
132 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 134 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
133 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); 135 cpu_stop_queue_work(cpu, work_buf);
134} 136}
135 137
136/* static data for stop_cpus */ 138/* static data for stop_cpus */
@@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 */ 161 */
160 preempt_disable(); 162 preempt_disable();
161 for_each_cpu(cpu, cpumask) 163 for_each_cpu(cpu, cpumask)
162 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
163 &per_cpu(stop_cpus_work, cpu));
164 preempt_enable(); 165 preempt_enable();
165} 166}
166 167
@@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
244 return ret; 245 return ret;
245} 246}
246 247
247static int cpu_stopper_thread(void *data) 248static int cpu_stop_should_run(unsigned int cpu)
249{
250 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
251 unsigned long flags;
252 int run;
253
254 spin_lock_irqsave(&stopper->lock, flags);
255 run = !list_empty(&stopper->works);
256 spin_unlock_irqrestore(&stopper->lock, flags);
257 return run;
258}
259
260static void cpu_stopper_thread(unsigned int cpu)
248{ 261{
249 struct cpu_stopper *stopper = data; 262 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
250 struct cpu_stop_work *work; 263 struct cpu_stop_work *work;
251 int ret; 264 int ret;
252 265
253repeat: 266repeat:
254 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
255
256 if (kthread_should_stop()) {
257 __set_current_state(TASK_RUNNING);
258 return 0;
259 }
260
261 work = NULL; 267 work = NULL;
262 spin_lock_irq(&stopper->lock); 268 spin_lock_irq(&stopper->lock);
263 if (!list_empty(&stopper->works)) { 269 if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@ repeat:
273 struct cpu_stop_done *done = work->done; 279 struct cpu_stop_done *done = work->done;
274 char ksym_buf[KSYM_NAME_LEN] __maybe_unused; 280 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
275 281
276 __set_current_state(TASK_RUNNING);
277
278 /* cpu stop callbacks are not allowed to sleep */ 282 /* cpu stop callbacks are not allowed to sleep */
279 preempt_disable(); 283 preempt_disable();
280 284
@@ -290,88 +294,55 @@ repeat:
290 ksym_buf), arg); 294 ksym_buf), arg);
291 295
292 cpu_stop_signal_done(done, true); 296 cpu_stop_signal_done(done, true);
293 } else 297 goto repeat;
294 schedule(); 298 }
295
296 goto repeat;
297} 299}
298 300
299extern void sched_set_stop_task(int cpu, struct task_struct *stop); 301extern void sched_set_stop_task(int cpu, struct task_struct *stop);
300 302
301/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 303static void cpu_stop_create(unsigned int cpu)
302static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 304{
303 unsigned long action, void *hcpu) 305 sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
306}
307
308static void cpu_stop_park(unsigned int cpu)
304{ 309{
305 unsigned int cpu = (unsigned long)hcpu;
306 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 310 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
307 struct task_struct *p; 311 struct cpu_stop_work *work;
308 312 unsigned long flags;
309 switch (action & ~CPU_TASKS_FROZEN) {
310 case CPU_UP_PREPARE:
311 BUG_ON(stopper->thread || stopper->enabled ||
312 !list_empty(&stopper->works));
313 p = kthread_create_on_node(cpu_stopper_thread,
314 stopper,
315 cpu_to_node(cpu),
316 "migration/%d", cpu);
317 if (IS_ERR(p))
318 return notifier_from_errno(PTR_ERR(p));
319 get_task_struct(p);
320 kthread_bind(p, cpu);
321 sched_set_stop_task(cpu, p);
322 stopper->thread = p;
323 break;
324
325 case CPU_ONLINE:
326 /* strictly unnecessary, as first user will wake it */
327 wake_up_process(stopper->thread);
328 /* mark enabled */
329 spin_lock_irq(&stopper->lock);
330 stopper->enabled = true;
331 spin_unlock_irq(&stopper->lock);
332 break;
333
334#ifdef CONFIG_HOTPLUG_CPU
335 case CPU_UP_CANCELED:
336 case CPU_POST_DEAD:
337 {
338 struct cpu_stop_work *work;
339
340 sched_set_stop_task(cpu, NULL);
341 /* kill the stopper */
342 kthread_stop(stopper->thread);
343 /* drain remaining works */
344 spin_lock_irq(&stopper->lock);
345 list_for_each_entry(work, &stopper->works, list)
346 cpu_stop_signal_done(work->done, false);
347 stopper->enabled = false;
348 spin_unlock_irq(&stopper->lock);
349 /* release the stopper */
350 put_task_struct(stopper->thread);
351 stopper->thread = NULL;
352 break;
353 }
354#endif
355 }
356 313
357 return NOTIFY_OK; 314 /* drain remaining works */
315 spin_lock_irqsave(&stopper->lock, flags);
316 list_for_each_entry(work, &stopper->works, list)
317 cpu_stop_signal_done(work->done, false);
318 stopper->enabled = false;
319 spin_unlock_irqrestore(&stopper->lock, flags);
358} 320}
359 321
360/* 322static void cpu_stop_unpark(unsigned int cpu)
361 * Give it a higher priority so that cpu stopper is available to other 323{
362 * cpu notifiers. It currently shares the same priority as sched 324 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
363 * migration_notifier. 325
364 */ 326 spin_lock_irq(&stopper->lock);
365static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { 327 stopper->enabled = true;
366 .notifier_call = cpu_stop_cpu_callback, 328 spin_unlock_irq(&stopper->lock);
367 .priority = 10, 329}
330
331static struct smp_hotplug_thread cpu_stop_threads = {
332 .store = &cpu_stopper_task,
333 .thread_should_run = cpu_stop_should_run,
334 .thread_fn = cpu_stopper_thread,
335 .thread_comm = "migration/%u",
336 .create = cpu_stop_create,
337 .setup = cpu_stop_unpark,
338 .park = cpu_stop_park,
339 .pre_unpark = cpu_stop_unpark,
340 .selfparking = true,
368}; 341};
369 342
370static int __init cpu_stop_init(void) 343static int __init cpu_stop_init(void)
371{ 344{
372 void *bcpu = (void *)(long)smp_processor_id();
373 unsigned int cpu; 345 unsigned int cpu;
374 int err;
375 346
376 for_each_possible_cpu(cpu) { 347 for_each_possible_cpu(cpu) {
377 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 348 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)
380 INIT_LIST_HEAD(&stopper->works); 351 INIT_LIST_HEAD(&stopper->works);
381 } 352 }
382 353
383 /* start one for the boot cpu */ 354 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
384 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
385 bcpu);
386 BUG_ON(err != NOTIFY_OK);
387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
388 register_cpu_notifier(&cpu_stop_cpu_notifier);
389
390 stop_machine_initialized = true; 355 stop_machine_initialized = true;
391
392 return 0; 356 return 0;
393} 357}
394early_initcall(cpu_stop_init); 358early_initcall(cpu_stop_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b37690421..81f56445fba9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
47#include <linux/syscalls.h> 47#include <linux/syscalls.h>
48#include <linux/kprobes.h> 48#include <linux/kprobes.h>
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h>
50 51
51#include <linux/kmsg_dump.h> 52#include <linux/kmsg_dump.h>
52/* Move somewhere else to avoid recompiling? */ 53/* Move somewhere else to avoid recompiling? */
@@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex);
433SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, 434SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
434 void __user *, arg) 435 void __user *, arg)
435{ 436{
437 struct pid_namespace *pid_ns = task_active_pid_ns(current);
436 char buffer[256]; 438 char buffer[256];
437 int ret = 0; 439 int ret = 0;
438 440
439 /* We only trust the superuser with rebooting the system. */ 441 /* We only trust the superuser with rebooting the system. */
440 if (!capable(CAP_SYS_BOOT)) 442 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
441 return -EPERM; 443 return -EPERM;
442 444
443 /* For safety, we require "magic" arguments. */ 445 /* For safety, we require "magic" arguments. */
@@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
453 * pid_namespace, the command is handled by reboot_pid_ns() which will 455 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit(). 456 * call do_exit().
455 */ 457 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd); 458 ret = reboot_pid_ns(pid_ns, cmd);
457 if (ret) 459 if (ret)
458 return ret; 460 return ret;
459 461
@@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1794static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793{ 1795{
1794 struct fd exe; 1796 struct fd exe;
1795 struct dentry *dentry; 1797 struct inode *inode;
1796 int err; 1798 int err;
1797 1799
1798 exe = fdget(fd); 1800 exe = fdget(fd);
1799 if (!exe.file) 1801 if (!exe.file)
1800 return -EBADF; 1802 return -EBADF;
1801 1803
1802 dentry = exe.file->f_path.dentry; 1804 inode = file_inode(exe.file);
1803 1805
1804 /* 1806 /*
1805 * Because the original mm->exe_file points to executable file, make 1807 * Because the original mm->exe_file points to executable file, make
@@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1807 * overall picture. 1809 * overall picture.
1808 */ 1810 */
1809 err = -EACCES; 1811 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) || 1812 if (!S_ISREG(inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1813 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit; 1814 goto exit;
1813 1815
1814 err = inode_permission(dentry->d_inode, MAY_EXEC); 1816 err = inode_permission(inode, MAY_EXEC);
1815 if (err) 1817 if (err)
1816 goto exit; 1818 goto exit;
1817 1819
@@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2012 2014
2013 error = 0; 2015 error = 0;
2014 switch (option) { 2016 switch (option) {
2015 case PR_SET_PDEATHSIG: 2017 case PR_SET_PDEATHSIG:
2016 if (!valid_signal(arg2)) { 2018 if (!valid_signal(arg2)) {
2017 error = -EINVAL; 2019 error = -EINVAL;
2018 break;
2019 }
2020 me->pdeath_signal = arg2;
2021 break;
2022 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2);
2024 break;
2025 case PR_GET_DUMPABLE:
2026 error = get_dumpable(me->mm);
2027 break; 2020 break;
2028 case PR_SET_DUMPABLE: 2021 }
2029 if (arg2 < 0 || arg2 > 1) { 2022 me->pdeath_signal = arg2;
2030 error = -EINVAL; 2023 break;
2031 break; 2024 case PR_GET_PDEATHSIG:
2032 } 2025 error = put_user(me->pdeath_signal, (int __user *)arg2);
2033 set_dumpable(me->mm, arg2); 2026 break;
2027 case PR_GET_DUMPABLE:
2028 error = get_dumpable(me->mm);
2029 break;
2030 case PR_SET_DUMPABLE:
2031 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
2032 error = -EINVAL;
2034 break; 2033 break;
2034 }
2035 set_dumpable(me->mm, arg2);
2036 break;
2035 2037
2036 case PR_SET_UNALIGN: 2038 case PR_SET_UNALIGN:
2037 error = SET_UNALIGN_CTL(me, arg2); 2039 error = SET_UNALIGN_CTL(me, arg2);
2038 break; 2040 break;
2039 case PR_GET_UNALIGN: 2041 case PR_GET_UNALIGN:
2040 error = GET_UNALIGN_CTL(me, arg2); 2042 error = GET_UNALIGN_CTL(me, arg2);
2041 break; 2043 break;
2042 case PR_SET_FPEMU: 2044 case PR_SET_FPEMU:
2043 error = SET_FPEMU_CTL(me, arg2); 2045 error = SET_FPEMU_CTL(me, arg2);
2044 break; 2046 break;
2045 case PR_GET_FPEMU: 2047 case PR_GET_FPEMU:
2046 error = GET_FPEMU_CTL(me, arg2); 2048 error = GET_FPEMU_CTL(me, arg2);
2047 break; 2049 break;
2048 case PR_SET_FPEXC: 2050 case PR_SET_FPEXC:
2049 error = SET_FPEXC_CTL(me, arg2); 2051 error = SET_FPEXC_CTL(me, arg2);
2050 break; 2052 break;
2051 case PR_GET_FPEXC: 2053 case PR_GET_FPEXC:
2052 error = GET_FPEXC_CTL(me, arg2); 2054 error = GET_FPEXC_CTL(me, arg2);
2053 break; 2055 break;
2054 case PR_GET_TIMING: 2056 case PR_GET_TIMING:
2055 error = PR_TIMING_STATISTICAL; 2057 error = PR_TIMING_STATISTICAL;
2056 break; 2058 break;
2057 case PR_SET_TIMING: 2059 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 2060 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 2061 error = -EINVAL;
2060 break; 2062 break;
2061 case PR_SET_NAME: 2063 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 2064 comm[sizeof(me->comm) - 1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 2065 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 2066 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 2067 return -EFAULT;
2066 set_task_comm(me, comm); 2068 set_task_comm(me, comm);
2067 proc_comm_connector(me); 2069 proc_comm_connector(me);
2068 break; 2070 break;
2069 case PR_GET_NAME: 2071 case PR_GET_NAME:
2070 get_task_comm(comm, me); 2072 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 2073 if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
2072 sizeof(comm))) 2074 return -EFAULT;
2073 return -EFAULT; 2075 break;
2074 break; 2076 case PR_GET_ENDIAN:
2075 case PR_GET_ENDIAN: 2077 error = GET_ENDIAN(me, arg2);
2076 error = GET_ENDIAN(me, arg2); 2078 break;
2077 break; 2079 case PR_SET_ENDIAN:
2078 case PR_SET_ENDIAN: 2080 error = SET_ENDIAN(me, arg2);
2079 error = SET_ENDIAN(me, arg2); 2081 break;
2080 break; 2082 case PR_GET_SECCOMP:
2081 case PR_GET_SECCOMP: 2083 error = prctl_get_seccomp();
2082 error = prctl_get_seccomp(); 2084 break;
2083 break; 2085 case PR_SET_SECCOMP:
2084 case PR_SET_SECCOMP: 2086 error = prctl_set_seccomp(arg2, (char __user *)arg3);
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2087 break;
2086 break; 2088 case PR_GET_TSC:
2087 case PR_GET_TSC: 2089 error = GET_TSC_CTL(arg2);
2088 error = GET_TSC_CTL(arg2); 2090 break;
2089 break; 2091 case PR_SET_TSC:
2090 case PR_SET_TSC: 2092 error = SET_TSC_CTL(arg2);
2091 error = SET_TSC_CTL(arg2); 2093 break;
2092 break; 2094 case PR_TASK_PERF_EVENTS_DISABLE:
2093 case PR_TASK_PERF_EVENTS_DISABLE: 2095 error = perf_event_task_disable();
2094 error = perf_event_task_disable(); 2096 break;
2095 break; 2097 case PR_TASK_PERF_EVENTS_ENABLE:
2096 case PR_TASK_PERF_EVENTS_ENABLE: 2098 error = perf_event_task_enable();
2097 error = perf_event_task_enable(); 2099 break;
2098 break; 2100 case PR_GET_TIMERSLACK:
2099 case PR_GET_TIMERSLACK: 2101 error = current->timer_slack_ns;
2100 error = current->timer_slack_ns; 2102 break;
2101 break; 2103 case PR_SET_TIMERSLACK:
2102 case PR_SET_TIMERSLACK: 2104 if (arg2 <= 0)
2103 if (arg2 <= 0) 2105 current->timer_slack_ns =
2104 current->timer_slack_ns =
2105 current->default_timer_slack_ns; 2106 current->default_timer_slack_ns;
2106 else 2107 else
2107 current->timer_slack_ns = arg2; 2108 current->timer_slack_ns = arg2;
2108 break; 2109 break;
2109 case PR_MCE_KILL: 2110 case PR_MCE_KILL:
2110 if (arg4 | arg5) 2111 if (arg4 | arg5)
2111 return -EINVAL; 2112 return -EINVAL;
2112 switch (arg2) { 2113 switch (arg2) {
2113 case PR_MCE_KILL_CLEAR: 2114 case PR_MCE_KILL_CLEAR:
2114 if (arg3 != 0) 2115 if (arg3 != 0)
2115 return -EINVAL;
2116 current->flags &= ~PF_MCE_PROCESS;
2117 break;
2118 case PR_MCE_KILL_SET:
2119 current->flags |= PF_MCE_PROCESS;
2120 if (arg3 == PR_MCE_KILL_EARLY)
2121 current->flags |= PF_MCE_EARLY;
2122 else if (arg3 == PR_MCE_KILL_LATE)
2123 current->flags &= ~PF_MCE_EARLY;
2124 else if (arg3 == PR_MCE_KILL_DEFAULT)
2125 current->flags &=
2126 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2127 else
2128 return -EINVAL;
2129 break;
2130 default:
2131 return -EINVAL; 2116 return -EINVAL;
2132 } 2117 current->flags &= ~PF_MCE_PROCESS;
2133 break; 2118 break;
2134 case PR_MCE_KILL_GET: 2119 case PR_MCE_KILL_SET:
2135 if (arg2 | arg3 | arg4 | arg5) 2120 current->flags |= PF_MCE_PROCESS;
2136 return -EINVAL; 2121 if (arg3 == PR_MCE_KILL_EARLY)
2137 if (current->flags & PF_MCE_PROCESS) 2122 current->flags |= PF_MCE_EARLY;
2138 error = (current->flags & PF_MCE_EARLY) ? 2123 else if (arg3 == PR_MCE_KILL_LATE)
2139 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2124 current->flags &= ~PF_MCE_EARLY;
2125 else if (arg3 == PR_MCE_KILL_DEFAULT)
2126 current->flags &=
2127 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2140 else 2128 else
2141 error = PR_MCE_KILL_DEFAULT;
2142 break;
2143 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break;
2146 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break;
2149 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2;
2151 break;
2152 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2);
2155 break;
2156 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL; 2129 return -EINVAL;
2159
2160 current->no_new_privs = 1;
2161 break; 2130 break;
2162 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0;
2166 default: 2131 default:
2167 error = -EINVAL; 2132 return -EINVAL;
2168 break; 2133 }
2134 break;
2135 case PR_MCE_KILL_GET:
2136 if (arg2 | arg3 | arg4 | arg5)
2137 return -EINVAL;
2138 if (current->flags & PF_MCE_PROCESS)
2139 error = (current->flags & PF_MCE_EARLY) ?
2140 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2141 else
2142 error = PR_MCE_KILL_DEFAULT;
2143 break;
2144 case PR_SET_MM:
2145 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2146 break;
2147 case PR_GET_TID_ADDRESS:
2148 error = prctl_get_tid_address(me, (int __user **)arg2);
2149 break;
2150 case PR_SET_CHILD_SUBREAPER:
2151 me->signal->is_child_subreaper = !!arg2;
2152 break;
2153 case PR_GET_CHILD_SUBREAPER:
2154 error = put_user(me->signal->is_child_subreaper,
2155 (int __user *)arg2);
2156 break;
2157 case PR_SET_NO_NEW_PRIVS:
2158 if (arg2 != 1 || arg3 || arg4 || arg5)
2159 return -EINVAL;
2160
2161 current->no_new_privs = 1;
2162 break;
2163 case PR_GET_NO_NEW_PRIVS:
2164 if (arg2 || arg3 || arg4 || arg5)
2165 return -EINVAL;
2166 return current->no_new_privs ? 1 : 0;
2167 default:
2168 error = -EINVAL;
2169 break;
2169 } 2170 }
2170 return error; 2171 return error;
2171} 2172}
@@ -2184,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2184 2185
2185char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2186char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2186 2187
2187static void argv_cleanup(struct subprocess_info *info)
2188{
2189 argv_free(info->argv);
2190}
2191
2192static int __orderly_poweroff(void) 2188static int __orderly_poweroff(void)
2193{ 2189{
2194 int argc; 2190 int argc;
@@ -2208,9 +2204,8 @@ static int __orderly_poweroff(void)
2208 } 2204 }
2209 2205
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2206 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
2211 NULL, argv_cleanup, NULL); 2207 NULL, NULL, NULL);
2212 if (ret == -ENOMEM) 2208 argv_free(argv);
2213 argv_free(argv);
2214 2209
2215 return ret; 2210 return ret;
2216} 2211}
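
The sys.c hunks above make two related changes: reboot(2) now checks CAP_SYS_BOOT with ns_capable() against the user namespace that owns the caller's pid namespace (so the namespace-aware path through reboot_pid_ns() is reachable without global root), and prctl_set_mm_exe_file() switches from poking at the dentry to file_inode(). A minimal userspace sketch of the first change, assuming a caller that is "root" only inside a child pid/user namespace; per the comment in the hunk, the command is then handled by reboot_pid_ns() rather than rebooting the host:

    /* reboot_in_ns.c - illustrative only; not part of the patch.
     * Issues reboot(2) from inside a pid namespace. With the ns_capable()
     * check above, CAP_SYS_BOOT in the owning user namespace is sufficient.
     */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/reboot.h>

    int main(void)
    {
            long ret = syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
                               LINUX_REBOOT_MAGIC2,
                               LINUX_REBOOT_CMD_RESTART, NULL);

            if (ret < 0) {
                    perror("reboot");   /* -EPERM without CAP_SYS_BOOT in the ns */
                    return 1;
            }
            return 0;
    }
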
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c88878db491e..afc1dc60f3f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
61#include <linux/kmod.h> 61#include <linux/kmod.h>
62#include <linux/capability.h> 62#include <linux/capability.h>
63#include <linux/binfmts.h> 63#include <linux/binfmts.h>
64#include <linux/sched/sysctl.h>
64 65
65#include <asm/uaccess.h> 66#include <asm/uaccess.h>
66#include <asm/processor.h> 67#include <asm/processor.h>
@@ -104,7 +105,6 @@ extern char core_pattern[];
104extern unsigned int core_pipe_limit; 105extern unsigned int core_pipe_limit;
105#endif 106#endif
106extern int pid_max; 107extern int pid_max;
107extern int min_free_kbytes;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches; 109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 110extern int percpu_pagelist_fraction;
@@ -157,14 +157,20 @@ extern int sysctl_tsb_ratio;
157 157
158#ifdef __hppa__ 158#ifdef __hppa__
159extern int pwrsw_enabled; 159extern int pwrsw_enabled;
160#endif
161
162#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
160extern int unaligned_enabled; 163extern int unaligned_enabled;
161#endif 164#endif
162 165
163#ifdef CONFIG_IA64 166#ifdef CONFIG_IA64
164extern int no_unaligned_warning;
165extern int unaligned_dump_stack; 167extern int unaligned_dump_stack;
166#endif 168#endif
167 169
170#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
171extern int no_unaligned_warning;
172#endif
173
168#ifdef CONFIG_PROC_SYSCTL 174#ifdef CONFIG_PROC_SYSCTL
169static int proc_do_cad_pid(struct ctl_table *table, int write, 175static int proc_do_cad_pid(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 176 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -403,6 +409,13 @@ static struct ctl_table kern_table[] = {
403 .mode = 0644, 409 .mode = 0644,
404 .proc_handler = sched_rt_handler, 410 .proc_handler = sched_rt_handler,
405 }, 411 },
412 {
413 .procname = "sched_rr_timeslice_ms",
414 .data = &sched_rr_timeslice,
415 .maxlen = sizeof(int),
416 .mode = 0644,
417 .proc_handler = sched_rr_handler,
418 },
406#ifdef CONFIG_SCHED_AUTOGROUP 419#ifdef CONFIG_SCHED_AUTOGROUP
407 { 420 {
408 .procname = "sched_autogroup_enabled", 421 .procname = "sched_autogroup_enabled",
@@ -545,6 +558,8 @@ static struct ctl_table kern_table[] = {
545 .mode = 0644, 558 .mode = 0644,
546 .proc_handler = proc_dointvec, 559 .proc_handler = proc_dointvec,
547 }, 560 },
561#endif
562#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
548 { 563 {
549 .procname = "unaligned-trap", 564 .procname = "unaligned-trap",
550 .data = &unaligned_enabled, 565 .data = &unaligned_enabled,
@@ -911,7 +926,7 @@ static struct ctl_table kern_table[] = {
911 .proc_handler = proc_doulongvec_minmax, 926 .proc_handler = proc_doulongvec_minmax,
912 }, 927 },
913#endif 928#endif
914#ifdef CONFIG_IA64 929#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
915 { 930 {
916 .procname = "ignore-unaligned-usertrap", 931 .procname = "ignore-unaligned-usertrap",
917 .data = &no_unaligned_warning, 932 .data = &no_unaligned_warning,
@@ -919,6 +934,8 @@ static struct ctl_table kern_table[] = {
919 .mode = 0644, 934 .mode = 0644,
920 .proc_handler = proc_dointvec, 935 .proc_handler = proc_dointvec,
921 }, 936 },
937#endif
938#ifdef CONFIG_IA64
922 { 939 {
923 .procname = "unaligned-dump-stack", 940 .procname = "unaligned-dump-stack",
924 .data = &unaligned_dump_stack, 941 .data = &unaligned_dump_stack,
@@ -2006,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write,
2006 int i; 2023 int i;
2007 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { 2024 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2008 if ((tmptaint >> i) & 1) 2025 if ((tmptaint >> i) & 1)
2009 add_taint(i); 2026 add_taint(i, LOCKDEP_STILL_OK);
2010 } 2027 }
2011 } 2028 }
2012 2029
@@ -2083,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2083static void validate_coredump_safety(void) 2100static void validate_coredump_safety(void)
2084{ 2101{
2085#ifdef CONFIG_COREDUMP 2102#ifdef CONFIG_COREDUMP
2086 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2103 if (suid_dumpable == SUID_DUMP_ROOT &&
2087 core_pattern[0] != '/' && core_pattern[0] != '|') { 2104 core_pattern[0] != '/' && core_pattern[0] != '|') {
2088 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2105 printk(KERN_WARNING "Unsafe core_pattern used with "\
2089 "suid_dumpable=2. Pipe handler or fully qualified "\ 2106 "suid_dumpable=2. Pipe handler or fully qualified "\
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a6384450501..ebf72358e86a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, 387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, 388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, 389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
971static ssize_t bin_intvec(struct file *file, 970static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 971 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{ 972{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0; 973 ssize_t copied = 0;
976 char *buffer; 974 char *buffer;
977 ssize_t result; 975 ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
984 if (oldval && oldlen) { 982 if (oldval && oldlen) {
985 unsigned __user *vec = oldval; 983 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec); 984 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end; 985 char *str, *end;
989 int i; 986 int i;
990 987
991 set_fs(KERNEL_DS); 988 result = kernel_read(file, 0, buffer, BUFSZ - 1);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0) 989 if (result < 0)
995 goto out_kfree; 990 goto out_kfree;
996 991
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
1017 if (newval && newlen) { 1012 if (newval && newlen) {
1018 unsigned __user *vec = newval; 1013 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec); 1014 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end; 1015 char *str, *end;
1022 int i; 1016 int i;
1023 1017
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file,
1033 str += snprintf(str, end - str, "%lu\t", value); 1027 str += snprintf(str, end - str, "%lu\t", value);
1034 } 1028 }
1035 1029
1036 set_fs(KERNEL_DS); 1030 result = kernel_write(file, buffer, str - buffer, 0);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0) 1031 if (result < 0)
1040 goto out_kfree; 1032 goto out_kfree;
1041 } 1033 }
@@ -1049,7 +1041,6 @@ out:
1049static ssize_t bin_ulongvec(struct file *file, 1041static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1042 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{ 1043{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0; 1044 ssize_t copied = 0;
1054 char *buffer; 1045 char *buffer;
1055 ssize_t result; 1046 ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
1062 if (oldval && oldlen) { 1053 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval; 1054 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec); 1055 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end; 1056 char *str, *end;
1067 int i; 1057 int i;
1068 1058
1069 set_fs(KERNEL_DS); 1059 result = kernel_read(file, 0, buffer, BUFSZ - 1);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0) 1060 if (result < 0)
1073 goto out_kfree; 1061 goto out_kfree;
1074 1062
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (newval && newlen) { 1083 if (newval && newlen) {
1096 unsigned long __user *vec = newval; 1084 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec); 1085 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end; 1086 char *str, *end;
1100 int i; 1087 int i;
1101 1088
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file,
1111 str += snprintf(str, end - str, "%lu\t", value); 1098 str += snprintf(str, end - str, "%lu\t", value);
1112 } 1099 }
1113 1100
1114 set_fs(KERNEL_DS); 1101 result = kernel_write(file, buffer, str - buffer, 0);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0) 1102 if (result < 0)
1118 goto out_kfree; 1103 goto out_kfree;
1119 } 1104 }
@@ -1127,19 +1112,15 @@ out:
1127static ssize_t bin_uuid(struct file *file, 1112static ssize_t bin_uuid(struct file *file,
1128 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1113 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1129{ 1114{
1130 mm_segment_t old_fs = get_fs();
1131 ssize_t result, copied = 0; 1115 ssize_t result, copied = 0;
1132 1116
1133 /* Only supports reads */ 1117 /* Only supports reads */
1134 if (oldval && oldlen) { 1118 if (oldval && oldlen) {
1135 loff_t pos = 0;
1136 char buf[40], *str = buf; 1119 char buf[40], *str = buf;
1137 unsigned char uuid[16]; 1120 unsigned char uuid[16];
1138 int i; 1121 int i;
1139 1122
1140 set_fs(KERNEL_DS); 1123 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1141 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1142 set_fs(old_fs);
1143 if (result < 0) 1124 if (result < 0)
1144 goto out; 1125 goto out;
1145 1126
@@ -1175,18 +1156,14 @@ out:
1175static ssize_t bin_dn_node_address(struct file *file, 1156static ssize_t bin_dn_node_address(struct file *file,
1176 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1157 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1177{ 1158{
1178 mm_segment_t old_fs = get_fs();
1179 ssize_t result, copied = 0; 1159 ssize_t result, copied = 0;
1180 1160
1181 if (oldval && oldlen) { 1161 if (oldval && oldlen) {
1182 loff_t pos = 0;
1183 char buf[15], *nodep; 1162 char buf[15], *nodep;
1184 unsigned long area, node; 1163 unsigned long area, node;
1185 __le16 dnaddr; 1164 __le16 dnaddr;
1186 1165
1187 set_fs(KERNEL_DS); 1166 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1188 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1189 set_fs(old_fs);
1190 if (result < 0) 1167 if (result < 0)
1191 goto out; 1168 goto out;
1192 1169
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
1194 1171
1195 /* Convert the decnet address to binary */ 1172 /* Convert the decnet address to binary */
1196 result = -EIO; 1173 result = -EIO;
1197 nodep = strchr(buf, '.') + 1; 1174 nodep = strchr(buf, '.');
1198 if (!nodep) 1175 if (!nodep)
1199 goto out; 1176 goto out;
1177 ++nodep;
1200 1178
1201 area = simple_strtoul(buf, NULL, 10); 1179 area = simple_strtoul(buf, NULL, 10);
1202 node = simple_strtoul(nodep, NULL, 10); 1180 node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
1215 } 1193 }
1216 1194
1217 if (newval && newlen) { 1195 if (newval && newlen) {
1218 loff_t pos = 0;
1219 __le16 dnaddr; 1196 __le16 dnaddr;
1220 char buf[15]; 1197 char buf[15];
1221 int len; 1198 int len;
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1232 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1233 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1234 1211
1235 set_fs(KERNEL_DS); 1212 result = kernel_write(file, buf, len, 0);
1236 result = vfs_write(file, buf, len, &pos);
1237 set_fs(old_fs);
1238 if (result < 0) 1213 if (result < 0)
1239 goto out; 1214 goto out;
1240 } 1215 }
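
Every sysctl_binary.c hunk above is the same transformation: the get_fs()/set_fs(KERNEL_DS) dance around vfs_read()/vfs_write() on a kernel buffer is replaced by kernel_read()/kernel_write(), which accept kernel pointers directly, so the mm_segment_t and loff_t locals disappear. A condensed sketch of the pattern (kernel-style fragment, not a complete function; the buffer and length names come from the hunks):

    /* Old pattern removed above: temporarily widen the address limit so
     * the VFS accepts a kernel buffer.
     */
            mm_segment_t old_fs = get_fs();
            loff_t pos = 0;

            set_fs(KERNEL_DS);
            result = vfs_read(file, buffer, BUFSZ - 1, &pos);
            set_fs(old_fs);

    /* New pattern: the helpers wrap that sequence and take the offset by value. */
            result = kernel_read(file, 0, buffer, BUFSZ - 1);
            /* ... */
            result = kernel_write(file, buffer, str - buffer, 0);
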
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd28..f8342a41efa6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
115} 115}
116 116
117/* 117/*
118 * Indicates if there is an offset between the system clock and the hardware
119 * clock/persistent clock/rtc.
120 */
121int persistent_clock_is_local;
122
123/*
118 * Adjust the time obtained from the CMOS to be UTC time instead of 124 * Adjust the time obtained from the CMOS to be UTC time instead of
119 * local time. 125 * local time.
120 * 126 *
@@ -135,6 +141,8 @@ static inline void warp_clock(void)
135 struct timespec adjust; 141 struct timespec adjust;
136 142
137 adjust = current_kernel_time(); 143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1;
138 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 146 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 do_settimeofday(&adjust); 147 do_settimeofday(&adjust);
140} 148}
@@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);
232 * Avoid unnecessary multiplications/divisions in the 240 * Avoid unnecessary multiplications/divisions in the
233 * two most common HZ cases: 241 * two most common HZ cases:
234 */ 242 */
235inline unsigned int jiffies_to_msecs(const unsigned long j) 243unsigned int jiffies_to_msecs(const unsigned long j)
236{ 244{
237#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 245#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
238 return (MSEC_PER_SEC / HZ) * j; 246 return (MSEC_PER_SEC / HZ) * j;
@@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
248} 256}
249EXPORT_SYMBOL(jiffies_to_msecs); 257EXPORT_SYMBOL(jiffies_to_msecs);
250 258
251inline unsigned int jiffies_to_usecs(const unsigned long j) 259unsigned int jiffies_to_usecs(const unsigned long j)
252{ 260{
253#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 261#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
254 return (USEC_PER_SEC / HZ) * j; 262 return (USEC_PER_SEC / HZ) * j;
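
The time.c hunk introduces persistent_clock_is_local, latched by warp_clock() whenever userspace configures a non-zero timezone offset, and drops the needless inline on the exported jiffies conversion helpers. The warp itself is plain arithmetic on the timezone's minutes-west value; a standalone sketch of that adjustment (the 300-minute offset is only an example):

    /* warp_demo.c - illustrative only; mirrors the adjustment done in
     * warp_clock(): adjust.tv_sec += sys_tz.tz_minuteswest * 60.
     */
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec adjust = { .tv_sec = 1363687650, .tv_nsec = 0 };
            int tz_minuteswest = 300;   /* e.g. UTC-5, as settimeofday() would pass */

            if (tz_minuteswest != 0) {
                    /* the kernel also sets persistent_clock_is_local = 1 here */
                    adjust.tv_sec += tz_minuteswest * 60L;
            }
            printf("warped by %d s -> %ld\n",
                   tz_minuteswest * 60, (long)adjust.tv_sec);
            return 0;
    }
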
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db1261..24510d84efd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
 15# Platforms with a persistent clock
16config ALWAYS_USE_PERSISTENT_CLOCK
17 bool
18 default n
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD
38 default y 43 default y
39 depends on GENERIC_CLOCKEVENTS 44 depends on GENERIC_CLOCKEVENTS
40 45
46# Architecture can handle broadcast in a driver-agnostic way
47config ARCH_HAS_TICK_BROADCAST
48 bool
49
41# Clockevents broadcasting infrastructure 50# Clockevents broadcasting infrastructure
42config GENERIC_CLOCKEVENTS_BROADCAST 51config GENERIC_CLOCKEVENTS_BROADCAST
43 bool 52 bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977c..c6d6400ee137 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,
339 clockevents_config(dev, freq); 339 clockevents_config(dev, freq);
340 clockevents_register_device(dev); 340 clockevents_register_device(dev);
341} 341}
342EXPORT_SYMBOL_GPL(clockevents_config_and_register);
342 343
343/** 344/**
344 * clockevents_update_freq - Update frequency and reprogram a clock event device. 345 * clockevents_update_freq - Update frequency and reprogram a clock event device.
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669b..072bb066bb7d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rtc.h>
18 19
19#include "tick-internal.h" 20#include "tick-internal.h"
20 21
@@ -22,7 +23,7 @@
22 * NTP timekeeping variables: 23 * NTP timekeeping variables:
23 */ 24 */
24 25
25DEFINE_SPINLOCK(ntp_lock); 26DEFINE_RAW_SPINLOCK(ntp_lock);
26 27
27 28
28/* USER_HZ period (usecs): */ 29/* USER_HZ period (usecs): */
@@ -347,7 +348,7 @@ void ntp_clear(void)
347{ 348{
348 unsigned long flags; 349 unsigned long flags;
349 350
350 spin_lock_irqsave(&ntp_lock, flags); 351 raw_spin_lock_irqsave(&ntp_lock, flags);
351 352
352 time_adjust = 0; /* stop active adjtime() */ 353 time_adjust = 0; /* stop active adjtime() */
353 time_status |= STA_UNSYNC; 354 time_status |= STA_UNSYNC;
@@ -361,7 +362,7 @@ void ntp_clear(void)
361 362
362 /* Clear PPS state variables */ 363 /* Clear PPS state variables */
363 pps_clear(); 364 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags); 365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
365 366
366} 367}
367 368
@@ -371,9 +372,9 @@ u64 ntp_tick_length(void)
371 unsigned long flags; 372 unsigned long flags;
372 s64 ret; 373 s64 ret;
373 374
374 spin_lock_irqsave(&ntp_lock, flags); 375 raw_spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length; 376 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags); 377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret; 378 return ret;
378} 379}
379 380
@@ -394,7 +395,7 @@ int second_overflow(unsigned long secs)
394 int leap = 0; 395 int leap = 0;
395 unsigned long flags; 396 unsigned long flags;
396 397
397 spin_lock_irqsave(&ntp_lock, flags); 398 raw_spin_lock_irqsave(&ntp_lock, flags);
398 399
399 /* 400 /*
400 * Leap second processing. If in leap-insert state at the end of the 401 * Leap second processing. If in leap-insert state at the end of the
@@ -478,13 +479,12 @@ int second_overflow(unsigned long secs)
478 time_adjust = 0; 479 time_adjust = 0;
479 480
480out: 481out:
481 spin_unlock_irqrestore(&ntp_lock, flags); 482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
482 483
483 return leap; 484 return leap;
484} 485}
485 486
486#ifdef CONFIG_GENERIC_CMOS_UPDATE 487#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
487
488static void sync_cmos_clock(struct work_struct *work); 488static void sync_cmos_clock(struct work_struct *work);
489 489
490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)
510 } 510 }
511 511
512 getnstimeofday(&now); 512 getnstimeofday(&now);
513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
514 fail = update_persistent_clock(now); 514 struct timespec adjust = now;
515
516 fail = -ENODEV;
517 if (persistent_clock_is_local)
518 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
519#ifdef CONFIG_GENERIC_CMOS_UPDATE
520 fail = update_persistent_clock(adjust);
521#endif
522#ifdef CONFIG_RTC_SYSTOHC
523 if (fail == -ENODEV)
524 fail = rtc_set_ntp_time(adjust);
525#endif
526 }
515 527
516 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); 528 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
517 if (next.tv_nsec <= 0) 529 if (next.tv_nsec <= 0)
518 next.tv_nsec += NSEC_PER_SEC; 530 next.tv_nsec += NSEC_PER_SEC;
519 531
520 if (!fail) 532 if (!fail || fail == -ENODEV)
521 next.tv_sec = 659; 533 next.tv_sec = 659;
522 else 534 else
523 next.tv_sec = 0; 535 next.tv_sec = 0;
@@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc)
660 672
661 getnstimeofday(&ts); 673 getnstimeofday(&ts);
662 674
663 spin_lock_irq(&ntp_lock); 675 raw_spin_lock_irq(&ntp_lock);
664 676
665 if (txc->modes & ADJ_ADJTIME) { 677 if (txc->modes & ADJ_ADJTIME) {
666 long save_adjust = time_adjust; 678 long save_adjust = time_adjust;
@@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc)
702 /* fill PPS status fields */ 714 /* fill PPS status fields */
703 pps_fill_timex(txc); 715 pps_fill_timex(txc);
704 716
705 spin_unlock_irq(&ntp_lock); 717 raw_spin_unlock_irq(&ntp_lock);
706 718
707 txc->time.tv_sec = ts.tv_sec; 719 txc->time.tv_sec = ts.tv_sec;
708 txc->time.tv_usec = ts.tv_nsec; 720 txc->time.tv_usec = ts.tv_nsec;
@@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900 912
901 pts_norm = pps_normalize_ts(*phase_ts); 913 pts_norm = pps_normalize_ts(*phase_ts);
902 914
903 spin_lock_irqsave(&ntp_lock, flags); 915 raw_spin_lock_irqsave(&ntp_lock, flags);
904 916
905 /* clear the error bits, they will be set again if needed */ 917 /* clear the error bits, they will be set again if needed */
906 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
913 * just start the frequency interval */ 925 * just start the frequency interval */
914 if (unlikely(pps_fbase.tv_sec == 0)) { 926 if (unlikely(pps_fbase.tv_sec == 0)) {
915 pps_fbase = *raw_ts; 927 pps_fbase = *raw_ts;
916 spin_unlock_irqrestore(&ntp_lock, flags); 928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
917 return; 929 return;
918 } 930 }
919 931
@@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
928 time_status |= STA_PPSJITTER; 940 time_status |= STA_PPSJITTER;
929 /* restart the frequency calibration interval */ 941 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts; 942 pps_fbase = *raw_ts;
931 spin_unlock_irqrestore(&ntp_lock, flags); 943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
932 pr_err("hardpps: PPSJITTER: bad pulse\n"); 944 pr_err("hardpps: PPSJITTER: bad pulse\n");
933 return; 945 return;
934 } 946 }
@@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
945 957
946 hardpps_update_phase(pts_norm.nsec); 958 hardpps_update_phase(pts_norm.nsec);
947 959
948 spin_unlock_irqrestore(&ntp_lock, flags); 960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
949} 961}
950EXPORT_SYMBOL(hardpps); 962EXPORT_SYMBOL(hardpps);
951 963
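
In the ntp.c hunks, sync_cmos_clock() now shifts the timestamp back to local time when persistent_clock_is_local is set, tries update_persistent_clock() only under CONFIG_GENERIC_CMOS_UPDATE, and falls back to rtc_set_ntp_time() when that returns -ENODEV (or is not built in); -ENODEV no longer forces the aggressive retry interval. A self-contained model of that decision chain, with the two backends stubbed out (the stubs and their return values are assumptions, not kernel code):

    /* cmos_sync_model.c - illustrative only; models the fallback order in
     * sync_cmos_clock(), with hypothetical stub backends.
     */
    #include <stdio.h>
    #include <errno.h>
    #include <time.h>

    static int persistent_clock_is_local = 1;
    static int tz_minuteswest = 60;                 /* example offset */

    static int update_persistent_clock(struct timespec now)
    {
            (void)now;
            return -ENODEV;                         /* pretend there is no CMOS clock */
    }

    static int rtc_set_ntp_time(struct timespec now)
    {
            printf("RTC set to %ld\n", (long)now.tv_sec);
            return 0;
    }

    int main(void)
    {
            struct timespec now = { .tv_sec = 1363687650, .tv_nsec = 0 };
            struct timespec adjust = now;
            int fail;

            if (persistent_clock_is_local)
                    adjust.tv_sec -= tz_minuteswest * 60L;  /* store local time */

            fail = update_persistent_clock(adjust);
            if (fail == -ENODEV)
                    fail = rtc_set_ntp_time(adjust);

            /* success or -ENODEV -> slow (11 min) resync; other errors retry soon */
            printf("next sync in %d s\n", (!fail || fail == -ENODEV) ? 659 : 0);
            return 0;
    }
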
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..2fb8cb88df8d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h>
21 22
22#include "tick-internal.h" 23#include "tick-internal.h"
23 24
@@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
86 return (dev && tick_broadcast_device.evtdev == dev); 87 return (dev && tick_broadcast_device.evtdev == dev);
87} 88}
88 89
90static void err_broadcast(const struct cpumask *mask)
91{
92 pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
93}
94
95static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
96{
97 if (!dev->broadcast)
98 dev->broadcast = tick_broadcast;
99 if (!dev->broadcast) {
100 pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
101 dev->name);
102 dev->broadcast = err_broadcast;
103 }
104}
105
89/* 106/*
90 * Check, if the device is disfunctional and a place holder, which 107 * Check, if the device is disfunctional and a place holder, which
91 * needs to be handled by the broadcast device. 108 * needs to be handled by the broadcast device.
@@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
105 */ 122 */
106 if (!tick_device_is_functional(dev)) { 123 if (!tick_device_is_functional(dev)) {
107 dev->event_handler = tick_handle_periodic; 124 dev->event_handler = tick_handle_periodic;
125 tick_device_setup_broadcast_func(dev);
108 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 126 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
109 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 127 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
110 ret = 1; 128 ret = 1;
@@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 */ 134 */
117 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 135 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
118 int cpu = smp_processor_id(); 136 int cpu = smp_processor_id();
119
120 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 137 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
121 tick_broadcast_clear_oneshot(cpu); 138 tick_broadcast_clear_oneshot(cpu);
139 } else {
140 tick_device_setup_broadcast_func(dev);
122 } 141 }
123 } 142 }
124 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 143 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
125 return ret; 144 return ret;
126} 145}
127 146
147#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
148int tick_receive_broadcast(void)
149{
150 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
151 struct clock_event_device *evt = td->evtdev;
152
153 if (!evt)
154 return -ENODEV;
155
156 if (!evt->event_handler)
157 return -EINVAL;
158
159 evt->event_handler(evt);
160 return 0;
161}
162#endif
163
128/* 164/*
129 * Broadcast the event to the cpus, which are set in the mask (mangled). 165 * Broadcast the event to the cpus, which are set in the mask (mangled).
130 */ 166 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552d9fd1..a19a39952c1b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -28,7 +29,7 @@
28/* 29/*
29 * Per cpu nohz control structure 30 * Per cpu nohz control structure
30 */ 31 */
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 32DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 33
33/* 34/*
34 * The time, when the last jiffy update happened. Protected by jiffies_lock. 35 * The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
331 time_delta = timekeeping_max_deferment(); 332 time_delta = timekeeping_max_deferment();
332 } while (read_seqretry(&jiffies_lock, seq)); 333 } while (read_seqretry(&jiffies_lock, seq));
333 334
334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 335 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
335 arch_needs_cpu(cpu)) { 336 arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
336 next_jiffies = last_jiffies + 1; 337 next_jiffies = last_jiffies + 1;
337 delta_jiffies = 1; 338 delta_jiffies = 1;
338 } else { 339 } else {
@@ -553,6 +554,7 @@ void tick_nohz_idle_enter(void)
553 554
554 local_irq_enable(); 555 local_irq_enable();
555} 556}
557EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
556 558
557/** 559/**
558 * tick_nohz_irq_exit - update next tick event from interrupt exit 560 * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -631,8 +633,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
631 633
632static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 634static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
633{ 635{
634#ifndef CONFIG_VIRT_CPU_ACCOUNTING 636#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
635 unsigned long ticks; 637 unsigned long ticks;
638
639 if (vtime_accounting_enabled())
640 return;
636 /* 641 /*
637 * We stopped the tick in idle. Update process times would miss the 642 * We stopped the tick in idle. Update process times would miss the
638 * time we slept as update_process_times does only a 1 tick 643 * time we slept as update_process_times does only a 1 tick
@@ -681,6 +686,7 @@ void tick_nohz_idle_exit(void)
681 686
682 local_irq_enable(); 687 local_irq_enable();
683} 688}
689EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
684 690
685static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 691static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
686{ 692{
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb0db3f..9a0bc98fbe1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -29,6 +29,9 @@ static struct timekeeper timekeeper;
29/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
31 31
32/* Flag for if there is a persistent clock on this platform */
33bool __read_mostly persistent_clock_exist = false;
34
32static inline void tk_normalize_xtime(struct timekeeper *tk) 35static inline void tk_normalize_xtime(struct timekeeper *tk)
33{ 36{
34 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 37 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -135,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
135} 138}
136 139
137/* Timekeeper helper functions. */ 140/* Timekeeper helper functions. */
141
142#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
143u32 (*arch_gettimeoffset)(void);
144
145u32 get_arch_timeoffset(void)
146{
147 if (likely(arch_gettimeoffset))
148 return arch_gettimeoffset();
149 return 0;
150}
151#else
152static inline u32 get_arch_timeoffset(void) { return 0; }
153#endif
154
138static inline s64 timekeeping_get_ns(struct timekeeper *tk) 155static inline s64 timekeeping_get_ns(struct timekeeper *tk)
139{ 156{
140 cycle_t cycle_now, cycle_delta; 157 cycle_t cycle_now, cycle_delta;
@@ -151,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
151 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 168 nsec = cycle_delta * tk->mult + tk->xtime_nsec;
152 nsec >>= tk->shift; 169 nsec >>= tk->shift;
153 170
154 /* If arch requires, add in gettimeoffset() */ 171 /* If arch requires, add in get_arch_timeoffset() */
155 return nsec + arch_gettimeoffset(); 172 return nsec + get_arch_timeoffset();
156} 173}
157 174
158static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 175static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
@@ -171,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
171 /* convert delta to nanoseconds. */ 188 /* convert delta to nanoseconds. */
172 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 189 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173 190
174 /* If arch requires, add in gettimeoffset() */ 191 /* If arch requires, add in get_arch_timeoffset() */
175 return nsec + arch_gettimeoffset(); 192 return nsec + get_arch_timeoffset();
176} 193}
177 194
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 195static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
@@ -254,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
254 271
255 tk->xtime_nsec += cycle_delta * tk->mult; 272 tk->xtime_nsec += cycle_delta * tk->mult;
256 273
257 /* If arch requires, add in gettimeoffset() */ 274 /* If arch requires, add in get_arch_timeoffset() */
258 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; 275 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
259 276
260 tk_normalize_xtime(tk); 277 tk_normalize_xtime(tk);
261 278
@@ -264,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)
264} 281}
265 282
266/** 283/**
267 * getnstimeofday - Returns the time of day in a timespec 284 * __getnstimeofday - Returns the time of day in a timespec.
268 * @ts: pointer to the timespec to be set 285 * @ts: pointer to the timespec to be set
269 * 286 *
270 * Returns the time of day in a timespec. 287 * Updates the time of day in the timespec.
288 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
271 */ 289 */
272void getnstimeofday(struct timespec *ts) 290int __getnstimeofday(struct timespec *ts)
273{ 291{
274 struct timekeeper *tk = &timekeeper; 292 struct timekeeper *tk = &timekeeper;
275 unsigned long seq; 293 unsigned long seq;
276 s64 nsecs = 0; 294 s64 nsecs = 0;
277 295
278 WARN_ON(timekeeping_suspended);
279
280 do { 296 do {
281 seq = read_seqbegin(&tk->lock); 297 seq = read_seqbegin(&tk->lock);
282 298
@@ -287,6 +303,26 @@ void getnstimeofday(struct timespec *ts)
287 303
288 ts->tv_nsec = 0; 304 ts->tv_nsec = 0;
289 timespec_add_ns(ts, nsecs); 305 timespec_add_ns(ts, nsecs);
306
307 /*
308 * Do not bail out early, in case there were callers still using
309 * the value, even in the face of the WARN_ON.
310 */
311 if (unlikely(timekeeping_suspended))
312 return -EAGAIN;
313 return 0;
314}
315EXPORT_SYMBOL(__getnstimeofday);
316
317/**
318 * getnstimeofday - Returns the time of day in a timespec.
319 * @ts: pointer to the timespec to be set
320 *
321 * Returns the time of day in a timespec (WARN if suspended).
322 */
323void getnstimeofday(struct timespec *ts)
324{
325 WARN_ON(__getnstimeofday(ts));
290} 326}
291EXPORT_SYMBOL(getnstimeofday); 327EXPORT_SYMBOL(getnstimeofday);
292 328
@@ -640,12 +676,14 @@ void __init timekeeping_init(void)
640 struct timespec now, boot, tmp; 676 struct timespec now, boot, tmp;
641 677
642 read_persistent_clock(&now); 678 read_persistent_clock(&now);
679
643 if (!timespec_valid_strict(&now)) { 680 if (!timespec_valid_strict(&now)) {
644 pr_warn("WARNING: Persistent clock returned invalid value!\n" 681 pr_warn("WARNING: Persistent clock returned invalid value!\n"
645 " Check your CMOS/BIOS settings.\n"); 682 " Check your CMOS/BIOS settings.\n");
646 now.tv_sec = 0; 683 now.tv_sec = 0;
647 now.tv_nsec = 0; 684 now.tv_nsec = 0;
648 } 685 } else if (now.tv_sec || now.tv_nsec)
686 persistent_clock_exist = true;
649 687
650 read_boot_clock(&boot); 688 read_boot_clock(&boot);
651 if (!timespec_valid_strict(&boot)) { 689 if (!timespec_valid_strict(&boot)) {
@@ -718,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
718{ 756{
719 struct timekeeper *tk = &timekeeper; 757 struct timekeeper *tk = &timekeeper;
720 unsigned long flags; 758 unsigned long flags;
721 struct timespec ts;
722 759
723 /* Make sure we don't set the clock twice */ 760 /*
724 read_persistent_clock(&ts); 761 * Make sure we don't set the clock twice, as timekeeping_resume()
725 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 762 * already did it
763 */
764 if (has_persistent_clock())
726 return; 765 return;
727 766
728 write_seqlock_irqsave(&tk->lock, flags); 767 write_seqlock_irqsave(&tk->lock, flags);
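
The timekeeping.c hunks split getnstimeofday() into a non-warning __getnstimeofday() that reports suspension via -EAGAIN instead of WARNing, record whether a usable persistent clock was found at boot (persistent_clock_exist), and let timekeeping_inject_sleeptime() consult has_persistent_clock() instead of re-reading the hardware. A short caller-side fragment for the new variant (the cached fallback value is hypothetical):

    /* Fragment: a reader that must tolerate being called around suspend. */
            struct timespec ts;

            if (__getnstimeofday(&ts) < 0) {
                    /* timekeeping suspended: ts is undefined, fall back */
                    ts = last_good_ts;      /* hypothetical cached timestamp */
            }
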
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
1scale=0
2
3define gcd(a,b) {
4 auto t;
5 while (b) {
6 t = b;
7 b = a % b;
8 a = t;
9 }
10 return a;
11}
12
13/* Division by reciprocal multiplication. */
14define fmul(b,n,d) {
15 return (2^b*n+d-1)/d;
16}
17
18/* Adjustment factor when a ceiling value is used. Use as:
19 (imul * n) + (fmulxx * n + fadjxx) >> xx) */
20define fadj(b,n,d) {
21 auto v;
22 d = d/gcd(n,d);
23 v = 2^b*(d-1)/d;
24 return v;
25}
26
27/* Compute the appropriate mul/adj values as well as a shift count,
28 which brings the mul value into the range 2^b-1 <= x < 2^b. Such
29 a shift value will be correct in the signed integer range and off
30 by at most one in the upper half of the unsigned range. */
31define fmuls(b,n,d) {
32 auto s, m;
33 for (s = 0; 1; s++) {
34 m = fmul(s,n,d);
35 if (m >= 2^(b-1))
36 return s;
37 }
38 return 0;
39}
40
41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n"
45
46 print "#ifndef KERNEL_TIMECONST_H\n"
47 print "#define KERNEL_TIMECONST_H\n\n"
48
49 print "#include <linux/param.h>\n"
50 print "#include <linux/types.h>\n\n"
51
52 print "#if HZ != ", hz, "\n"
53 print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
54 print "#endif\n\n"
55
56 if (hz < 2) {
57 print "#error Totally bogus HZ value!\n"
58 } else {
59 s=fmuls(32,1000,hz)
60 obase=16
61 print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
62 print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
63 obase=10
64 print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
65
66 s=fmuls(32,hz,1000)
67 obase=16
68 print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
69 print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
70 obase=10
71 print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
72
73 obase=10
74 cd=gcd(hz,1000)
75 print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
76 print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
77 print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
78 print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
79 print "\n"
80
81 s=fmuls(32,1000000,hz)
82 obase=16
83 print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
84 print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
85 obase=10
86 print "#define HZ_TO_USEC_SHR32\t", s, "\n"
87
88 s=fmuls(32,hz,1000000)
89 obase=16
90 print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
91 print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
92 obase=10
93 print "#define USEC_TO_HZ_SHR32\t", s, "\n"
94
95 obase=10
96 cd=gcd(hz,1000000)
97 print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
98 print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
99 print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
100 print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
101 print "\n"
102
103 print "#endif /* KERNEL_TIMECONST_H */\n"
104 }
105 halt
106}
107
108timeconst(hz)
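
timeconst.bc replaces the Perl generator below with a bc script that emits the same reciprocal-multiplication constants: fmul(b,n,d) is ceil(2^b*n/d), fadj() the matching ceiling adjustment, and fmuls() picks the smallest shift that pushes the multiplier into [2^(b-1), 2^b). A small standalone C check of that scheme for HZ=300; the spot-checked jiffies values are arbitrary, and the multiply-then-shift use of the constants follows the script's comments:

    /* timeconst_check.c - illustrative only; mirrors fmul()/fmuls() from
     * timeconst.bc and spot-checks jiffies -> msec conversion done by one
     * multiply and one shift against plain integer division.
     */
    #include <stdio.h>
    #include <stdint.h>

    static uint64_t fmul(unsigned b, uint64_t n, uint64_t d)
    {
            return ((UINT64_C(1) << b) * n + d - 1) / d;    /* ceil(2^b * n / d) */
    }

    static unsigned fmuls(unsigned b, uint64_t n, uint64_t d)
    {
            unsigned s;

            for (s = 0; ; s++)      /* smallest s with fmul(s) >= 2^(b-1) */
                    if (fmul(s, n, d) >= (UINT64_C(1) << (b - 1)))
                            return s;
    }

    int main(void)
    {
            const uint64_t hz = 300, msec = 1000;
            unsigned s = fmuls(32, msec, hz);       /* 30 for HZ=300 */
            uint64_t mul = fmul(s, msec, hz);       /* 0xd5555556 for HZ=300 */
            uint64_t samples[] = { 1, 3, 250, 1024, 123456, 3000000 };
            unsigned i;

            printf("HZ=%llu: MUL32=%#llx SHR32=%u\n",
                   (unsigned long long)hz, (unsigned long long)mul, s);
            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    uint64_t j = samples[i];
                    printf("j=%7llu  fast=%llu  exact=%llu\n",
                           (unsigned long long)j,
                           (unsigned long long)((j * mul) >> s),
                           (unsigned long long)(j * msec / hz));
            }
            return 0;
    }
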
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index eb51d76e058a..000000000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,378 +0,0 @@
1#!/usr/bin/perl
2# -----------------------------------------------------------------------
3#
4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5#
6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your
8# option) any later version; incorporated herein by reference.
9#
10# -----------------------------------------------------------------------
11#
12
13#
14# Usage: timeconst.pl HZ > timeconst.h
15#
16
17# Precomputed values for systems without Math::BigInt
18# Generated by:
19# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
20%canned_values = (
21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26,
23 125,3,
24 '0xc49ba5e4','0x1fbe76c8b4',37,
25 3,125,
26 '0xa2c2aaab','0xaaaa',16,
27 125000,3,
28 '0xc9539b89','0x7fffbce4217d',47,
29 3,125000,
30 ], 32 => [
31 '0xfa000000','0x6000000',27,
32 125,4,
33 '0x83126e98','0xfdf3b645a',36,
34 4,125,
35 '0xf4240000','0x0',17,
36 31250,1,
37 '0x8637bd06','0x3fff79c842fa',46,
38 1,31250,
39 ], 48 => [
40 '0xa6aaaaab','0x6aaaaaa',27,
41 125,6,
42 '0xc49ba5e4','0xfdf3b645a',36,
43 6,125,
44 '0xa2c2aaab','0x15555',17,
45 62500,3,
46 '0xc9539b89','0x3fffbce4217d',46,
47 3,62500,
48 ], 64 => [
49 '0xfa000000','0xe000000',28,
50 125,8,
51 '0x83126e98','0x7ef9db22d',35,
52 8,125,
53 '0xf4240000','0x0',18,
54 15625,1,
55 '0x8637bd06','0x1fff79c842fa',45,
56 1,15625,
57 ], 100 => [
58 '0xa0000000','0x0',28,
59 10,1,
60 '0xcccccccd','0x733333333',35,
61 1,10,
62 '0x9c400000','0x0',18,
63 10000,1,
64 '0xd1b71759','0x1fff2e48e8a7',45,
65 1,10000,
66 ], 122 => [
67 '0x8325c53f','0xfbcda3a',28,
68 500,61,
69 '0xf9db22d1','0x7fbe76c8b',35,
70 61,500,
71 '0x8012e2a0','0x3ef36',18,
72 500000,61,
73 '0xffda4053','0x1ffffbce4217',45,
74 61,500000,
75 ], 128 => [
76 '0xfa000000','0x1e000000',29,
77 125,16,
78 '0x83126e98','0x3f7ced916',34,
79 16,125,
80 '0xf4240000','0x40000',19,
81 15625,2,
82 '0x8637bd06','0xfffbce4217d',44,
83 2,15625,
84 ], 200 => [
85 '0xa0000000','0x0',29,
86 5,1,
87 '0xcccccccd','0x333333333',34,
88 1,5,
89 '0x9c400000','0x0',19,
90 5000,1,
91 '0xd1b71759','0xfff2e48e8a7',44,
92 1,5000,
93 ], 250 => [
94 '0x80000000','0x0',29,
95 4,1,
96 '0x80000000','0x180000000',33,
97 1,4,
98 '0xfa000000','0x0',20,
99 4000,1,
100 '0x83126e98','0x7ff7ced9168',43,
101 1,4000,
102 ], 256 => [
103 '0xfa000000','0x3e000000',30,
104 125,32,
105 '0x83126e98','0x1fbe76c8b',33,
106 32,125,
107 '0xf4240000','0xc0000',20,
108 15625,4,
109 '0x8637bd06','0x7ffde7210be',43,
110 4,15625,
111 ], 300 => [
112 '0xd5555556','0x2aaaaaaa',30,
113 10,3,
114 '0x9999999a','0x1cccccccc',33,
115 3,10,
116 '0xd0555556','0xaaaaa',20,
117 10000,3,
118 '0x9d495183','0x7ffcb923a29',43,
119 3,10000,
120 ], 512 => [
121 '0xfa000000','0x7e000000',31,
122 125,64,
123 '0x83126e98','0xfdf3b645',32,
124 64,125,
125 '0xf4240000','0x1c0000',21,
126 15625,8,
127 '0x8637bd06','0x3ffef39085f',42,
128 8,15625,
129 ], 1000 => [
130 '0x80000000','0x0',31,
131 1,1,
132 '0x80000000','0x0',31,
133 1,1,
134 '0xfa000000','0x0',22,
135 1000,1,
136 '0x83126e98','0x1ff7ced9168',41,
137 1,1000,
138 ], 1024 => [
139 '0xfa000000','0xfe000000',32,
140 125,128,
141 '0x83126e98','0x7ef9db22',31,
142 128,125,
143 '0xf4240000','0x3c0000',22,
144 15625,16,
145 '0x8637bd06','0x1fff79c842f',41,
146 16,15625,
147 ], 1200 => [
148 '0xd5555556','0xd5555555',32,
149 5,6,
150 '0x9999999a','0x66666666',31,
151 6,5,
152 '0xd0555556','0x2aaaaa',22,
153 2500,3,
154 '0x9d495183','0x1ffcb923a29',41,
155 3,2500,
156 ]
157);
158
159$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
160
161sub bint($)
162{
163 my($x) = @_;
164 return Math::BigInt->new($x);
165}
166
167#
168# Constants for division by reciprocal multiplication.
169# (bits, numerator, denominator)
170#
171sub fmul($$$)
172{
173 my ($b,$n,$d) = @_;
174
175 $n = bint($n);
176 $d = bint($d);
177
178 return scalar (($n << $b)+$d-bint(1))/$d;
179}
180
181sub fadj($$$)
182{
183 my($b,$n,$d) = @_;
184
185 $n = bint($n);
186 $d = bint($d);
187
188 $d = $d/bgcd($n, $d);
189 return scalar (($d-bint(1)) << $b)/$d;
190}
191
192sub fmuls($$$) {
193 my($b,$n,$d) = @_;
194 my($s,$m);
195 my($thres) = bint(1) << ($b-1);
196
197 $n = bint($n);
198 $d = bint($d);
199
200 for ($s = 0; 1; $s++) {
201 $m = fmul($s,$n,$d);
202 return $s if ($m >= $thres);
203 }
204 return 0;
205}
206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
216# Provides mul, adj, and shr factors for a specific
217# (bit, time, hz) combination
218sub muladj($$$) {
219 my($b, $t, $hz) = @_;
220 my $s = fmuls($b, $t, $hz);
221 my $m = fmul($s, $t, $hz);
222 my $a = fadj($s, $t, $hz);
223 return (bignum_hex($m), bignum_hex($a), $s);
224}
225
226# Provides numerator, denominator values
227sub numden($$) {
228 my($n, $d) = @_;
229 my $g = bgcd($n, $d);
230 return ($n/$g, $d/$g);
231}
232
233# All values for a specific (time, hz) combo
234sub conversions($$) {
235 my ($t, $hz) = @_;
236 my @val = ();
237
238 # HZ_TO_xx
239 push(@val, muladj(32, $t, $hz));
240 push(@val, numden($t, $hz));
241
242 # xx_TO_HZ
243 push(@val, muladj(32, $hz, $t));
244 push(@val, numden($hz, $t));
245
246 return @val;
247}
248
249sub compute_values($) {
250 my($hz) = @_;
251 my @val = ();
252 my $s, $m, $a, $g;
253
254 if (!$has_bigint) {
255 die "$0: HZ == $hz not canned and ".
256 "Math::BigInt not available\n";
257 }
258
259 # MSEC conversions
260 push(@val, conversions(1000, $hz));
261
262 # USEC conversions
263 push(@val, conversions(1000000, $hz));
264
265 return @val;
266}
267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
281sub output($@)
282{
283 my($hz, @val) = @_;
284 my $pfx, $bit, $suf, $s, $m, $a;
285
286 print "/* Automatically generated by kernel/timeconst.pl */\n";
287 print "/* Conversion constants for HZ == $hz */\n";
288 print "\n";
289 print "#ifndef KERNEL_TIMECONST_H\n";
290 print "#define KERNEL_TIMECONST_H\n";
291 print "\n";
292
293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
295
296 print "\n";
297 print "#if HZ != $hz\n";
298 print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
299 print "#endif\n";
300 print "\n";
301
302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
303 'HZ_TO_USEC','USEC_TO_HZ') {
304 foreach $bit (32) {
305 foreach $suf ('MUL', 'ADJ', 'SHR') {
306 outputval("${pfx}_$suf$bit", shift(@val));
307 }
308 }
309 foreach $suf ('NUM', 'DEN') {
310 outputval("${pfx}_$suf", shift(@val));
311 }
312 }
313
314 print "\n";
315 print "#endif /* KERNEL_TIMECONST_H */\n";
316}
317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
335($hz) = @ARGV;
336
337# Use this to generate the %canned_values structure
338if ($hz eq '--can') {
339 shift(@ARGV);
340 @hzlist = sort {$a <=> $b} (@ARGV);
341
342 print "# Precomputed values for systems without Math::BigInt\n";
343 print "# Generated by:\n";
344 print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
345 print "\%canned_values = (\n";
346 my $pf = "\t";
347 foreach $hz (@hzlist) {
348 my @values = compute_values($hz);
349 print "$pf$hz => [\n";
350 while (scalar(@values)) {
351 my $bit;
352 foreach $bit (32) {
353 my $m = shift(@values);
354 my $a = shift(@values);
355 my $s = shift(@values);
356 print "\t\t", perlvals($m,$a,$s), ",\n";
357 }
358 my $n = shift(@values);
359 my $d = shift(@values);
360 print "\t\t", perlvals($n,$d), ",\n";
361 }
362 print "\t]";
363 $pf = ', ';
364 }
365 print "\n);\n";
366} else {
367 $hz += 0; # Force to number
368 if ($hz < 1) {
369 die "Usage: $0 HZ\n";
370 }
371
372 @val = @{$canned_values{$hz}};
373 if (!defined(@val)) {
374 @val = compute_values($hz);
375 }
376 output($hz, @val);
377}
378exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d00858482..dbf7a78a1ef1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/irq_work.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43 44
44#include <asm/uaccess.h> 45#include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)
1351 account_process_tick(p, user_tick); 1352 account_process_tick(p, user_tick);
1352 run_local_timers(); 1353 run_local_timers();
1353 rcu_check_callbacks(cpu, user_tick); 1354 rcu_check_callbacks(cpu, user_tick);
1354 printk_tick();
1355#ifdef CONFIG_IRQ_WORK 1355#ifdef CONFIG_IRQ_WORK
1356 if (in_irq()) 1356 if (in_irq())
1357 irq_work_run(); 1357 irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485f..fc382d6e2765 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
39 help 39 help
40 See Documentation/trace/ftrace-design.txt 40 See Documentation/trace/ftrace-design.txt
41 41
42config HAVE_DYNAMIC_FTRACE_WITH_REGS
43 bool
44
42config HAVE_FTRACE_MCOUNT_RECORD 45config HAVE_FTRACE_MCOUNT_RECORD
43 bool 46 bool
44 help 47 help
@@ -78,21 +81,6 @@ config EVENT_TRACING
78 select CONTEXT_SWITCH_TRACER 81 select CONTEXT_SWITCH_TRACER
79 bool 82 bool
80 83
81config EVENT_POWER_TRACING_DEPRECATED
82 depends on EVENT_TRACING
83 bool "Deprecated power event trace API, to be removed"
84 default y
85 help
86 Provides old power event types:
87 C-state/idle accounting events:
88 power:power_start
89 power:power_end
90 and old cpufreq accounting event:
91 power:power_frequency
92 This is for userspace compatibility
93 and will vanish after 5 kernel iterations,
94 namely 3.1.
95
96config CONTEXT_SWITCH_TRACER 84config CONTEXT_SWITCH_TRACER
97 bool 85 bool
98 86
@@ -250,6 +238,16 @@ config FTRACE_SYSCALLS
250 help 238 help
251 Basic tracer to catch the syscall entry and exit events. 239 Basic tracer to catch the syscall entry and exit events.
252 240
241config TRACER_SNAPSHOT
242 bool "Create a snapshot trace buffer"
243 select TRACER_MAX_TRACE
244 help
245 Allow tracing users to take snapshot of the current buffer using the
246 ftrace interface, e.g.:
247
248 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot
250
253config TRACE_BRANCH_PROFILING 251config TRACE_BRANCH_PROFILING
254 bool 252 bool
255 select GENERIC_TRACER 253 select GENERIC_TRACER
@@ -416,23 +414,32 @@ config PROBE_EVENTS
416 def_bool n 414 def_bool n
417 415
418config DYNAMIC_FTRACE 416config DYNAMIC_FTRACE
419 bool "enable/disable ftrace tracepoints dynamically" 417 bool "enable/disable function tracing dynamically"
420 depends on FUNCTION_TRACER 418 depends on FUNCTION_TRACER
421 depends on HAVE_DYNAMIC_FTRACE 419 depends on HAVE_DYNAMIC_FTRACE
422 default y 420 default y
423 help 421 help
424 This option will modify all the calls to ftrace dynamically 422 This option will modify all the calls to function tracing
425 (will patch them out of the binary image and replace them 423 dynamically (will patch them out of the binary image and
426 with a No-Op instruction) as they are called. A table is 424 replace them with a No-Op instruction) on boot up. During
427 created to dynamically enable them again. 425 compile time, a table is made of all the locations that ftrace
426 can function trace, and this table is linked into the kernel
427 image. When this is enabled, functions can be individually
428 enabled, and the functions not enabled will not affect
429 performance of the system.
430
431 See the files in /sys/kernel/debug/tracing:
432 available_filter_functions
433 set_ftrace_filter
434 set_ftrace_notrace
428 435
429 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but 436 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
430 otherwise has native performance as long as no tracing is active. 437 otherwise has native performance as long as no tracing is active.
431 438
432 The changes to the code are done by a kernel thread that 439config DYNAMIC_FTRACE_WITH_REGS
433 wakes up once a second and checks to see if any ftrace calls 440 def_bool y
434 were made. If so, it runs stop_machine (stops all CPUS) 441 depends on DYNAMIC_FTRACE
435 and modifies the code to jump over the call to ftrace. 442 depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
436 443
437config FUNCTION_PROFILER 444config FUNCTION_PROFILER
438 bool "Kernel function profiler" 445 bool "Kernel function profiler"
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..9e5b8c272eec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
147 return; 147 return;
148 148
149 local_irq_save(flags); 149 local_irq_save(flags);
150 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 150 buf = this_cpu_ptr(bt->msg_data);
151 va_start(args, fmt); 151 va_start(args, fmt);
152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); 152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
153 va_end(args); 153 va_end(args);
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore,
739 struct request_queue *q, 739 struct request_queue *q,
740 struct request *rq) 740 struct request *rq)
741{ 741{
742 struct blk_trace *bt = q->blk_trace;
743
744 /* if control ever passes through here, it's a request based driver */
745 if (unlikely(bt && !bt->rq_based))
746 bt->rq_based = true;
747
742 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 748 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
743} 749}
744 750
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore,
774 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 780 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
775} 781}
776 782
777static void blk_add_trace_bio_complete(void *ignore, 783static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
778 struct request_queue *q, struct bio *bio,
779 int error)
780{ 784{
785 struct request_queue *q;
786 struct blk_trace *bt;
787
788 if (!bio->bi_bdev)
789 return;
790
791 q = bdev_get_queue(bio->bi_bdev);
792 bt = q->blk_trace;
793
794 /*
795 * Request based drivers will generate both rq and bio completions.
796 * Ignore bio ones.
797 */
798 if (likely(!bt) || bt->rq_based)
799 return;
800
781 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 801 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
782} 802}
783 803
784static void blk_add_trace_bio_backmerge(void *ignore, 804static void blk_add_trace_bio_backmerge(void *ignore,
785 struct request_queue *q, 805 struct request_queue *q,
806 struct request *rq,
786 struct bio *bio) 807 struct bio *bio)
787{ 808{
788 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); 809 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
790 811
791static void blk_add_trace_bio_frontmerge(void *ignore, 812static void blk_add_trace_bio_frontmerge(void *ignore,
792 struct request_queue *q, 813 struct request_queue *q,
814 struct request *rq,
793 struct bio *bio) 815 struct bio *bio)
794{ 816{
795 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); 817 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
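
Note: taken together, the two completion hooks above form a simple latch — the first request-level completion marks the blk_trace as request based, after which bio-level completions are dropped so request-based drivers do not record every completion twice. A stripped-down sketch of that suppression pattern follows; the struct and function names are placeholders, not the blktrace types.

    #include <stdbool.h>

    struct trace_state {
        bool rq_based;              /* latched by the first rq completion */
    };

    static void on_rq_complete(struct trace_state *bt)
    {
        if (bt && !bt->rq_based)
            bt->rq_based = true;    /* remember: this queue is request based */
        /* ... emit the completion record ... */
    }

    static void on_bio_complete(struct trace_state *bt)
    {
        if (!bt || bt->rq_based)
            return;                 /* the rq path already records completions */
        /* ... emit the completion record ... */
    }
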
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 41473b4ad7a4..ab25b88aae56 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif 112#endif
113 113
114/*
115 * Traverse the ftrace_global_list, invoking all entries. The reason that we
116 * can use rcu_dereference_raw() is that elements removed from this list
117 * are simply leaked, so there is no need to interact with a grace-period
118 * mechanism. The rcu_dereference_raw() calls are needed to handle
119 * concurrent insertions into the ftrace_global_list.
120 *
121 * Silly Alpha and silly pointer-speculation compiler optimizations!
122 */
123#define do_for_each_ftrace_op(op, list) \
124 op = rcu_dereference_raw(list); \
125 do
126
127/*
128 * Optimized for just a single item in the list (as that is the normal case).
129 */
130#define while_for_each_ftrace_op(op) \
131 while (likely(op = rcu_dereference_raw((op)->next)) && \
132 unlikely((op) != &ftrace_list_end))
133
114/** 134/**
115 * ftrace_nr_registered_ops - return number of ops registered 135 * ftrace_nr_registered_ops - return number of ops registered
116 * 136 *
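
Note: the two macros introduced above are written as an open do/while so the call site supplies the loop body between them. Expanded by hand, a walk like the one added further down in ftrace_global_list_func() comes out roughly as the sketch below (the body line is a placeholder); because the body runs before the first termination test, the ftrace_list_end sentinel is expected to carry a harmless stub callback, and the common single-entry list costs only one extra dereference.

        struct ftrace_ops *op;

        /* do_for_each_ftrace_op(op, list) { body } while_for_each_ftrace_op(op); */
        op = rcu_dereference_raw(list);
        do {
            op->func(ip, parent_ip, op, regs);    /* caller-supplied body */
        } while (likely(op = rcu_dereference_raw((op)->next)) &&
                 unlikely((op) != &ftrace_list_end));
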
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
132 return cnt; 152 return cnt;
133} 153}
134 154
135/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list.
141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */
144static void 155static void
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 156ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 157 struct ftrace_ops *op, struct pt_regs *regs)
147{ 158{
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 159 int bit;
160
161 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
162 if (bit < 0)
149 return; 163 return;
150 164
151 trace_recursion_set(TRACE_GLOBAL_BIT); 165 do_for_each_ftrace_op(op, ftrace_global_list) {
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 166 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 167 } while_for_each_ftrace_op(op);
156 }; 168
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 169 trace_clear_recursion(bit);
158} 170}
159 171
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 172static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
221 * registered callers. 233 * registered callers.
222 */ 234 */
223 if (ftrace_global_list == &ftrace_list_end || 235 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 236 ftrace_global_list->next == &ftrace_list_end) {
225 func = ftrace_global_list->func; 237 func = ftrace_global_list->func;
226 else 238 /*
239 * As we are calling the function directly.
240 * If it does not have recursion protection,
241 * the function_trace_op needs to be updated
242 * accordingly.
243 */
244 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
245 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
246 else
247 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
248 } else {
227 func = ftrace_global_list_func; 249 func = ftrace_global_list_func;
250 /* The list has its own recursion protection. */
251 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
252 }
253
228 254
229 /* If we filter on pids, update to use the pid function */ 255 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 256 if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 363 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 364 return -EINVAL;
339 365
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 366#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
341 /* 367 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 368 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 369 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
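
Note: the check above now keys off CONFIG_DYNAMIC_FTRACE_WITH_REGS rather than the old per-arch define. For a callback that would like pt_regs but must still register on architectures without register saving, the usual shape is the IF_SUPPORTED variant sketched below; the callback and ops names are invented for illustration.

    static void my_callback(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *op, struct pt_regs *regs)
    {
        if (!regs)
            return;     /* the arch could not save a full pt_regs */
        /* inspect the saved registers here */
    }

    static struct ftrace_ops my_ops = {
        .func   = my_callback,
        .flags  = FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED,
    };

    /* register_ftrace_function(&my_ops) then succeeds on any architecture,
     * whereas SAVE_REGS alone would be rejected by the check above. */
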
@@ -736,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
736{ 762{
737 struct ftrace_profile *rec; 763 struct ftrace_profile *rec;
738 struct hlist_head *hhd; 764 struct hlist_head *hhd;
739 struct hlist_node *n;
740 unsigned long key; 765 unsigned long key;
741 766
742 key = hash_long(ip, ftrace_profile_bits); 767 key = hash_long(ip, ftrace_profile_bits);
@@ -745,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
745 if (hlist_empty(hhd)) 770 if (hlist_empty(hhd))
746 return NULL; 771 return NULL;
747 772
748 hlist_for_each_entry_rcu(rec, n, hhd, node) { 773 hlist_for_each_entry_rcu(rec, hhd, node) {
749 if (rec->ip == ip) 774 if (rec->ip == ip)
750 return rec; 775 return rec;
751 } 776 }
@@ -1107,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1107 unsigned long key; 1132 unsigned long key;
1108 struct ftrace_func_entry *entry; 1133 struct ftrace_func_entry *entry;
1109 struct hlist_head *hhd; 1134 struct hlist_head *hhd;
1110 struct hlist_node *n;
1111 1135
1112 if (ftrace_hash_empty(hash)) 1136 if (ftrace_hash_empty(hash))
1113 return NULL; 1137 return NULL;
@@ -1119,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1119 1143
1120 hhd = &hash->buckets[key]; 1144 hhd = &hash->buckets[key];
1121 1145
1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) { 1146 hlist_for_each_entry_rcu(entry, hhd, hlist) {
1123 if (entry->ip == ip) 1147 if (entry->ip == ip)
1124 return entry; 1148 return entry;
1125 } 1149 }
@@ -1176,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash,
1176static void ftrace_hash_clear(struct ftrace_hash *hash) 1200static void ftrace_hash_clear(struct ftrace_hash *hash)
1177{ 1201{
1178 struct hlist_head *hhd; 1202 struct hlist_head *hhd;
1179 struct hlist_node *tp, *tn; 1203 struct hlist_node *tn;
1180 struct ftrace_func_entry *entry; 1204 struct ftrace_func_entry *entry;
1181 int size = 1 << hash->size_bits; 1205 int size = 1 << hash->size_bits;
1182 int i; 1206 int i;
@@ -1186,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
1186 1210
1187 for (i = 0; i < size; i++) { 1211 for (i = 0; i < size; i++) {
1188 hhd = &hash->buckets[i]; 1212 hhd = &hash->buckets[i];
1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) 1213 hlist_for_each_entry_safe(entry, tn, hhd, hlist)
1190 free_hash_entry(hash, entry); 1214 free_hash_entry(hash, entry);
1191 } 1215 }
1192 FTRACE_WARN_ON(hash->count); 1216 FTRACE_WARN_ON(hash->count);
@@ -1249,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1249{ 1273{
1250 struct ftrace_func_entry *entry; 1274 struct ftrace_func_entry *entry;
1251 struct ftrace_hash *new_hash; 1275 struct ftrace_hash *new_hash;
1252 struct hlist_node *tp;
1253 int size; 1276 int size;
1254 int ret; 1277 int ret;
1255 int i; 1278 int i;
@@ -1264,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1264 1287
1265 size = 1 << hash->size_bits; 1288 size = 1 << hash->size_bits;
1266 for (i = 0; i < size; i++) { 1289 for (i = 0; i < size; i++) {
1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { 1290 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
1268 ret = add_hash_entry(new_hash, entry->ip); 1291 ret = add_hash_entry(new_hash, entry->ip);
1269 if (ret < 0) 1292 if (ret < 0)
1270 goto free_hash; 1293 goto free_hash;
@@ -1290,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1290 struct ftrace_hash **dst, struct ftrace_hash *src) 1313 struct ftrace_hash **dst, struct ftrace_hash *src)
1291{ 1314{
1292 struct ftrace_func_entry *entry; 1315 struct ftrace_func_entry *entry;
1293 struct hlist_node *tp, *tn; 1316 struct hlist_node *tn;
1294 struct hlist_head *hhd; 1317 struct hlist_head *hhd;
1295 struct ftrace_hash *old_hash; 1318 struct ftrace_hash *old_hash;
1296 struct ftrace_hash *new_hash; 1319 struct ftrace_hash *new_hash;
@@ -1336,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1336 size = 1 << src->size_bits; 1359 size = 1 << src->size_bits;
1337 for (i = 0; i < size; i++) { 1360 for (i = 0; i < size; i++) {
1338 hhd = &src->buckets[i]; 1361 hhd = &src->buckets[i];
1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { 1362 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1340 if (bits > 0) 1363 if (bits > 0)
1341 key = hash_long(entry->ip, bits); 1364 key = hash_long(entry->ip, bits);
1342 else 1365 else
@@ -2875,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2875{ 2898{
2876 struct ftrace_func_probe *entry; 2899 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2900 struct hlist_head *hhd;
2878 struct hlist_node *n;
2879 unsigned long key; 2901 unsigned long key;
2880 2902
2881 key = hash_long(ip, FTRACE_HASH_BITS); 2903 key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2891,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2891 * on the hash. rcu_read_lock is too dangerous here. 2913 * on the hash. rcu_read_lock is too dangerous here.
2892 */ 2914 */
2893 preempt_disable_notrace(); 2915 preempt_disable_notrace();
2894 hlist_for_each_entry_rcu(entry, n, hhd, node) { 2916 hlist_for_each_entry_rcu(entry, hhd, node) {
2895 if (entry->ip == ip) 2917 if (entry->ip == ip)
2896 entry->ops->func(ip, parent_ip, &entry->data); 2918 entry->ops->func(ip, parent_ip, &entry->data);
2897 } 2919 }
@@ -3042,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3042 void *data, int flags) 3064 void *data, int flags)
3043{ 3065{
3044 struct ftrace_func_probe *entry; 3066 struct ftrace_func_probe *entry;
3045 struct hlist_node *n, *tmp; 3067 struct hlist_node *tmp;
3046 char str[KSYM_SYMBOL_LEN]; 3068 char str[KSYM_SYMBOL_LEN];
3047 int type = MATCH_FULL; 3069 int type = MATCH_FULL;
3048 int i, len = 0; 3070 int i, len = 0;
@@ -3065,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3087 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3066 struct hlist_head *hhd = &ftrace_func_hash[i]; 3088 struct hlist_head *hhd = &ftrace_func_hash[i];
3067 3089
3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { 3090 hlist_for_each_entry_safe(entry, tmp, hhd, node) {
3069 3091
3070 /* break up if statements for readability */ 3092 /* break up if statements for readability */
3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) 3093 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
@@ -3970,37 +3992,51 @@ static void ftrace_init_module(struct module *mod,
3970 ftrace_process_locs(mod, start, end); 3992 ftrace_process_locs(mod, start, end);
3971} 3993}
3972 3994
3973static int ftrace_module_notify(struct notifier_block *self, 3995static int ftrace_module_notify_enter(struct notifier_block *self,
3974 unsigned long val, void *data) 3996 unsigned long val, void *data)
3975{ 3997{
3976 struct module *mod = data; 3998 struct module *mod = data;
3977 3999
3978 switch (val) { 4000 if (val == MODULE_STATE_COMING)
3979 case MODULE_STATE_COMING:
3980 ftrace_init_module(mod, mod->ftrace_callsites, 4001 ftrace_init_module(mod, mod->ftrace_callsites,
3981 mod->ftrace_callsites + 4002 mod->ftrace_callsites +
3982 mod->num_ftrace_callsites); 4003 mod->num_ftrace_callsites);
3983 break; 4004 return 0;
3984 case MODULE_STATE_GOING: 4005}
4006
4007static int ftrace_module_notify_exit(struct notifier_block *self,
4008 unsigned long val, void *data)
4009{
4010 struct module *mod = data;
4011
4012 if (val == MODULE_STATE_GOING)
3985 ftrace_release_mod(mod); 4013 ftrace_release_mod(mod);
3986 break;
3987 }
3988 4014
3989 return 0; 4015 return 0;
3990} 4016}
3991#else 4017#else
3992static int ftrace_module_notify(struct notifier_block *self, 4018static int ftrace_module_notify_enter(struct notifier_block *self,
3993 unsigned long val, void *data) 4019 unsigned long val, void *data)
4020{
4021 return 0;
4022}
4023static int ftrace_module_notify_exit(struct notifier_block *self,
4024 unsigned long val, void *data)
3994{ 4025{
3995 return 0; 4026 return 0;
3996} 4027}
3997#endif /* CONFIG_MODULES */ 4028#endif /* CONFIG_MODULES */
3998 4029
3999struct notifier_block ftrace_module_nb = { 4030struct notifier_block ftrace_module_enter_nb = {
4000 .notifier_call = ftrace_module_notify, 4031 .notifier_call = ftrace_module_notify_enter,
4001 .priority = INT_MAX, /* Run before anything that can use kprobes */ 4032 .priority = INT_MAX, /* Run before anything that can use kprobes */
4002}; 4033};
4003 4034
4035struct notifier_block ftrace_module_exit_nb = {
4036 .notifier_call = ftrace_module_notify_exit,
4037 .priority = INT_MIN, /* Run after anything that can remove kprobes */
4038};
4039
4004extern unsigned long __start_mcount_loc[]; 4040extern unsigned long __start_mcount_loc[];
4005extern unsigned long __stop_mcount_loc[]; 4041extern unsigned long __stop_mcount_loc[];
4006 4042
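
Note: the split above exists because a notifier block carries a single priority — call-site initialization has to run before any COMING-state consumer that might plant kprobes, while teardown has to run after every GOING-state consumer has removed them, so the two halves need opposite extremes. A condensed sketch of the resulting pattern (handler and block names are placeholders):

    #include <linux/module.h>
    #include <linux/notifier.h>

    static int my_enter(struct notifier_block *nb, unsigned long val, void *data)
    {
        if (val == MODULE_STATE_COMING) {
            /* initialize the incoming module's call sites here */
        }
        return 0;
    }

    static int my_exit(struct notifier_block *nb, unsigned long val, void *data)
    {
        if (val == MODULE_STATE_GOING) {
            /* release the departing module's call sites here */
        }
        return 0;
    }

    static struct notifier_block my_enter_nb = {
        .notifier_call  = my_enter,
        .priority       = INT_MAX,      /* first on the chain */
    };

    static struct notifier_block my_exit_nb = {
        .notifier_call  = my_exit,
        .priority       = INT_MIN,      /* last on the chain */
    };

    /* both would be registered with register_module_notifier() at init time */
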
@@ -4032,9 +4068,13 @@ void __init ftrace_init(void)
4032 __start_mcount_loc, 4068 __start_mcount_loc,
4033 __stop_mcount_loc); 4069 __stop_mcount_loc);
4034 4070
4035 ret = register_module_notifier(&ftrace_module_nb); 4071 ret = register_module_notifier(&ftrace_module_enter_nb);
4036 if (ret) 4072 if (ret)
4037 pr_warning("Failed to register trace ftrace module notifier\n"); 4073 pr_warning("Failed to register trace ftrace module enter notifier\n");
4074
4075 ret = register_module_notifier(&ftrace_module_exit_nb);
4076 if (ret)
4077 pr_warning("Failed to register trace ftrace module exit notifier\n");
4038 4078
4039 set_ftrace_early_filters(); 4079 set_ftrace_early_filters();
4040 4080
@@ -4090,14 +4130,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4090 */ 4130 */
4091 preempt_disable_notrace(); 4131 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4132 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4133 do_for_each_ftrace_op(op, ftrace_control_list) {
4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) && 4134 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4135 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4136 op->func(ip, parent_ip, op, regs);
4098 4137 } while_for_each_ftrace_op(op);
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4138 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4139 preempt_enable_notrace();
4103} 4140}
@@ -4112,27 +4149,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4149 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 4150{
4114 struct ftrace_ops *op; 4151 struct ftrace_ops *op;
4152 int bit;
4115 4153
4116 if (function_trace_stop) 4154 if (function_trace_stop)
4117 return; 4155 return;
4118 4156
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4157 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4158 if (bit < 0)
4120 return; 4159 return;
4121 4160
4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4161 /*
4124 * Some of the ops may be dynamically allocated, 4162 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4163 * they must be freed after a synchronize_sched().
4126 */ 4164 */
4127 preempt_disable_notrace(); 4165 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4166 do_for_each_ftrace_op(op, ftrace_ops_list) {
4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4167 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4168 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4169 } while_for_each_ftrace_op(op);
4133 };
4134 preempt_enable_notrace(); 4170 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4171 trace_clear_recursion(bit);
4136} 4172}
4137 4173
4138/* 4174/*
@@ -4143,8 +4179,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4179 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4180 * If they support ftrace_ops, it is assumed they support regs.
4145 * If call backs want to use regs, they must either check for regs 4181 * If call backs want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4182 * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
4147 * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. 4183 * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4184 * An architecture can pass partial regs with ftrace_ops and still
4149 * set the ARCH_SUPPORTS_FTRACE_OPS. 4185 * set the ARCH_SUPPORTS_FTRACE_OPS.
4150 */ 4186 */
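
Note: the ftrace.c conversions above replace the single TRACE_INTERNAL_BIT/TRACE_GLOBAL_BIT test-and-set with a ranged helper that reports which bit it claimed. The calling pattern the patch establishes looks like the sketch below; TRACE_MY_START/TRACE_MY_MAX stand in for whichever bit range a particular caller owns.

        int bit;

        bit = trace_test_and_set_recursion(TRACE_MY_START, TRACE_MY_MAX);
        if (bit < 0)
            return;                 /* already recursing at this level: bail out */

        /* ... the work that must not re-enter itself ... */

        trace_clear_recursion(bit); /* release exactly the bit we claimed */
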
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
20 17
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedcd..6989df2ba194 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
3 * 3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ftrace_event.h>
6#include <linux/ring_buffer.h> 7#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/debugfs.h> 11#include <linux/debugfs.h>
10#include <linux/uaccess.h> 12#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
21#include <linux/fs.h> 23#include <linux/fs.h>
22 24
23#include <asm/local.h> 25#include <asm/local.h>
24#include "trace.h"
25 26
26static void update_pages_handler(struct work_struct *work); 27static void update_pages_handler(struct work_struct *work);
27 28
@@ -177,7 +178,7 @@ void tracing_off_permanent(void)
177#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 178#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
178#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 179#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
179 180
180#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 181#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
181# define RB_FORCE_8BYTE_ALIGNMENT 0 182# define RB_FORCE_8BYTE_ALIGNMENT 0
182# define RB_ARCH_ALIGNMENT RB_ALIGNMENT 183# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
183#else 184#else
@@ -185,6 +186,8 @@ void tracing_off_permanent(void)
185# define RB_ARCH_ALIGNMENT 8U 186# define RB_ARCH_ALIGNMENT 8U
186#endif 187#endif
187 188
189#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
190
188/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 191/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
189#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 192#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
190 193
@@ -333,7 +336,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
333struct buffer_data_page { 336struct buffer_data_page {
334 u64 time_stamp; /* page time stamp */ 337 u64 time_stamp; /* page time stamp */
335 local_t commit; /* write committed index */ 338 local_t commit; /* write committed index */
336 unsigned char data[]; /* data of buffer page */ 339 unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
337}; 340};
338 341
339/* 342/*
@@ -2432,41 +2435,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2432 2435
2433#ifdef CONFIG_TRACING 2436#ifdef CONFIG_TRACING
2434 2437
2435#define TRACE_RECURSIVE_DEPTH 16 2438/*
2439 * The lock and unlock are done within a preempt disable section.
2440 * The current_context per_cpu variable can only be modified
2441 * by the current task between lock and unlock. But it can
2442 * be modified more than once via an interrupt. To pass this
2443 * information from the lock to the unlock without having to
2444 * access the 'in_interrupt()' functions again (which do show
2445 * a bit of overhead in something as critical as function tracing,
2446 * we use a bitmask trick.
2447 *
2448 * bit 0 = NMI context
2449 * bit 1 = IRQ context
2450 * bit 2 = SoftIRQ context
2451 * bit 3 = normal context.
2452 *
2453 * This works because this is the order of contexts that can
2454 * preempt other contexts. A SoftIRQ never preempts an IRQ
2455 * context.
2456 *
2457 * When the context is determined, the corresponding bit is
2458 * checked and set (if it was set, then a recursion of that context
2459 * happened).
2460 *
2461 * On unlock, we need to clear this bit. To do so, just subtract
2462 * 1 from the current_context and AND it to itself.
2463 *
2464 * (binary)
2465 * 101 - 1 = 100
2466 * 101 & 100 = 100 (clearing bit zero)
2467 *
2468 * 1010 - 1 = 1001
2469 * 1010 & 1001 = 1000 (clearing bit 1)
2470 *
2471 * The least significant bit can be cleared this way, and it
2472 * just so happens that it is the same bit corresponding to
2473 * the current context.
2474 */
2475static DEFINE_PER_CPU(unsigned int, current_context);
2436 2476
2437/* Keep this code out of the fast path cache */ 2477static __always_inline int trace_recursive_lock(void)
2438static noinline void trace_recursive_fail(void)
2439{ 2478{
2440 /* Disable all tracing before we do anything else */ 2479 unsigned int val = this_cpu_read(current_context);
2441 tracing_off_permanent(); 2480 int bit;
2442
2443 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2444 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2445 trace_recursion_buffer(),
2446 hardirq_count() >> HARDIRQ_SHIFT,
2447 softirq_count() >> SOFTIRQ_SHIFT,
2448 in_nmi());
2449
2450 WARN_ON_ONCE(1);
2451}
2452 2481
2453static inline int trace_recursive_lock(void) 2482 if (in_interrupt()) {
2454{ 2483 if (in_nmi())
2455 trace_recursion_inc(); 2484 bit = 0;
2485 else if (in_irq())
2486 bit = 1;
2487 else
2488 bit = 2;
2489 } else
2490 bit = 3;
2456 2491
2457 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) 2492 if (unlikely(val & (1 << bit)))
2458 return 0; 2493 return 1;
2459 2494
2460 trace_recursive_fail(); 2495 val |= (1 << bit);
2496 this_cpu_write(current_context, val);
2461 2497
2462 return -1; 2498 return 0;
2463} 2499}
2464 2500
2465static inline void trace_recursive_unlock(void) 2501static __always_inline void trace_recursive_unlock(void)
2466{ 2502{
2467 WARN_ON_ONCE(!trace_recursion_buffer()); 2503 unsigned int val = this_cpu_read(current_context);
2468 2504
2469 trace_recursion_dec(); 2505 val--;
2506 val &= this_cpu_read(current_context);
2507 this_cpu_write(current_context, val);
2470} 2508}
2471 2509
2472#else 2510#else
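
Note: because only the locking task changes its current_context between the lock and the matching unlock, the unlock's "val--; val &= this_cpu_read(current_context)" sequence reduces to the classic val & (val - 1) idiom, which clears exactly the lowest set bit — the bit for the context being left. A standalone check of that identity (ordinary userspace C, not kernel code):

    #include <assert.h>

    int main(void)
    {
        unsigned int val = 0xb;     /* 1011: NMI, IRQ and normal-context bits set */

        val &= val - 1;             /* clear the lowest set bit */
        assert(val == 0xa);         /* 1010: only the NMI bit (bit 0) was dropped */
        return 0;
    }
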
@@ -3067,6 +3105,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); 3105EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068 3106
3069/** 3107/**
3108 * ring_buffer_read_events_cpu - get the number of events successfully read
3109 * @buffer: The ring buffer
3110 * @cpu: The per CPU buffer to get the number of events read
3111 */
3112unsigned long
3113ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
3114{
3115 struct ring_buffer_per_cpu *cpu_buffer;
3116
3117 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3118 return 0;
3119
3120 cpu_buffer = buffer->buffers[cpu];
3121 return cpu_buffer->read;
3122}
3123EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
3124
3125/**
3070 * ring_buffer_entries - get the number of entries in a buffer 3126 * ring_buffer_entries - get the number of entries in a buffer
3071 * @buffer: The ring buffer 3127 * @buffer: The ring buffer
3072 * 3128 *
@@ -3425,7 +3481,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3425 /* check for end of page padding */ 3481 /* check for end of page padding */
3426 if ((iter->head >= rb_page_size(iter->head_page)) && 3482 if ((iter->head >= rb_page_size(iter->head_page)) &&
3427 (iter->head_page != cpu_buffer->commit_page)) 3483 (iter->head_page != cpu_buffer->commit_page))
3428 rb_advance_iter(iter); 3484 rb_inc_iter(iter);
3429} 3485}
3430 3486
3431static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) 3487static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46d7d24..1f835a83cb2c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
39#include <linux/poll.h> 39#include <linux/poll.h>
40#include <linux/nmi.h> 40#include <linux/nmi.h>
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/sched/rt.h>
42 43
43#include "trace.h" 44#include "trace.h"
44#include "trace_output.h" 45#include "trace_output.h"
@@ -249,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249static struct tracer *trace_types __read_mostly; 250static struct tracer *trace_types __read_mostly;
250 251
251/* current_trace points to the tracer that is currently active */ 252/* current_trace points to the tracer that is currently active */
252static struct tracer *current_trace __read_mostly; 253static struct tracer *current_trace __read_mostly = &nop_trace;
253 254
254/* 255/*
255 * trace_types_lock is used to protect the trace_types list. 256 * trace_types_lock is used to protect the trace_types list.
@@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
709 return; 710 return;
710 711
711 WARN_ON_ONCE(!irqs_disabled()); 712 WARN_ON_ONCE(!irqs_disabled());
712 if (!current_trace->use_max_tr) { 713
713 WARN_ON_ONCE(1); 714 if (!current_trace->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace);
714 return; 717 return;
715 } 718 }
719
716 arch_spin_lock(&ftrace_max_lock); 720 arch_spin_lock(&ftrace_max_lock);
717 721
718 tr->buffer = max_tr.buffer; 722 tr->buffer = max_tr.buffer;
@@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
739 return; 743 return;
740 744
741 WARN_ON_ONCE(!irqs_disabled()); 745 WARN_ON_ONCE(!irqs_disabled());
742 if (!current_trace->use_max_tr) { 746 if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
743 WARN_ON_ONCE(1);
744 return; 747 return;
745 }
746 748
747 arch_spin_lock(&ftrace_max_lock); 749 arch_spin_lock(&ftrace_max_lock);
748 750
@@ -862,10 +864,13 @@ int register_tracer(struct tracer *type)
862 864
863 current_trace = type; 865 current_trace = type;
864 866
865 /* If we expanded the buffers, make sure the max is expanded too */ 867 if (type->use_max_tr) {
866 if (ring_buffer_expanded && type->use_max_tr) 868 /* If we expanded the buffers, make sure the max is expanded too */
867 ring_buffer_resize(max_tr.buffer, trace_buf_size, 869 if (ring_buffer_expanded)
868 RING_BUFFER_ALL_CPUS); 870 ring_buffer_resize(max_tr.buffer, trace_buf_size,
871 RING_BUFFER_ALL_CPUS);
872 type->allocated_snapshot = true;
873 }
869 874
870 /* the test is responsible for initializing and enabling */ 875 /* the test is responsible for initializing and enabling */
871 pr_info("Testing tracer %s: ", type->name); 876 pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +886,14 @@ int register_tracer(struct tracer *type)
881 /* Only reset on passing, to avoid touching corrupted buffers */ 886 /* Only reset on passing, to avoid touching corrupted buffers */
882 tracing_reset_online_cpus(tr); 887 tracing_reset_online_cpus(tr);
883 888
884 /* Shrink the max buffer again */ 889 if (type->use_max_tr) {
885 if (ring_buffer_expanded && type->use_max_tr) 890 type->allocated_snapshot = false;
886 ring_buffer_resize(max_tr.buffer, 1, 891
887 RING_BUFFER_ALL_CPUS); 892 /* Shrink the max buffer again */
893 if (ring_buffer_expanded)
894 ring_buffer_resize(max_tr.buffer, 1,
895 RING_BUFFER_ALL_CPUS);
896 }
888 897
889 printk(KERN_CONT "PASSED\n"); 898 printk(KERN_CONT "PASSED\n");
890 } 899 }
@@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
922{ 931{
923 struct ring_buffer *buffer = tr->buffer; 932 struct ring_buffer *buffer = tr->buffer;
924 933
934 if (!buffer)
935 return;
936
925 ring_buffer_record_disable(buffer); 937 ring_buffer_record_disable(buffer);
926 938
927 /* Make sure all commits have finished */ 939 /* Make sure all commits have finished */
@@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
936 struct ring_buffer *buffer = tr->buffer; 948 struct ring_buffer *buffer = tr->buffer;
937 int cpu; 949 int cpu;
938 950
951 if (!buffer)
952 return;
953
939 ring_buffer_record_disable(buffer); 954 ring_buffer_record_disable(buffer);
940 955
941 /* Make sure all commits have finished */ 956 /* Make sure all commits have finished */
@@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1167 1182
1168 entry->preempt_count = pc & 0xff; 1183 entry->preempt_count = pc & 0xff;
1169 entry->pid = (tsk) ? tsk->pid : 0; 1184 entry->pid = (tsk) ? tsk->pid : 0;
1170 entry->padding = 0;
1171 entry->flags = 1185 entry->flags =
1172#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1186#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1173 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1187 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1335 */ 1349 */
1336 preempt_disable_notrace(); 1350 preempt_disable_notrace();
1337 1351
1338 use_stack = ++__get_cpu_var(ftrace_stack_reserve); 1352 use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
1339 /* 1353 /*
1340 * We don't need any atomic variables, just a barrier. 1354 * We don't need any atomic variables, just a barrier.
1341 * If an interrupt comes in, we don't care, because it would 1355 * If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1389 out: 1403 out:
1390 /* Again, don't let gcc optimize things here */ 1404 /* Again, don't let gcc optimize things here */
1391 barrier(); 1405 barrier();
1392 __get_cpu_var(ftrace_stack_reserve)--; 1406 __this_cpu_dec(ftrace_stack_reserve);
1393 preempt_enable_notrace(); 1407 preempt_enable_notrace();
1394 1408
1395} 1409}
@@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1517static char *get_trace_buf(void) 1531static char *get_trace_buf(void)
1518{ 1532{
1519 struct trace_buffer_struct *percpu_buffer; 1533 struct trace_buffer_struct *percpu_buffer;
1520 struct trace_buffer_struct *buffer;
1521 1534
1522 /* 1535 /*
1523 * If we have allocated per cpu buffers, then we do not 1536 * If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1548,7 @@ static char *get_trace_buf(void)
1535 if (!percpu_buffer) 1548 if (!percpu_buffer)
1536 return NULL; 1549 return NULL;
1537 1550
1538 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); 1551 return this_cpu_ptr(&percpu_buffer->buffer[0]);
1539
1540 return buffer->buffer;
1541} 1552}
1542 1553
1543static int alloc_percpu_trace_buffer(void) 1554static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1942static void *s_start(struct seq_file *m, loff_t *pos) 1953static void *s_start(struct seq_file *m, loff_t *pos)
1943{ 1954{
1944 struct trace_iterator *iter = m->private; 1955 struct trace_iterator *iter = m->private;
1945 static struct tracer *old_tracer;
1946 int cpu_file = iter->cpu_file; 1956 int cpu_file = iter->cpu_file;
1947 void *p = NULL; 1957 void *p = NULL;
1948 loff_t l = 0; 1958 loff_t l = 0;
1949 int cpu; 1959 int cpu;
1950 1960
1951 /* copy the tracer to avoid using a global lock all around */ 1961 /*
1962 * copy the tracer to avoid using a global lock all around.
1963 * iter->trace is a copy of current_trace, the pointer to the
1964 * name may be used instead of a strcmp(), as iter->trace->name
1965 * will point to the same string as current_trace->name.
1966 */
1952 mutex_lock(&trace_types_lock); 1967 mutex_lock(&trace_types_lock);
1953 if (unlikely(old_tracer != current_trace && current_trace)) { 1968 if (unlikely(current_trace && iter->trace->name != current_trace->name))
1954 old_tracer = current_trace;
1955 *iter->trace = *current_trace; 1969 *iter->trace = *current_trace;
1956 }
1957 mutex_unlock(&trace_types_lock); 1970 mutex_unlock(&trace_types_lock);
1958 1971
1959 atomic_inc(&trace_record_cmdline_disabled); 1972 if (iter->snapshot && iter->trace->use_max_tr)
1973 return ERR_PTR(-EBUSY);
1974
1975 if (!iter->snapshot)
1976 atomic_inc(&trace_record_cmdline_disabled);
1960 1977
1961 if (*pos != iter->pos) { 1978 if (*pos != iter->pos) {
1962 iter->ent = NULL; 1979 iter->ent = NULL;
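
Note: the name-pointer comparison above is safe because *iter->trace = *current_trace is a struct copy, so the copied ->name keeps pointing at the registered tracer's string until a different tracer is installed; testing pointer equality is therefore as good as strcmp() here. A tiny standalone illustration of that aliasing (plain C, not the tracing structures):

    #include <assert.h>

    struct tracer_like {
        const char *name;
    };

    int main(void)
    {
        static struct tracer_like nop = { .name = "nop" };
        struct tracer_like copy = nop;  /* as in *iter->trace = *current_trace */

        assert(copy.name == nop.name);  /* same pointer, no strcmp() needed */
        return 0;
    }
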
@@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p)
1995{ 2012{
1996 struct trace_iterator *iter = m->private; 2013 struct trace_iterator *iter = m->private;
1997 2014
1998 atomic_dec(&trace_record_cmdline_disabled); 2015 if (iter->snapshot && iter->trace->use_max_tr)
2016 return;
2017
2018 if (!iter->snapshot)
2019 atomic_dec(&trace_record_cmdline_disabled);
1999 trace_access_unlock(iter->cpu_file); 2020 trace_access_unlock(iter->cpu_file);
2000 trace_event_read_unlock(); 2021 trace_event_read_unlock();
2001} 2022}
@@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2080 unsigned long total; 2101 unsigned long total;
2081 const char *name = "preemption"; 2102 const char *name = "preemption";
2082 2103
2083 if (type) 2104 name = type->name;
2084 name = type->name;
2085 2105
2086 get_total_entries(tr, &total, &entries); 2106 get_total_entries(tr, &total, &entries);
2087 2107
@@ -2380,6 +2400,27 @@ static void test_ftrace_alive(struct seq_file *m)
2380 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2400 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2381} 2401}
2382 2402
2403#ifdef CONFIG_TRACER_MAX_TRACE
2404static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2405{
2406 if (iter->trace->allocated_snapshot)
2407 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2408 else
2409 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2410
2411 seq_printf(m, "# Snapshot commands:\n");
2412 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2413 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2414 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2415 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
2416 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2417 seq_printf(m, "# is not a '0' or '1')\n");
2418}
2419#else
2420/* Should never be called */
2421static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
2422#endif
2423
2383static int s_show(struct seq_file *m, void *v) 2424static int s_show(struct seq_file *m, void *v)
2384{ 2425{
2385 struct trace_iterator *iter = v; 2426 struct trace_iterator *iter = v;
@@ -2391,7 +2432,9 @@ static int s_show(struct seq_file *m, void *v)
2391 seq_puts(m, "#\n"); 2432 seq_puts(m, "#\n");
2392 test_ftrace_alive(m); 2433 test_ftrace_alive(m);
2393 } 2434 }
2394 if (iter->trace && iter->trace->print_header) 2435 if (iter->snapshot && trace_empty(iter))
2436 print_snapshot_help(m, iter);
2437 else if (iter->trace && iter->trace->print_header)
2395 iter->trace->print_header(m); 2438 iter->trace->print_header(m);
2396 else 2439 else
2397 trace_default_header(m); 2440 trace_default_header(m);
@@ -2430,7 +2473,7 @@ static const struct seq_operations tracer_seq_ops = {
2430}; 2473};
2431 2474
2432static struct trace_iterator * 2475static struct trace_iterator *
2433__tracing_open(struct inode *inode, struct file *file) 2476__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2434{ 2477{
2435 long cpu_file = (long) inode->i_private; 2478 long cpu_file = (long) inode->i_private;
2436 struct trace_iterator *iter; 2479 struct trace_iterator *iter;
@@ -2457,16 +2500,16 @@ __tracing_open(struct inode *inode, struct file *file)
2457 if (!iter->trace) 2500 if (!iter->trace)
2458 goto fail; 2501 goto fail;
2459 2502
2460 if (current_trace) 2503 *iter->trace = *current_trace;
2461 *iter->trace = *current_trace;
2462 2504
2463 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2505 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2464 goto fail; 2506 goto fail;
2465 2507
2466 if (current_trace && current_trace->print_max) 2508 if (current_trace->print_max || snapshot)
2467 iter->tr = &max_tr; 2509 iter->tr = &max_tr;
2468 else 2510 else
2469 iter->tr = &global_trace; 2511 iter->tr = &global_trace;
2512 iter->snapshot = snapshot;
2470 iter->pos = -1; 2513 iter->pos = -1;
2471 mutex_init(&iter->mutex); 2514 mutex_init(&iter->mutex);
2472 iter->cpu_file = cpu_file; 2515 iter->cpu_file = cpu_file;
@@ -2483,8 +2526,9 @@ __tracing_open(struct inode *inode, struct file *file)
2483 if (trace_clocks[trace_clock_id].in_ns) 2526 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 2527 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485 2528
2486 /* stop the trace while dumping */ 2529 /* stop the trace while dumping if we are not opening "snapshot" */
2487 tracing_stop(); 2530 if (!iter->snapshot)
2531 tracing_stop();
2488 2532
2489 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2533 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2490 for_each_tracing_cpu(cpu) { 2534 for_each_tracing_cpu(cpu) {
@@ -2547,8 +2591,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2547 if (iter->trace && iter->trace->close) 2591 if (iter->trace && iter->trace->close)
2548 iter->trace->close(iter); 2592 iter->trace->close(iter);
2549 2593
2550 /* reenable tracing if it was previously enabled */ 2594 if (!iter->snapshot)
2551 tracing_start(); 2595 /* reenable tracing if it was previously enabled */
2596 tracing_start();
2552 mutex_unlock(&trace_types_lock); 2597 mutex_unlock(&trace_types_lock);
2553 2598
2554 mutex_destroy(&iter->mutex); 2599 mutex_destroy(&iter->mutex);
@@ -2576,7 +2621,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2576 } 2621 }
2577 2622
2578 if (file->f_mode & FMODE_READ) { 2623 if (file->f_mode & FMODE_READ) {
2579 iter = __tracing_open(inode, file); 2624 iter = __tracing_open(inode, file, false);
2580 if (IS_ERR(iter)) 2625 if (IS_ERR(iter))
2581 ret = PTR_ERR(iter); 2626 ret = PTR_ERR(iter);
2582 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 2627 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -3014,10 +3059,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3014 int r; 3059 int r;
3015 3060
3016 mutex_lock(&trace_types_lock); 3061 mutex_lock(&trace_types_lock);
3017 if (current_trace) 3062 r = sprintf(buf, "%s\n", current_trace->name);
3018 r = sprintf(buf, "%s\n", current_trace->name);
3019 else
3020 r = sprintf(buf, "\n");
3021 mutex_unlock(&trace_types_lock); 3063 mutex_unlock(&trace_types_lock);
3022 3064
3023 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3065 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3183,6 +3225,7 @@ static int tracing_set_tracer(const char *buf)
3183 static struct trace_option_dentry *topts; 3225 static struct trace_option_dentry *topts;
3184 struct trace_array *tr = &global_trace; 3226 struct trace_array *tr = &global_trace;
3185 struct tracer *t; 3227 struct tracer *t;
3228 bool had_max_tr;
3186 int ret = 0; 3229 int ret = 0;
3187 3230
3188 mutex_lock(&trace_types_lock); 3231 mutex_lock(&trace_types_lock);
@@ -3207,9 +3250,21 @@ static int tracing_set_tracer(const char *buf)
3207 goto out; 3250 goto out;
3208 3251
3209 trace_branch_disable(); 3252 trace_branch_disable();
3210 if (current_trace && current_trace->reset) 3253 if (current_trace->reset)
3211 current_trace->reset(tr); 3254 current_trace->reset(tr);
3212 if (current_trace && current_trace->use_max_tr) { 3255
3256 had_max_tr = current_trace->allocated_snapshot;
3257 current_trace = &nop_trace;
3258
3259 if (had_max_tr && !t->use_max_tr) {
3260 /*
3261 * We need to make sure that the update_max_tr sees that
3262 * current_trace changed to nop_trace to keep it from
3263 * swapping the buffers after we resize it.
3264 * The update_max_tr is called from interrupts disabled
3265 * so a synchronize_sched() is sufficient.
3266 */
3267 synchronize_sched();
3213 /* 3268 /*
3214 * We don't free the ring buffer. instead, resize it because 3269 * We don't free the ring buffer. instead, resize it because
3215 * The max_tr ring buffer has some state (e.g. ring->clock) and 3270 * The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3217,18 +3272,19 @@ static int tracing_set_tracer(const char *buf)
3217 */ 3272 */
3218 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3273 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3219 set_buffer_entries(&max_tr, 1); 3274 set_buffer_entries(&max_tr, 1);
3275 tracing_reset_online_cpus(&max_tr);
3276 current_trace->allocated_snapshot = false;
3220 } 3277 }
3221 destroy_trace_option_files(topts); 3278 destroy_trace_option_files(topts);
3222 3279
3223 current_trace = &nop_trace;
3224
3225 topts = create_trace_option_files(t); 3280 topts = create_trace_option_files(t);
3226 if (t->use_max_tr) { 3281 if (t->use_max_tr && !had_max_tr) {
3227 /* we need to make per cpu buffer sizes equivalent */ 3282 /* we need to make per cpu buffer sizes equivalent */
3228 ret = resize_buffer_duplicate_size(&max_tr, &global_trace, 3283 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3229 RING_BUFFER_ALL_CPUS); 3284 RING_BUFFER_ALL_CPUS);
3230 if (ret < 0) 3285 if (ret < 0)
3231 goto out; 3286 goto out;
3287 t->allocated_snapshot = true;
3232 } 3288 }
3233 3289
3234 if (t->init) { 3290 if (t->init) {
@@ -3336,8 +3392,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3336 ret = -ENOMEM; 3392 ret = -ENOMEM;
3337 goto fail; 3393 goto fail;
3338 } 3394 }
3339 if (current_trace) 3395 *iter->trace = *current_trace;
3340 *iter->trace = *current_trace;
3341 3396
3342 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3397 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3343 ret = -ENOMEM; 3398 ret = -ENOMEM;
@@ -3477,7 +3532,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3477 size_t cnt, loff_t *ppos) 3532 size_t cnt, loff_t *ppos)
3478{ 3533{
3479 struct trace_iterator *iter = filp->private_data; 3534 struct trace_iterator *iter = filp->private_data;
3480 static struct tracer *old_tracer;
3481 ssize_t sret; 3535 ssize_t sret;
3482 3536
3483 /* return any leftover data */ 3537 /* return any leftover data */
@@ -3489,10 +3543,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3489 3543
3490 /* copy the tracer to avoid using a global lock all around */ 3544 /* copy the tracer to avoid using a global lock all around */
3491 mutex_lock(&trace_types_lock); 3545 mutex_lock(&trace_types_lock);
3492 if (unlikely(old_tracer != current_trace && current_trace)) { 3546 if (unlikely(iter->trace->name != current_trace->name))
3493 old_tracer = current_trace;
3494 *iter->trace = *current_trace; 3547 *iter->trace = *current_trace;
3495 }
3496 mutex_unlock(&trace_types_lock); 3548 mutex_unlock(&trace_types_lock);
3497 3549
3498 /* 3550 /*
@@ -3648,7 +3700,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3648 .ops = &tracing_pipe_buf_ops, 3700 .ops = &tracing_pipe_buf_ops,
3649 .spd_release = tracing_spd_release_pipe, 3701 .spd_release = tracing_spd_release_pipe,
3650 }; 3702 };
3651 static struct tracer *old_tracer;
3652 ssize_t ret; 3703 ssize_t ret;
3653 size_t rem; 3704 size_t rem;
3654 unsigned int i; 3705 unsigned int i;
@@ -3658,10 +3709,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3658 3709
3659 /* copy the tracer to avoid using a global lock all around */ 3710 /* copy the tracer to avoid using a global lock all around */
3660 mutex_lock(&trace_types_lock); 3711 mutex_lock(&trace_types_lock);
3661 if (unlikely(old_tracer != current_trace && current_trace)) { 3712 if (unlikely(iter->trace->name != current_trace->name))
3662 old_tracer = current_trace;
3663 *iter->trace = *current_trace; 3713 *iter->trace = *current_trace;
3664 }
3665 mutex_unlock(&trace_types_lock); 3714 mutex_unlock(&trace_types_lock);
3666 3715
3667 mutex_lock(&iter->mutex); 3716 mutex_lock(&iter->mutex);
@@ -4037,8 +4086,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4037 * Reset the buffer so that it doesn't have incomparable timestamps. 4086 * Reset the buffer so that it doesn't have incomparable timestamps.
4038 */ 4087 */
4039 tracing_reset_online_cpus(&global_trace); 4088 tracing_reset_online_cpus(&global_trace);
4040 if (max_tr.buffer) 4089 tracing_reset_online_cpus(&max_tr);
4041 tracing_reset_online_cpus(&max_tr);
4042 4090
4043 mutex_unlock(&trace_types_lock); 4091 mutex_unlock(&trace_types_lock);
4044 4092
@@ -4054,6 +4102,85 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4054 return single_open(file, tracing_clock_show, NULL); 4102 return single_open(file, tracing_clock_show, NULL);
4055} 4103}
4056 4104
4105#ifdef CONFIG_TRACER_SNAPSHOT
4106static int tracing_snapshot_open(struct inode *inode, struct file *file)
4107{
4108 struct trace_iterator *iter;
4109 int ret = 0;
4110
4111 if (file->f_mode & FMODE_READ) {
4112 iter = __tracing_open(inode, file, true);
4113 if (IS_ERR(iter))
4114 ret = PTR_ERR(iter);
4115 }
4116 return ret;
4117}
4118
4119static ssize_t
4120tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4121 loff_t *ppos)
4122{
4123 unsigned long val;
4124 int ret;
4125
4126 ret = tracing_update_buffers();
4127 if (ret < 0)
4128 return ret;
4129
4130 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4131 if (ret)
4132 return ret;
4133
4134 mutex_lock(&trace_types_lock);
4135
4136 if (current_trace->use_max_tr) {
4137 ret = -EBUSY;
4138 goto out;
4139 }
4140
4141 switch (val) {
4142 case 0:
4143 if (current_trace->allocated_snapshot) {
4144 /* free spare buffer */
4145 ring_buffer_resize(max_tr.buffer, 1,
4146 RING_BUFFER_ALL_CPUS);
4147 set_buffer_entries(&max_tr, 1);
4148 tracing_reset_online_cpus(&max_tr);
4149 current_trace->allocated_snapshot = false;
4150 }
4151 break;
4152 case 1:
4153 if (!current_trace->allocated_snapshot) {
4154 /* allocate spare buffer */
4155 ret = resize_buffer_duplicate_size(&max_tr,
4156 &global_trace, RING_BUFFER_ALL_CPUS);
4157 if (ret < 0)
4158 break;
4159 current_trace->allocated_snapshot = true;
4160 }
4161
4162 local_irq_disable();
4163 /* Now, we're going to swap */
4164 update_max_tr(&global_trace, current, smp_processor_id());
4165 local_irq_enable();
4166 break;
4167 default:
4168 if (current_trace->allocated_snapshot)
4169 tracing_reset_online_cpus(&max_tr);
4170 break;
4171 }
4172
4173 if (ret >= 0) {
4174 *ppos += cnt;
4175 ret = cnt;
4176 }
4177out:
4178 mutex_unlock(&trace_types_lock);
4179 return ret;
4180}
4181#endif /* CONFIG_TRACER_SNAPSHOT */
4182
4183
4057static const struct file_operations tracing_max_lat_fops = { 4184static const struct file_operations tracing_max_lat_fops = {
4058 .open = tracing_open_generic, 4185 .open = tracing_open_generic,
4059 .read = tracing_max_lat_read, 4186 .read = tracing_max_lat_read,
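
Note: the write handler above gives the new snapshot file three behaviours — '0' frees the spare buffer, '1' allocates it if needed and swaps in a snapshot, and any other value clears the spare buffer. A minimal userspace sketch of driving it follows; it assumes debugfs is mounted at /sys/kernel/debug (as in the Kconfig help text) and does only token error handling.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[4096];
        ssize_t n;
        int fd;

        fd = open("/sys/kernel/debug/tracing/snapshot", O_WRONLY);
        if (fd < 0)
            return 1;
        if (write(fd, "1", 1) != 1)     /* allocate (if needed) and take a snapshot */
            perror("snapshot write");
        close(fd);

        fd = open("/sys/kernel/debug/tracing/snapshot", O_RDONLY);
        if (fd < 0)
            return 1;
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            fwrite(buf, 1, n, stdout);  /* dump the frozen snapshot */
        close(fd);
        return 0;
    }
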
@@ -4110,6 +4237,16 @@ static const struct file_operations trace_clock_fops = {
4110 .write = tracing_clock_write, 4237 .write = tracing_clock_write,
4111}; 4238};
4112 4239
4240#ifdef CONFIG_TRACER_SNAPSHOT
4241static const struct file_operations snapshot_fops = {
4242 .open = tracing_snapshot_open,
4243 .read = seq_read,
4244 .write = tracing_snapshot_write,
4245 .llseek = tracing_seek,
4246 .release = tracing_release,
4247};
4248#endif /* CONFIG_TRACER_SNAPSHOT */
4249
4113struct ftrace_buffer_info { 4250struct ftrace_buffer_info {
4114 struct trace_array *tr; 4251 struct trace_array *tr;
4115 void *spare; 4252 void *spare;
@@ -4414,6 +4551,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4414 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 4551 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4415 trace_seq_printf(s, "dropped events: %ld\n", cnt); 4552 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4416 4553
4554 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
4555 trace_seq_printf(s, "read events: %ld\n", cnt);
4556
4417 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4557 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4418 4558
4419 kfree(s); 4559 kfree(s);
@@ -4490,7 +4630,7 @@ struct dentry *tracing_init_dentry(void)
4490 4630
4491static struct dentry *d_percpu; 4631static struct dentry *d_percpu;
4492 4632
4493struct dentry *tracing_dentry_percpu(void) 4633static struct dentry *tracing_dentry_percpu(void)
4494{ 4634{
4495 static int once; 4635 static int once;
4496 struct dentry *d_tracer; 4636 struct dentry *d_tracer;
@@ -4906,6 +5046,11 @@ static __init int tracer_init_debugfs(void)
4906 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5046 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4907#endif 5047#endif
4908 5048
5049#ifdef CONFIG_TRACER_SNAPSHOT
5050 trace_create_file("snapshot", 0644, d_tracer,
5051 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5052#endif
5053
4909 create_trace_options_dir(); 5054 create_trace_options_dir();
4910 5055
4911 for_each_tracing_cpu(cpu) 5056 for_each_tracing_cpu(cpu)
@@ -5014,6 +5159,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5014 if (disable_tracing) 5159 if (disable_tracing)
5015 ftrace_kill(); 5160 ftrace_kill();
5016 5161
5162 /* Simulate the iterator */
5017 trace_init_global_iter(&iter); 5163 trace_init_global_iter(&iter);
5018 5164
5019 for_each_tracing_cpu(cpu) { 5165 for_each_tracing_cpu(cpu) {
@@ -5025,10 +5171,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5025 /* don't look at user memory in panic mode */ 5171 /* don't look at user memory in panic mode */
5026 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 5172 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
5027 5173
5028 /* Simulate the iterator */
5029 iter.tr = &global_trace;
5030 iter.trace = current_trace;
5031
5032 switch (oops_dump_mode) { 5174 switch (oops_dump_mode) {
5033 case DUMP_ALL: 5175 case DUMP_ALL:
5034 iter.cpu_file = TRACE_PIPE_ALL_CPU; 5176 iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5173,7 +5315,7 @@ __init static int tracer_alloc_buffers(void)
5173 init_irq_work(&trace_work_wakeup, trace_wake_up); 5315 init_irq_work(&trace_work_wakeup, trace_wake_up);
5174 5316
5175 register_tracer(&nop_trace); 5317 register_tracer(&nop_trace);
5176 current_trace = &nop_trace; 5318
5177 /* All seems OK, enable tracing */ 5319 /* All seems OK, enable tracing */
5178 tracing_disabled = 0; 5320 tracing_disabled = 0;
5179 5321
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902c..57d7e5397d56 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@ struct tracer {
287 struct tracer_flags *flags; 287 struct tracer_flags *flags;
288 bool print_max; 288 bool print_max;
289 bool use_max_tr; 289 bool use_max_tr;
290 bool allocated_snapshot;
290}; 291};
291 292
292 293
293/* Only current can touch trace_recursion */ 294/* Only current can touch trace_recursion */
294#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
295#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
296 295
297/* Ring buffer has the 10 LSB bits to count */ 296/*
298#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) 297 * For function tracing recursion:
299                                                              298 * The order of these bits is important.
300/* for function tracing recursion */ 299 *
301#define TRACE_INTERNAL_BIT (1<<11) 300 * When function tracing occurs, the following steps are made:
302#define TRACE_GLOBAL_BIT (1<<12) 301 * If arch does not support a ftrace feature:
303#define TRACE_CONTROL_BIT (1<<13) 302 * call internal function (uses INTERNAL bits) which calls...
303 * If callback is registered to the "global" list, the list
304 * function is called and recursion checks the GLOBAL bits.
305 * then this function calls...
306 * The function callback, which can use the FTRACE bits to
307 * check for recursion.
308 *
309 * Now if the arch does not support a feature, and it calls
310 * the global list function which calls the ftrace callback
311 * all three of these steps will do a recursion protection.
312 * There's no reason to do one if the previous caller already
313 * did. The recursion that we are protecting against will
314 * go through the same steps again.
315 *
316 * To prevent the multiple recursion checks, if a recursion
317 * bit is set that is higher than the MAX bit of the current
318 * check, then we know that the check was made by the previous
319 * caller, and we can skip the current check.
320 */
321enum {
322 TRACE_BUFFER_BIT,
323 TRACE_BUFFER_NMI_BIT,
324 TRACE_BUFFER_IRQ_BIT,
325 TRACE_BUFFER_SIRQ_BIT,
326
327 /* Start of function recursion bits */
328 TRACE_FTRACE_BIT,
329 TRACE_FTRACE_NMI_BIT,
330 TRACE_FTRACE_IRQ_BIT,
331 TRACE_FTRACE_SIRQ_BIT,
332
333 /* GLOBAL_BITs must be greater than FTRACE_BITs */
334 TRACE_GLOBAL_BIT,
335 TRACE_GLOBAL_NMI_BIT,
336 TRACE_GLOBAL_IRQ_BIT,
337 TRACE_GLOBAL_SIRQ_BIT,
338
339 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
340 TRACE_INTERNAL_BIT,
341 TRACE_INTERNAL_NMI_BIT,
342 TRACE_INTERNAL_IRQ_BIT,
343 TRACE_INTERNAL_SIRQ_BIT,
344
345 TRACE_CONTROL_BIT,
304 346
305/* 347/*
306 * Abuse of the trace_recursion. 348 * Abuse of the trace_recursion.
@@ -309,11 +351,77 @@ struct tracer {
309 * was called in irq context but we have irq tracing off. Since this 351 * was called in irq context but we have irq tracing off. Since this
310 * can only be modified by current, we can reuse trace_recursion. 352 * can only be modified by current, we can reuse trace_recursion.
311 */ 353 */
312#define TRACE_IRQ_BIT (1<<13) 354 TRACE_IRQ_BIT,
355};
356
357#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
358#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
359#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
360
361#define TRACE_CONTEXT_BITS 4
362
363#define TRACE_FTRACE_START TRACE_FTRACE_BIT
364#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
365
366#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
367#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
368
369#define TRACE_LIST_START TRACE_INTERNAL_BIT
370#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
371
372#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
373
374static __always_inline int trace_get_context_bit(void)
375{
376 int bit;
313 377
314#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) 378 if (in_interrupt()) {
315#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) 379 if (in_nmi())
316#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) 380 bit = 0;
381
382 else if (in_irq())
383 bit = 1;
384 else
385 bit = 2;
386 } else
387 bit = 3;
388
389 return bit;
390}
391
392static __always_inline int trace_test_and_set_recursion(int start, int max)
393{
394 unsigned int val = current->trace_recursion;
395 int bit;
396
397 /* A previous recursion check was made */
398 if ((val & TRACE_CONTEXT_MASK) > max)
399 return 0;
400
401 bit = trace_get_context_bit() + start;
402 if (unlikely(val & (1 << bit)))
403 return -1;
404
405 val |= 1 << bit;
406 current->trace_recursion = val;
407 barrier();
408
409 return bit;
410}
411
412static __always_inline void trace_clear_recursion(int bit)
413{
414 unsigned int val = current->trace_recursion;
415
416 if (!bit)
417 return;
418
419 bit = 1 << bit;
420 val &= ~bit;
421
422 barrier();
423 current->trace_recursion = val;
424}
317 425
318#define TRACE_PIPE_ALL_CPU -1 426#define TRACE_PIPE_ALL_CPU -1
319 427
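
Taken together, trace_get_context_bit() picks one of four bits per context class (NMI, hard irq, soft irq, normal), trace_test_and_set_recursion() claims that bit within the caller's range unless a caller higher in the chain already holds one, and trace_clear_recursion() releases it. A minimal sketch of how a callback can wrap its work with these helpers; the callback itself is illustrative, only the helper names and the FTRACE range constants come from the patch (function_trace_call() further down uses the same pattern):

	static void my_trace_callback(unsigned long ip, unsigned long parent_ip,
				      struct ftrace_ops *op, struct pt_regs *pt_regs)
	{
		int bit;

		preempt_disable_notrace();

		/* Claim the per-context recursion bit for the FTRACE range. */
		bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
		if (bit < 0)
			goto out;	/* already recursing in this context, bail out */

		/* ... do the actual tracing work here ... */

		trace_clear_recursion(bit);	/* no-op if a previous caller did the check */
	out:
		preempt_enable_notrace();
	}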
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
21#include <linux/ktime.h> 21#include <linux/ktime.h>
22#include <linux/trace_clock.h> 22#include <linux/trace_clock.h>
23 23
24#include "trace.h"
25
26/* 24/*
27 * trace_clock_local(): the simplest and least coherent tracing clock. 25 * trace_clock_local(): the simplest and least coherent tracing clock.
28 * 26 *
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
44 42
45 return clock; 43 return clock;
46} 44}
45EXPORT_SYMBOL_GPL(trace_clock_local);
47 46
48/* 47/*
49 * trace_clock(): 'between' trace clock. Not completely serialized, 48 * trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
86 local_irq_save(flags); 85 local_irq_save(flags);
87 86
88 this_cpu = raw_smp_processor_id(); 87 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 88 now = sched_clock_cpu(this_cpu);
90 /* 89 /*
91 * If in an NMI context then don't risk lockups and return the 90 * If in an NMI context then don't risk lockups and return the
92 * cpu_clock() time: 91 * cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b946..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
120 119
121 return ret; 120 return ret;
122} 121}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab7..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48} 48}
49 49
50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{
54 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 long disabled;
58 int cpu;
59 int pc;
60
61 if (unlikely(!ftrace_function_enabled))
62 return;
63
64 pc = preempt_count();
65 preempt_disable_notrace();
66 local_save_flags(flags);
67 cpu = raw_smp_processor_id();
68 data = tr->data[cpu];
69 disabled = atomic_inc_return(&data->disabled);
70
71 if (likely(disabled == 1))
72 trace_function(tr, ip, parent_ip, flags, pc);
73
74 atomic_dec(&data->disabled);
75 preempt_enable_notrace();
76}
77
78/* Our option */ 50/* Our option */
79enum { 51enum {
80 TRACE_FUNC_OPT_STACK = 0x1, 52 TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
85static void 57static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 58function_trace_call(unsigned long ip, unsigned long parent_ip,
87 struct ftrace_ops *op, struct pt_regs *pt_regs) 59 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 60{
90 struct trace_array *tr = func_trace; 61 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 62 struct trace_array_cpu *data;
92 unsigned long flags; 63 unsigned long flags;
93 long disabled; 64 int bit;
94 int cpu; 65 int cpu;
95 int pc; 66 int pc;
96 67
97 if (unlikely(!ftrace_function_enabled)) 68 if (unlikely(!ftrace_function_enabled))
98 return; 69 return;
99 70
100 /* 71 pc = preempt_count();
101 * Need to use raw, since this must be called before the 72 preempt_disable_notrace();
102 * recursive protection is performed.
103 */
104 local_irq_save(flags);
105 cpu = raw_smp_processor_id();
106 data = tr->data[cpu];
107 disabled = atomic_inc_return(&data->disabled);
108 73
109 if (likely(disabled == 1)) { 74 bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
110 pc = preempt_count(); 75 if (bit < 0)
76 goto out;
77
78 cpu = smp_processor_id();
79 data = tr->data[cpu];
80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags);
111 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
112 } 83 }
84 trace_clear_recursion(bit);
113 85
114 atomic_dec(&data->disabled); 86 out:
115 local_irq_restore(flags); 87 preempt_enable_notrace();
116} 88}
117 89
118static void 90static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
185{ 157{
186 ftrace_function_enabled = 0; 158 ftrace_function_enabled = 0;
187 159
188 if (trace_flags & TRACE_ITER_PREEMPTONLY)
189 trace_ops.func = function_trace_call_preempt_only;
190 else
191 trace_ops.func = function_trace_call;
192
193 if (func_flags.val & TRACE_FUNC_OPT_STACK) 160 if (func_flags.val & TRACE_FUNC_OPT_STACK)
194 register_ftrace_function(&trace_stack_ops); 161 register_ftrace_function(&trace_stack_ops);
195 else 162 else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40 48#define TRACE_GRAPH_PRINT_IRQS 0x40
49 49
50static unsigned int max_depth;
51
50static struct tracer_opt trace_opts[] = { 52static struct tracer_opt trace_opts[] = {
51 /* Display overruns? (for self-debug purpose) */ 53 /* Display overruns? (for self-debug purpose) */
52 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 54 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
189 191
190 ftrace_pop_return_trace(&trace, &ret, frame_pointer); 192 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
191 trace.rettime = trace_clock_local(); 193 trace.rettime = trace_clock_local();
192 ftrace_graph_return(&trace);
193 barrier(); 194 barrier();
194 current->curr_ret_stack--; 195 current->curr_ret_stack--;
195 196
197 /*
198 * The trace should run after decrementing the ret counter
199 * in case an interrupt were to come in. We don't want to
200 * lose the interrupt if max_depth is set.
201 */
202 ftrace_graph_return(&trace);
203
196 if (unlikely(!ret)) { 204 if (unlikely(!ret)) {
197 ftrace_graph_stop(); 205 ftrace_graph_stop();
198 WARN_ON(1); 206 WARN_ON(1);
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
250 return 0; 258 return 0;
251 259
252 /* trace it when it is-nested-in or is a function enabled. */ 260 /* trace it when it is-nested-in or is a function enabled. */
253 if (!(trace->depth || ftrace_graph_addr(trace->func)) || 261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
254 ftrace_graph_ignore_irqs()) 262 ftrace_graph_ignore_irqs()) ||
263 (max_depth && trace->depth >= max_depth))
255 return 0; 264 return 0;
256 265
257 local_irq_save(flags); 266 local_irq_save(flags);
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
1457#endif 1466#endif
1458}; 1467};
1459 1468
1469
1470static ssize_t
1471graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
1472 loff_t *ppos)
1473{
1474 unsigned long val;
1475 int ret;
1476
1477 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
1478 if (ret)
1479 return ret;
1480
1481 max_depth = val;
1482
1483 *ppos += cnt;
1484
1485 return cnt;
1486}
1487
1488static ssize_t
1489graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
1490 loff_t *ppos)
1491{
1492 char buf[15]; /* More than enough to hold UINT_MAX + "\n" */
1493 int n;
1494
1495 n = sprintf(buf, "%d\n", max_depth);
1496
1497 return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
1498}
1499
1500static const struct file_operations graph_depth_fops = {
1501 .open = tracing_open_generic,
1502 .write = graph_depth_write,
1503 .read = graph_depth_read,
1504 .llseek = generic_file_llseek,
1505};
1506
1507static __init int init_graph_debugfs(void)
1508{
1509 struct dentry *d_tracer;
1510
1511 d_tracer = tracing_init_dentry();
1512 if (!d_tracer)
1513 return 0;
1514
1515 trace_create_file("max_graph_depth", 0644, d_tracer,
1516 NULL, &graph_depth_fops);
1517
1518 return 0;
1519}
1520fs_initcall(init_graph_debugfs);
1521
1460static __init int init_graph_trace(void) 1522static __init int init_graph_trace(void)
1461{ 1523{
1462 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1524 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
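
The new max_graph_depth control stops the graph tracer from recording below a configurable call depth: a value of 0 keeps the old trace-everything behaviour, otherwise trace_graph_entry() bails out once trace->depth reaches the limit. A small userspace sketch that caps the depth at 1 so only the outermost functions of each call chain are recorded (the debugfs path is an assumption):

	#include <stdio.h>

	int main(void)
	{
		/* Assumed debugfs mount point; the file is created by init_graph_debugfs(). */
		FILE *f = fopen("/sys/kernel/debug/tracing/max_graph_depth", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("1\n", f);	/* record only depth-0 entries */
		return fclose(f);
	}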
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc7..697e88d13907 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -739,12 +739,11 @@ static int task_state_char(unsigned long state)
739struct trace_event *ftrace_find_event(int type) 739struct trace_event *ftrace_find_event(int type)
740{ 740{
741 struct trace_event *event; 741 struct trace_event *event;
742 struct hlist_node *n;
743 unsigned key; 742 unsigned key;
744 743
745 key = type & (EVENT_HASHSIZE - 1); 744 key = type & (EVENT_HASHSIZE - 1);
746 745
747 hlist_for_each_entry(event, n, &event_hash[key], node) { 746 hlist_for_each_entry(event, &event_hash[key], node) {
748 if (event->type == type) 747 if (event->type == type)
749 return event; 748 return event;
750 } 749 }
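
This hunk, like the similar ones in tracepoint.c, user.c and user-return-notifier.c further down, adapts to the updated hlist iterators, which no longer take a separate struct hlist_node cursor. A generic sketch of the three-argument form; the item type and lookup function are hypothetical, only the iterator usage reflects the new API:

	#include <linux/list.h>

	struct item {
		int key;
		struct hlist_node node;
	};

	static struct item *item_lookup(struct hlist_head *head, int key)
	{
		struct item *it;

		/* Iterate by entry only, no explicit hlist_node cursor. */
		hlist_for_each_entry(it, head, node) {
			if (it->key == key)
				return it;
		}
		return NULL;
	}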
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
66#define TP_FLAG_TRACE 1 66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2 67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4 68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70 69
71 70
72/* data_rloc: data relative location, compatible with u32 */ 71/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca0..75aa97fbe1a1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,8 +15,8 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h>
18#include <trace/events/sched.h> 19#include <trace/events/sched.h>
19
20#include "trace.h" 20#include "trace.h"
21 21
22static struct trace_array *wakeup_trace; 22static struct trace_array *wakeup_trace;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a815..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
415 * The ftrace infrastructure should provide the recursion 415 * The ftrace infrastructure should provide the recursion
416 * protection. If not, this will crash the kernel! 416 * protection. If not, this will crash the kernel!
417 */ 417 */
418 trace_selftest_recursion_cnt++; 418 if (trace_selftest_recursion_cnt++ > 10)
419 return;
419 DYN_FTRACE_TEST_NAME(); 420 DYN_FTRACE_TEST_NAME();
420} 421}
421 422
@@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)
452 char *func_name; 453 char *func_name;
453 int len; 454 int len;
454 int ret; 455 int ret;
455 int cnt;
456 456
457 /* The previous test PASSED */ 457 /* The previous test PASSED */
458 pr_cont("PASSED\n"); 458 pr_cont("PASSED\n");
@@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)
510 510
511 unregister_ftrace_function(&test_recsafe_probe); 511 unregister_ftrace_function(&test_recsafe_probe);
512 512
513 /*
514 * If arch supports all ftrace features, and no other task
515 * was on the list, we should be fine.
516 */
517 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
518 cnt = 2; /* Should have recursed */
519 else
520 cnt = 1;
521
522 ret = -1; 513 ret = -1;
523 if (trace_selftest_recursion_cnt != cnt) { 514 if (trace_selftest_recursion_cnt != 2) {
524 pr_cont("*callback not called expected %d times (%d)* ", 515 pr_cont("*callback not called expected 2 times (%d)* ",
525 cnt, trace_selftest_recursion_cnt); 516 trace_selftest_recursion_cnt);
526 goto out; 517 goto out;
527 } 518 }
528 519
@@ -568,7 +559,7 @@ trace_selftest_function_regs(void)
568 int ret; 559 int ret;
569 int supported = 0; 560 int supported = 0;
570 561
571#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 562#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
572 supported = 1; 563 supported = 1;
573#endif 564#endif
574 565
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c2..7a809e321058 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/syscalls.h>
3#include <linux/slab.h> 4#include <linux/slab.h>
4#include <linux/kernel.h> 5#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ 6#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
@@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
47} 48}
48#endif 49#endif
49 50
51#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
52/*
53 * Some architectures that allow for 32bit applications
54 * to run on a 64bit kernel do not map the syscalls for
55 * the 32bit tasks the same as they do for 64bit tasks.
56 *
57 * *cough*x86*cough*
58 *
59 * In such a case, instead of reporting the wrong syscalls,
60 * simply ignore them.
61 *
62 * For an arch to ignore the compat syscalls it needs to
63 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
64 * define the function arch_trace_is_compat_syscall() to let
65 * the tracing system know that it should ignore it.
66 */
67static int
68trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
69{
70 if (unlikely(arch_trace_is_compat_syscall(regs)))
71 return -1;
72
73 return syscall_get_nr(task, regs);
74}
75#else
76static inline int
77trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
78{
79 return syscall_get_nr(task, regs);
80}
81#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82
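
To opt in, an architecture defines ARCH_TRACE_IGNORE_COMPAT_SYSCALLS and supplies arch_trace_is_compat_syscall() in its ftrace headers; trace_get_syscall_nr() then reports -1 for compat syscalls and every caller below simply skips them. A hedged sketch of what such an arch hook might look like; the predicate used is purely illustrative, each architecture has its own way of detecting a compat task:

	/* Hypothetical arch/<arch>/include/asm/ftrace.h excerpt */
	#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1

	static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
	{
		/* Illustrative check only: treat any 32-bit task as compat. */
		return is_compat_task();
	}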
50static __init struct syscall_metadata * 83static __init struct syscall_metadata *
51find_syscall_meta(unsigned long syscall) 84find_syscall_meta(unsigned long syscall)
52{ 85{
@@ -77,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
77 return syscalls_metadata[nr]; 110 return syscalls_metadata[nr];
78} 111}
79 112
80enum print_line_t 113static enum print_line_t
81print_syscall_enter(struct trace_iterator *iter, int flags, 114print_syscall_enter(struct trace_iterator *iter, int flags,
82 struct trace_event *event) 115 struct trace_event *event)
83{ 116{
@@ -130,7 +163,7 @@ end:
130 return TRACE_TYPE_HANDLED; 163 return TRACE_TYPE_HANDLED;
131} 164}
132 165
133enum print_line_t 166static enum print_line_t
134print_syscall_exit(struct trace_iterator *iter, int flags, 167print_syscall_exit(struct trace_iterator *iter, int flags,
135 struct trace_event *event) 168 struct trace_event *event)
136{ 169{
@@ -270,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
270 return ret; 303 return ret;
271} 304}
272 305
273void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
274{ 307{
275 struct syscall_trace_enter *entry; 308 struct syscall_trace_enter *entry;
276 struct syscall_metadata *sys_data; 309 struct syscall_metadata *sys_data;
277 struct ring_buffer_event *event; 310 struct ring_buffer_event *event;
278 struct ring_buffer *buffer; 311 struct ring_buffer *buffer;
279 int size;
280 int syscall_nr; 312 int syscall_nr;
313 int size;
281 314
282 syscall_nr = syscall_get_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
283 if (syscall_nr < 0) 316 if (syscall_nr < 0)
284 return; 317 return;
285 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 318 if (!test_bit(syscall_nr, enabled_enter_syscalls))
@@ -305,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
305 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 338 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
306} 339}
307 340
308void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
309{ 342{
310 struct syscall_trace_exit *entry; 343 struct syscall_trace_exit *entry;
311 struct syscall_metadata *sys_data; 344 struct syscall_metadata *sys_data;
@@ -313,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
313 struct ring_buffer *buffer; 346 struct ring_buffer *buffer;
314 int syscall_nr; 347 int syscall_nr;
315 348
316 syscall_nr = syscall_get_nr(current, regs); 349 syscall_nr = trace_get_syscall_nr(current, regs);
317 if (syscall_nr < 0) 350 if (syscall_nr < 0)
318 return; 351 return;
319 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 352 if (!test_bit(syscall_nr, enabled_exit_syscalls))
@@ -337,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
337 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
338} 371}
339 372
340int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_call *call)
341{ 374{
342 int ret = 0; 375 int ret = 0;
343 int num; 376 int num;
@@ -356,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
356 return ret; 389 return ret;
357} 390}
358 391
359void unreg_event_syscall_enter(struct ftrace_event_call *call) 392static void unreg_event_syscall_enter(struct ftrace_event_call *call)
360{ 393{
361 int num; 394 int num;
362 395
@@ -371,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
371 mutex_unlock(&syscall_trace_lock); 404 mutex_unlock(&syscall_trace_lock);
372} 405}
373 406
374int reg_event_syscall_exit(struct ftrace_event_call *call) 407static int reg_event_syscall_exit(struct ftrace_event_call *call)
375{ 408{
376 int ret = 0; 409 int ret = 0;
377 int num; 410 int num;
@@ -390,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
390 return ret; 423 return ret;
391} 424}
392 425
393void unreg_event_syscall_exit(struct ftrace_event_call *call) 426static void unreg_event_syscall_exit(struct ftrace_event_call *call)
394{ 427{
395 int num; 428 int num;
396 429
@@ -459,7 +492,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
459 return (unsigned long)sys_call_table[nr]; 492 return (unsigned long)sys_call_table[nr];
460} 493}
461 494
462int __init init_ftrace_syscalls(void) 495static int __init init_ftrace_syscalls(void)
463{ 496{
464 struct syscall_metadata *meta; 497 struct syscall_metadata *meta;
465 unsigned long addr; 498 unsigned long addr;
@@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
502 int rctx; 535 int rctx;
503 int size; 536 int size;
504 537
505 syscall_nr = syscall_get_nr(current, regs); 538 syscall_nr = trace_get_syscall_nr(current, regs);
506 if (syscall_nr < 0) 539 if (syscall_nr < 0)
507 return; 540 return;
508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 541 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
@@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
578 int rctx; 611 int rctx;
579 int size; 612 int size;
580 613
581 syscall_nr = syscall_get_nr(current, regs); 614 syscall_nr = trace_get_syscall_nr(current, regs);
582 if (syscall_nr < 0) 615 if (syscall_nr < 0)
583 return; 616 return;
584 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 617 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c86e6d4f67fb..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,20 +28,21 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
31struct trace_uprobe_filter {
32 rwlock_t rwlock;
33 int nr_systemwide;
34 struct list_head perf_events;
35};
36
31/* 37/*
32 * uprobe event core functions 38 * uprobe event core functions
33 */ 39 */
34struct trace_uprobe;
35struct uprobe_trace_consumer {
36 struct uprobe_consumer cons;
37 struct trace_uprobe *tu;
38};
39
40struct trace_uprobe { 40struct trace_uprobe {
41 struct list_head list; 41 struct list_head list;
42 struct ftrace_event_class class; 42 struct ftrace_event_class class;
43 struct ftrace_event_call call; 43 struct ftrace_event_call call;
44 struct uprobe_trace_consumer *consumer; 44 struct trace_uprobe_filter filter;
45 struct uprobe_consumer consumer;
45 struct inode *inode; 46 struct inode *inode;
46 char *filename; 47 char *filename;
47 unsigned long offset; 48 unsigned long offset;
@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
64 65
65static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
66 67
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{
70 rwlock_init(&filter->rwlock);
71 filter->nr_systemwide = 0;
72 INIT_LIST_HEAD(&filter->perf_events);
73}
74
75static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
76{
77 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78}
79
67/* 80/*
68 * Allocate new trace_uprobe and initialize it (including uprobes). 81 * Allocate new trace_uprobe and initialize it (including uprobes).
69 */ 82 */
@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
92 goto error; 105 goto error;
93 106
94 INIT_LIST_HEAD(&tu->list); 107 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter);
95 return tu; 110 return tu;
96 111
97error: 112error:
@@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
253 if (ret) 268 if (ret)
254 goto fail_address_parse; 269 goto fail_address_parse;
255 270
271 inode = igrab(path.dentry->d_inode);
272 path_put(&path);
273
274 if (!inode || !S_ISREG(inode->i_mode)) {
275 ret = -EINVAL;
276 goto fail_address_parse;
277 }
278
256 ret = kstrtoul(arg, 0, &offset); 279 ret = kstrtoul(arg, 0, &offset);
257 if (ret) 280 if (ret)
258 goto fail_address_parse; 281 goto fail_address_parse;
259 282
260 inode = igrab(path.dentry->d_inode);
261
262 argc -= 2; 283 argc -= 2;
263 argv += 2; 284 argv += 2;
264 285
@@ -356,7 +377,7 @@ fail_address_parse:
356 if (inode) 377 if (inode)
357 iput(inode); 378 iput(inode);
358 379
359 pr_info("Failed to parse address.\n"); 380 pr_info("Failed to parse address or file.\n");
360 381
361 return ret; 382 return ret;
362} 383}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
465}; 486};
466 487
467/* uprobe handler */ 488/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{ 490{
470 struct uprobe_trace_entry_head *entry; 491 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event; 492 struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
475 unsigned long irq_flags; 496 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call; 497 struct ftrace_event_call *call = &tu->call;
477 498
478 tu->nhit++;
479
480 local_save_flags(irq_flags); 499 local_save_flags(irq_flags);
481 pc = preempt_count(); 500 pc = preempt_count();
482 501
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc); 505 size, irq_flags, pc);
487 if (!event) 506 if (!event)
488 return; 507 return 0;
489 508
490 entry = ring_buffer_event_data(event); 509 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 510 entry->ip = instruction_pointer(task_pt_regs(current));
492 data = (u8 *)&entry[1]; 511 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++) 512 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495 514
496 if (!filter_current_check_discard(buffer, call, entry, event)) 515 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
517
518 return 0;
498} 519}
499 520
500/* Event entry printers */ 521/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
533 return TRACE_TYPE_PARTIAL_LINE; 554 return TRACE_TYPE_PARTIAL_LINE;
534} 555}
535 556
536static int probe_event_enable(struct trace_uprobe *tu, int flag) 557static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
537{ 558{
538 struct uprobe_trace_consumer *utc; 559 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
539 int ret = 0; 560}
540 561
541 if (!tu->inode || tu->consumer) 562typedef bool (*filter_func_t)(struct uprobe_consumer *self,
542 return -EINTR; 563 enum uprobe_filter_ctx ctx,
564 struct mm_struct *mm);
543 565
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); 566static int
545 if (!utc) 567probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
568{
569 int ret = 0;
570
571 if (is_trace_uprobe_enabled(tu))
546 return -EINTR; 572 return -EINTR;
547 573
548 utc->cons.handler = uprobe_dispatcher; 574 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555 575
556 tu->flags |= flag; 576 tu->flags |= flag;
557 utc->tu = tu; 577 tu->consumer.filter = filter;
558 tu->consumer = utc; 578 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
579 if (ret)
580 tu->flags &= ~flag;
559 581
560 return 0; 582 return ret;
561} 583}
562 584
563static void probe_event_disable(struct trace_uprobe *tu, int flag) 585static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{ 586{
565 if (!tu->inode || !tu->consumer) 587 if (!is_trace_uprobe_enabled(tu))
566 return; 588 return;
567 589
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); 590 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
591
592 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
569 tu->flags &= ~flag; 593 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572} 594}
573 595
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 596static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
642} 664}
643 665
644#ifdef CONFIG_PERF_EVENTS 666#ifdef CONFIG_PERF_EVENTS
667static bool
668__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
669{
670 struct perf_event *event;
671
672 if (filter->nr_systemwide)
673 return true;
674
675 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
676 if (event->hw.tp_target->mm == mm)
677 return true;
678 }
679
680 return false;
681}
682
683static inline bool
684uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
685{
686 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
687}
688
689static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
690{
691 bool done;
692
693 write_lock(&tu->filter.rwlock);
694 if (event->hw.tp_target) {
695 /*
696 * event->parent != NULL means copy_process(), we can avoid
697 * uprobe_apply(). current->mm must be probed and we can rely
698 * on dup_mmap() which preserves the already installed bp's.
699 *
700 * attr.enable_on_exec means that exec/mmap will install the
701 * breakpoints we need.
702 */
703 done = tu->filter.nr_systemwide ||
704 event->parent || event->attr.enable_on_exec ||
705 uprobe_filter_event(tu, event);
706 list_add(&event->hw.tp_list, &tu->filter.perf_events);
707 } else {
708 done = tu->filter.nr_systemwide;
709 tu->filter.nr_systemwide++;
710 }
711 write_unlock(&tu->filter.rwlock);
712
713 if (!done)
714 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
715
716 return 0;
717}
718
719static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
720{
721 bool done;
722
723 write_lock(&tu->filter.rwlock);
724 if (event->hw.tp_target) {
725 list_del(&event->hw.tp_list);
726 done = tu->filter.nr_systemwide ||
727 (event->hw.tp_target->flags & PF_EXITING) ||
728 uprobe_filter_event(tu, event);
729 } else {
730 tu->filter.nr_systemwide--;
731 done = tu->filter.nr_systemwide;
732 }
733 write_unlock(&tu->filter.rwlock);
734
735 if (!done)
736 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
737
738 return 0;
739}
740
741static bool uprobe_perf_filter(struct uprobe_consumer *uc,
742 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
743{
744 struct trace_uprobe *tu;
745 int ret;
746
747 tu = container_of(uc, struct trace_uprobe, consumer);
748 read_lock(&tu->filter.rwlock);
749 ret = __uprobe_perf_filter(&tu->filter, mm);
750 read_unlock(&tu->filter.rwlock);
751
752 return ret;
753}
754
645/* uprobe profile handler */ 755/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{ 757{
648 struct ftrace_event_call *call = &tu->call; 758 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry; 759 struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
652 int size, __size, i; 762 int size, __size, i;
653 int rctx; 763 int rctx;
654 764
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
766 return UPROBE_HANDLER_REMOVE;
767
655 __size = sizeof(*entry) + tu->size; 768 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32); 770 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return; 772 return 0;
660 773
661 preempt_disable(); 774 preempt_disable();
662 775
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
664 if (!entry) 777 if (!entry)
665 goto out; 778 goto out;
666 779
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 780 entry->ip = instruction_pointer(task_pt_regs(current));
668 data = (u8 *)&entry[1]; 781 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++) 782 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
674 787
675 out: 788 out:
676 preempt_enable(); 789 preempt_enable();
790 return 0;
677} 791}
678#endif /* CONFIG_PERF_EVENTS */ 792#endif /* CONFIG_PERF_EVENTS */
679 793
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
684 798
685 switch (type) { 799 switch (type) {
686 case TRACE_REG_REGISTER: 800 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE); 801 return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
688 802
689 case TRACE_REG_UNREGISTER: 803 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE); 804 probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
692 806
693#ifdef CONFIG_PERF_EVENTS 807#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER: 808 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE); 809 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
696 810
697 case TRACE_REG_PERF_UNREGISTER: 811 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE); 812 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0; 813 return 0;
814
815 case TRACE_REG_PERF_OPEN:
816 return uprobe_perf_open(tu, data);
817
818 case TRACE_REG_PERF_CLOSE:
819 return uprobe_perf_close(tu, data);
820
700#endif 821#endif
701 default: 822 default:
702 return 0; 823 return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
706 827
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 828static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{ 829{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu; 830 struct trace_uprobe *tu;
831 int ret = 0;
711 832
712 utc = container_of(con, struct uprobe_trace_consumer, cons); 833 tu = container_of(con, struct trace_uprobe, consumer);
713 tu = utc->tu; 834 tu->nhit++;
714 if (!tu || tu->consumer != utc)
715 return 0;
716 835
717 if (tu->flags & TP_FLAG_TRACE) 836 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs); 837 ret |= uprobe_trace_func(tu, regs);
719 838
720#ifdef CONFIG_PERF_EVENTS 839#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE) 840 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs); 841 ret |= uprobe_perf_func(tu, regs);
723#endif 842#endif
724 return 0; 843 return ret;
725} 844}
726 845
727static struct trace_event_functions uprobe_funcs = { 846static struct trace_event_functions uprobe_funcs = {
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabfa..0c05a4592047 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
192static struct tracepoint_entry *get_tracepoint(const char *name) 192static struct tracepoint_entry *get_tracepoint(const char *name)
193{ 193{
194 struct hlist_head *head; 194 struct hlist_head *head;
195 struct hlist_node *node;
196 struct tracepoint_entry *e; 195 struct tracepoint_entry *e;
197 u32 hash = jhash(name, strlen(name), 0); 196 u32 hash = jhash(name, strlen(name), 0);
198 197
199 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 198 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
200 hlist_for_each_entry(e, node, head, hlist) { 199 hlist_for_each_entry(e, head, hlist) {
201 if (!strcmp(name, e->name)) 200 if (!strcmp(name, e->name))
202 return e; 201 return e;
203 } 202 }
@@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name)
211static struct tracepoint_entry *add_tracepoint(const char *name) 210static struct tracepoint_entry *add_tracepoint(const char *name)
212{ 211{
213 struct hlist_head *head; 212 struct hlist_head *head;
214 struct hlist_node *node;
215 struct tracepoint_entry *e; 213 struct tracepoint_entry *e;
216 size_t name_len = strlen(name) + 1; 214 size_t name_len = strlen(name) + 1;
217 u32 hash = jhash(name, name_len-1, 0); 215 u32 hash = jhash(name, name_len-1, 0);
218 216
219 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 217 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
220 hlist_for_each_entry(e, node, head, hlist) { 218 hlist_for_each_entry(e, head, hlist) {
221 if (!strcmp(name, e->name)) { 219 if (!strcmp(name, e->name)) {
222 printk(KERN_NOTICE 220 printk(KERN_NOTICE
223 "tracepoint %s busy\n", name); 221 "tracepoint %s busy\n", name);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b44690..a1dd9a1b1327 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts; 34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled;
35 u64 ac_etime; 36 u64 ac_etime;
36 37
37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,
65 stats->ac_ppid = pid_alive(tsk) ? 66 stats->ac_ppid = pid_alive(tsk) ?
66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; 67 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
67 rcu_read_unlock(); 68 rcu_read_unlock();
68 stats->ac_utime = cputime_to_usecs(tsk->utime); 69
69 stats->ac_stime = cputime_to_usecs(tsk->stime); 70 task_cputime(tsk, &utime, &stime);
70 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); 71 stats->ac_utime = cputime_to_usecs(utime);
71 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); 72 stats->ac_stime = cputime_to_usecs(stime);
73
74 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
75 stats->ac_utimescaled = cputime_to_usecs(utimescaled);
76 stats->ac_stimescaled = cputime_to_usecs(stimescaled);
77
72 stats->ac_minflt = tsk->min_flt; 78 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 79 stats->ac_majflt = tsk->maj_flt;
74 80
@@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
115#undef KB 121#undef KB
116#undef MB 122#undef MB
117 123
118/** 124static void __acct_update_integrals(struct task_struct *tsk,
119 * acct_update_integrals - update mm integral fields in task_struct 125 cputime_t utime, cputime_t stime)
120 * @tsk: task_struct for accounting
121 */
122void acct_update_integrals(struct task_struct *tsk)
123{ 126{
124 if (likely(tsk->mm)) { 127 if (likely(tsk->mm)) {
125 cputime_t time, dtime; 128 cputime_t time, dtime;
@@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
128 u64 delta; 131 u64 delta;
129 132
130 local_irq_save(flags); 133 local_irq_save(flags);
131 time = tsk->stime + tsk->utime; 134 time = stime + utime;
132 dtime = time - tsk->acct_timexpd; 135 dtime = time - tsk->acct_timexpd;
133 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 136 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
134 delta = value.tv_sec; 137 delta = value.tv_sec;
@@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
145} 148}
146 149
147/** 150/**
151 * acct_update_integrals - update mm integral fields in task_struct
152 * @tsk: task_struct for accounting
153 */
154void acct_update_integrals(struct task_struct *tsk)
155{
156 cputime_t utime, stime;
157
158 task_cputime(tsk, &utime, &stime);
159 __acct_update_integrals(tsk, utime, stime);
160}
161
162/**
163 * acct_account_cputime - update mm integral after cputime update
164 * @tsk: task_struct for accounting
165 */
166void acct_account_cputime(struct task_struct *tsk)
167{
168 __acct_update_integrals(tsk, tsk->utime, tsk->stime);
169}
170
171/**
148 * acct_clear_integrals - clear the mm integral fields in task_struct 172 * acct_clear_integrals - clear the mm integral fields in task_struct
149 * @tsk: task_struct whose accounting fields are cleared 173 * @tsk: task_struct whose accounting fields are cleared
150 */ 174 */
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1fb..394f70b17162 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
34void fire_user_return_notifiers(void) 34void fire_user_return_notifiers(void)
35{ 35{
36 struct user_return_notifier *urn; 36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2; 37 struct hlist_node *tmp2;
38 struct hlist_head *head; 38 struct hlist_head *head;
39 39
40 head = &get_cpu_var(return_notifier_list); 40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) 41 hlist_for_each_entry_safe(urn, tmp2, head, link)
42 urn->on_user_return(urn); 42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list); 43 put_cpu_var(return_notifier_list);
44} 44}
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e53a5f..e81978e8c03b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -47,9 +47,7 @@ struct user_namespace init_user_ns = {
47 .count = 4294967295U, 47 .count = 4294967295U,
48 }, 48 },
49 }, 49 },
50 .kref = { 50 .count = ATOMIC_INIT(3),
51 .refcount = ATOMIC_INIT(3),
52 },
53 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
54 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
@@ -107,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up)
107static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) 105static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
108{ 106{
109 struct user_struct *user; 107 struct user_struct *user;
110 struct hlist_node *h;
111 108
112 hlist_for_each_entry(user, h, hashent, uidhash_node) { 109 hlist_for_each_entry(user, hashent, uidhash_node) {
113 if (uid_eq(user->uid, uid)) { 110 if (uid_eq(user->uid, uid)) {
114 atomic_inc(&user->__count); 111 atomic_inc(&user->__count);
115 return user; 112 return user;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2b042c42fbc4..b14f4d342043 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/projid.h> 23#include <linux/projid.h>
24#include <linux/fs_struct.h>
24 25
25static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
26 27
@@ -78,7 +79,7 @@ int create_user_ns(struct cred *new)
78 return ret; 79 return ret;
79 } 80 }
80 81
81 kref_init(&ns->kref); 82 atomic_set(&ns->count, 1);
82 /* Leave the new->user_ns reference with the new user namespace. */ 83 /* Leave the new->user_ns reference with the new user namespace. */
83 ns->parent = parent_ns; 84 ns->parent = parent_ns;
84 ns->owner = owner; 85 ns->owner = owner;
@@ -104,15 +105,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
104 return create_user_ns(cred); 105 return create_user_ns(cred);
105} 106}
106 107
107void free_user_ns(struct kref *kref) 108void free_user_ns(struct user_namespace *ns)
108{ 109{
109 struct user_namespace *parent, *ns = 110 struct user_namespace *parent;
110 container_of(kref, struct user_namespace, kref);
111 111
112 parent = ns->parent; 112 do {
113 proc_free_inum(ns->proc_inum); 113 parent = ns->parent;
114 kmem_cache_free(user_ns_cachep, ns); 114 proc_free_inum(ns->proc_inum);
115 put_user_ns(parent); 115 kmem_cache_free(user_ns_cachep, ns);
116 ns = parent;
117 } while (atomic_dec_and_test(&parent->count));
116} 118}
117EXPORT_SYMBOL(free_user_ns); 119EXPORT_SYMBOL(free_user_ns);
118 120
@@ -519,6 +521,42 @@ struct seq_operations proc_projid_seq_operations = {
519 .show = projid_m_show, 521 .show = projid_m_show,
520}; 522};
521 523
524static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
525{
526 u32 upper_first, lower_first, upper_last, lower_last;
527 unsigned idx;
528
529 upper_first = extent->first;
530 lower_first = extent->lower_first;
531 upper_last = upper_first + extent->count - 1;
532 lower_last = lower_first + extent->count - 1;
533
534 for (idx = 0; idx < new_map->nr_extents; idx++) {
535 u32 prev_upper_first, prev_lower_first;
536 u32 prev_upper_last, prev_lower_last;
537 struct uid_gid_extent *prev;
538
539 prev = &new_map->extent[idx];
540
541 prev_upper_first = prev->first;
542 prev_lower_first = prev->lower_first;
543 prev_upper_last = prev_upper_first + prev->count - 1;
544 prev_lower_last = prev_lower_first + prev->count - 1;
545
546 /* Does the upper range intersect a previous extent? */
547 if ((prev_upper_first <= upper_last) &&
548 (prev_upper_last >= upper_first))
549 return true;
550
551 /* Does the lower range intersect a previous extent? */
552 if ((prev_lower_first <= lower_last) &&
553 (prev_lower_last >= lower_first))
554 return true;
555 }
556 return false;
557}
558
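
mappings_overlap() treats each extent as two closed intervals, [first, first + count - 1] on the namespace side and [lower_first, lower_first + count - 1] on the kernel side, and rejects a new extent if either interval intersects any extent already accepted; that check replaces the old "extents must be strictly in order" rule removed below. A small standalone example of the same interval test, with simplified types purely for illustration:

	#include <stdio.h>
	#include <stdbool.h>
	#include <stdint.h>

	struct extent { uint32_t first, lower_first, count; };

	static bool ranges_intersect(uint32_t a_first, uint32_t a_last,
				     uint32_t b_first, uint32_t b_last)
	{
		return a_first <= b_last && a_last >= b_first;
	}

	int main(void)
	{
		/* Accepted extent: ids 0..999 in the namespace map to 100000..100999. */
		struct extent prev = { .first = 0,   .lower_first = 100000, .count = 1000 };
		/* Candidate: ids 500..1499 map to 200000..200999 - the upper ranges overlap. */
		struct extent cand = { .first = 500, .lower_first = 200000, .count = 1000 };

		bool upper = ranges_intersect(prev.first, prev.first + prev.count - 1,
					      cand.first, cand.first + cand.count - 1);
		bool lower = ranges_intersect(prev.lower_first, prev.lower_first + prev.count - 1,
					      cand.lower_first, cand.lower_first + cand.count - 1);

		/* Prints "upper 1 lower 0": the candidate would be rejected. */
		printf("upper %d lower %d\n", upper, lower);
		return 0;
	}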
559
522static DEFINE_MUTEX(id_map_mutex); 560static DEFINE_MUTEX(id_map_mutex);
523 561
524static ssize_t map_write(struct file *file, const char __user *buf, 562static ssize_t map_write(struct file *file, const char __user *buf,
@@ -531,7 +569,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
531 struct user_namespace *ns = seq->private; 569 struct user_namespace *ns = seq->private;
532 struct uid_gid_map new_map; 570 struct uid_gid_map new_map;
533 unsigned idx; 571 unsigned idx;
534 struct uid_gid_extent *extent, *last = NULL; 572 struct uid_gid_extent *extent = NULL;
535 unsigned long page = 0; 573 unsigned long page = 0;
536 char *kbuf, *pos, *next_line; 574 char *kbuf, *pos, *next_line;
537 ssize_t ret = -EINVAL; 575 ssize_t ret = -EINVAL;
@@ -634,14 +672,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
634 if ((extent->lower_first + extent->count) <= extent->lower_first) 672 if ((extent->lower_first + extent->count) <= extent->lower_first)
635 goto out; 673 goto out;
636 674
637 /* For now only accept extents that are strictly in order */ 675 /* Do the ranges in extent overlap any previous extents? */
638 if (last && 676 if (mappings_overlap(&new_map, extent))
639 (((last->first + last->count) > extent->first) ||
640 ((last->lower_first + last->count) > extent->lower_first)))
641 goto out; 677 goto out;
642 678
643 new_map.nr_extents++; 679 new_map.nr_extents++;
644 last = extent;
645 680
646 /* Fail if the file contains too many extents */ 681 /* Fail if the file contains too many extents */
647 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && 682 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
@@ -803,6 +838,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
803 if (atomic_read(&current->mm->mm_users) > 1) 838 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL; 839 return -EINVAL;
805 840
841 if (current->fs->users != 1)
842 return -EINVAL;
843
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 844 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM; 845 return -EPERM;
808 846
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 08b197e8c485..a47fc5de3113 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void)
30/* 30/*
31 * Clone a new ns copying an original utsname, setting refcount to 1 31 * Clone a new ns copying an original utsname, setting refcount to 1
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d820..4f69f9a5e221 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17 17
18#ifdef CONFIG_PROC_SYSCTL
19
18static void *get_uts(ctl_table *table, int write) 20static void *get_uts(ctl_table *table, int write)
19{ 21{
20 char *which = table->data; 22 char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
38 up_write(&uts_sem); 40 up_write(&uts_sem);
39} 41}
40 42
41#ifdef CONFIG_PROC_SYSCTL
42/* 43/*
43 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
44 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3d0b02..4a944676358e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h> 25#include <linux/smpboot.h>
26#include <linux/sched/rt.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
@@ -112,9 +113,9 @@ static int get_softlockup_thresh(void)
112 * resolution, and we don't need to waste time with a big divide when 113 * resolution, and we don't need to waste time with a big divide when
113 * 2^30ns == 1.074s. 114 * 2^30ns == 1.074s.
114 */ 115 */
115static unsigned long get_timestamp(int this_cpu) 116static unsigned long get_timestamp(void)
116{ 117{
117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 118 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
118} 119}
119 120
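
The shift keeps the old no-divide approximation: 2^30 ns is 1,073,741,824 ns, roughly 1.074 s, so local_clock() >> 30 yields a timestamp in units of about 1.07 seconds, which is ample resolution for softlockup bookkeeping. A tiny userspace illustration of the conversion and its error:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long ns = 60ULL * 1000000000ULL;	/* 60 s in nanoseconds */

		/* Prints 55: each unit is ~1.074 s, so the count runs about 7% slow. */
		printf("%llu\n", ns >> 30);
		return 0;
	}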
120static void set_sample_period(void) 121static void set_sample_period(void)
@@ -132,9 +133,7 @@ static void set_sample_period(void)
132/* Commands for resetting the watchdog */ 133/* Commands for resetting the watchdog */
133static void __touch_watchdog(void) 134static void __touch_watchdog(void)
134{ 135{
135 int this_cpu = smp_processor_id(); 136 __this_cpu_write(watchdog_touch_ts, get_timestamp());
136
137 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
138} 137}
139 138
140void touch_softlockup_watchdog(void) 139void touch_softlockup_watchdog(void)
@@ -195,7 +194,7 @@ static int is_hardlockup(void)
195 194
196static int is_softlockup(unsigned long touch_ts) 195static int is_softlockup(unsigned long touch_ts)
197{ 196{
198 unsigned long now = get_timestamp(smp_processor_id()); 197 unsigned long now = get_timestamp();
199 198
200 /* Warn about unreasonable delays: */ 199 /* Warn about unreasonable delays: */
201 if (time_after(now, touch_ts + get_softlockup_thresh())) 200 if (time_after(now, touch_ts + get_softlockup_thresh()))
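
The watchdog hunk replaces cpu_clock(this_cpu) with local_clock(), so get_timestamp() no longer needs a CPU argument; the >> 30 trick for getting roughly one-second resolution out of a nanosecond clock is unchanged. A small userspace sketch of that trick, using a POSIX monotonic clock as a stand-in since local_clock() is kernel-only:

/* Shifting a nanosecond clock right by 30 gives ~1 s resolution
 * (2^30 ns = 1.074 s) without a 64-bit divide. */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static unsigned long get_timestamp(void)
{
	struct timespec ts;
	uint64_t ns;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	ns = (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
	return (unsigned long)(ns >> 30);	/* 2^30 ~= 10^9 */
}

int main(void)
{
	unsigned long touch_ts = get_timestamp();
	unsigned long now = get_timestamp();

	/* a softlockup check would compare "now" against touch_ts + threshold */
	printf("coarse timestamps: %lu -> %lu\n", touch_ts, now);
	return 0;
}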
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576a83c3..55fac5b991b7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/hashtable.h>
44 45
45#include "workqueue_sched.h" 46#include "workqueue_internal.h"
46 47
47enum { 48enum {
48 /* 49 /*
49 * global_cwq flags 50 * worker_pool flags
50 * 51 *
51 * A bound gcwq is either associated or disassociated with its CPU. 52 * A bound pool is either associated or disassociated with its CPU.
52 * While associated (!DISASSOCIATED), all workers are bound to the 53 * While associated (!DISASSOCIATED), all workers are bound to the
53 * CPU and none has %WORKER_UNBOUND set and concurrency management 54 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect. 55 * is in effect.
55 * 56 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have 57 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may 58 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one. 59 * be executing on any CPU. The pool behaves as an unbound one.
59 * 60 *
60 * Note that DISASSOCIATED can be flipped only while holding 61 * Note that DISASSOCIATED can be flipped only while holding
61 * assoc_mutex of all pools on the gcwq to avoid changing binding 62 * assoc_mutex to avoid changing binding state while
62 * state while create_worker() is in progress. 63 * create_worker() is in progress.
63 */ 64 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ 66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */
70 69
71 /* worker flags */ 70 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
@@ -79,11 +78,9 @@ enum {
79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
80 WORKER_CPU_INTENSIVE, 79 WORKER_CPU_INTENSIVE,
81 80
82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
83 82
84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
86 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
87 84
88 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
89 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 86 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@ enum {
111 * P: Preemption protected. Disabling preemption is enough and should 108 * P: Preemption protected. Disabling preemption is enough and should
112 * only be modified and accessed from the local cpu. 109 * only be modified and accessed from the local cpu.
113 * 110 *
114 * L: gcwq->lock protected. Access with gcwq->lock held. 111 * L: pool->lock protected. Access with pool->lock held.
115 * 112 *
116 * X: During normal operation, modification requires gcwq->lock and 113 * X: During normal operation, modification requires pool->lock and should
117 * should be done only from local cpu. Either disabling preemption 114 * be done only from local cpu. Either disabling preemption on local
118 * on local cpu or grabbing gcwq->lock is enough for read access. 115 * cpu or grabbing pool->lock is enough for read access. If
119 * If GCWQ_DISASSOCIATED is set, it's identical to L. 116 * POOL_DISASSOCIATED is set, it's identical to L.
120 * 117 *
121 * F: wq->flush_mutex protected. 118 * F: wq->flush_mutex protected.
122 * 119 *
123 * W: workqueue_lock protected. 120 * W: workqueue_lock protected.
124 */ 121 */
125 122
126struct global_cwq; 123/* struct worker is defined in workqueue_internal.h */
127struct worker_pool;
128
129/*
130 * The poor guys doing the actual heavy lifting. All on-duty workers
131 * are either serving the manager role, on idle list or on busy hash.
132 */
133struct worker {
134 /* on idle list while idle, on busy hash table while busy */
135 union {
136 struct list_head entry; /* L: while idle */
137 struct hlist_node hentry; /* L: while busy */
138 };
139
140 struct work_struct *current_work; /* L: work being processed */
141 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142 struct list_head scheduled; /* L: scheduled works */
143 struct task_struct *task; /* I: worker task */
144 struct worker_pool *pool; /* I: the associated pool */
145 /* 64 bytes boundary on 64bit, 32 on 32bit */
146 unsigned long last_active; /* L: last active timestamp */
147 unsigned int flags; /* X: flags */
148 int id; /* I: worker id */
149
150 /* for rebinding worker to CPU */
151 struct work_struct rebind_work; /* L: for busy worker */
152};
153 124
154struct worker_pool { 125struct worker_pool {
155 struct global_cwq *gcwq; /* I: the owning gcwq */ 126 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */
128 int id; /* I: pool ID */
156 unsigned int flags; /* X: flags */ 129 unsigned int flags; /* X: flags */
157 130
158 struct list_head worklist; /* L: list of pending works */ 131 struct list_head worklist; /* L: list of pending works */
@@ -165,34 +138,28 @@ struct worker_pool {
165 struct timer_list idle_timer; /* L: worker idle timeout */ 138 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */ 139 struct timer_list mayday_timer; /* L: SOS timer for workers */
167 140
168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ 141 /* workers are chained either in busy_hash or idle_list */
169 struct ida worker_ida; /* L: for worker IDs */ 142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
170};
171
172/*
173 * Global per-cpu workqueue. There's one and only one for each cpu
174 * and all works are queued and processed here regardless of their
175 * target workqueues.
176 */
177struct global_cwq {
178 spinlock_t lock; /* the gcwq lock */
179 unsigned int cpu; /* I: the associated cpu */
180 unsigned int flags; /* L: GCWQ_* flags */
181
182 /* workers are chained either in busy_hash or pool idle_list */
183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184 /* L: hash of busy workers */ 143 /* L: hash of busy workers */
185 144
186 struct worker_pool pools[NR_WORKER_POOLS]; 145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
187 /* normal and highpri pools */ 146 struct ida worker_ida; /* L: for worker IDs */
147
148 /*
149 * The current concurrency level. As it's likely to be accessed
150 * from other CPUs during try_to_wake_up(), put it in a separate
151 * cacheline.
152 */
153 atomic_t nr_running ____cacheline_aligned_in_smp;
188} ____cacheline_aligned_in_smp; 154} ____cacheline_aligned_in_smp;
189 155
190/* 156/*
191 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of 157 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
192 * work_struct->data are used for flags and thus cwqs need to be 158 * of work_struct->data are used for flags and the remaining high bits
193 * aligned at two's power of the number of flag bits. 159 * point to the pwq; thus, pwqs need to be aligned at two's power of the
160 * number of flag bits.
194 */ 161 */
195struct cpu_workqueue_struct { 162struct pool_workqueue {
196 struct worker_pool *pool; /* I: the associated pool */ 163 struct worker_pool *pool; /* I: the associated pool */
197 struct workqueue_struct *wq; /* I: the owning workqueue */ 164 struct workqueue_struct *wq; /* I: the owning workqueue */
198 int work_color; /* L: current color */ 165 int work_color; /* L: current color */
@@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;
241struct workqueue_struct { 208struct workqueue_struct {
242 unsigned int flags; /* W: WQ_* flags */ 209 unsigned int flags; /* W: WQ_* flags */
243 union { 210 union {
244 struct cpu_workqueue_struct __percpu *pcpu; 211 struct pool_workqueue __percpu *pcpu;
245 struct cpu_workqueue_struct *single; 212 struct pool_workqueue *single;
246 unsigned long v; 213 unsigned long v;
247 } cpu_wq; /* I: cwq's */ 214 } pool_wq; /* I: pwq's */
248 struct list_head list; /* W: list of all workqueues */ 215 struct list_head list; /* W: list of all workqueues */
249 216
250 struct mutex flush_mutex; /* protects wq flushing */ 217 struct mutex flush_mutex; /* protects wq flushing */
251 int work_color; /* F: current work color */ 218 int work_color; /* F: current work color */
252 int flush_color; /* F: current flush color */ 219 int flush_color; /* F: current flush color */
253 atomic_t nr_cwqs_to_flush; /* flush in progress */ 220 atomic_t nr_pwqs_to_flush; /* flush in progress */
254 struct wq_flusher *first_flusher; /* F: first flusher */ 221 struct wq_flusher *first_flusher; /* F: first flusher */
255 struct list_head flusher_queue; /* F: flush waiters */ 222 struct list_head flusher_queue; /* F: flush waiters */
256 struct list_head flusher_overflow; /* F: flush overflow list */ 223 struct list_head flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@ struct workqueue_struct {
259 struct worker *rescuer; /* I: rescue worker */ 226 struct worker *rescuer; /* I: rescue worker */
260 227
261 int nr_drainers; /* W: drain in progress */ 228 int nr_drainers; /* W: drain in progress */
262 int saved_max_active; /* W: saved cwq max_active */ 229 int saved_max_active; /* W: saved pwq max_active */
263#ifdef CONFIG_LOCKDEP 230#ifdef CONFIG_LOCKDEP
264 struct lockdep_map lockdep_map; 231 struct lockdep_map lockdep_map;
265#endif 232#endif
@@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
280#define CREATE_TRACE_POINTS 247#define CREATE_TRACE_POINTS
281#include <trace/events/workqueue.h> 248#include <trace/events/workqueue.h>
282 249
283#define for_each_worker_pool(pool, gcwq) \ 250#define for_each_std_worker_pool(pool, cpu) \
284 for ((pool) = &(gcwq)->pools[0]; \ 251 for ((pool) = &std_worker_pools(cpu)[0]; \
285 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) 252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
286 253
287#define for_each_busy_worker(worker, i, pos, gcwq) \ 254#define for_each_busy_worker(worker, i, pool) \
288 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 255 hash_for_each(pool->busy_hash, i, worker, hentry)
289 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
290 256
291static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, 257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
292 unsigned int sw) 258 unsigned int sw)
293{ 259{
294 if (cpu < nr_cpu_ids) { 260 if (cpu < nr_cpu_ids) {
295 if (sw & 1) { 261 if (sw & 1) {
@@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
300 if (sw & 2) 266 if (sw & 2)
301 return WORK_CPU_UNBOUND; 267 return WORK_CPU_UNBOUND;
302 } 268 }
303 return WORK_CPU_NONE; 269 return WORK_CPU_END;
304} 270}
305 271
306static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
307 struct workqueue_struct *wq) 273 struct workqueue_struct *wq)
308{ 274{
309 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
310} 276}
311 277
312/* 278/*
313 * CPU iterators 279 * CPU iterators
314 * 280 *
315 * An extra gcwq is defined for an invalid cpu number 281 * An extra cpu number is defined using an invalid cpu number
316 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
317 * specific CPU. The following iterators are similar to 283 * specific CPU. The following iterators are similar to for_each_*_cpu()
318 * for_each_*_cpu() iterators but also considers the unbound gcwq. 284 * iterators but also considers the unbound CPU.
319 * 285 *
320 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND 286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
321 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND 287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
322 * for_each_cwq_cpu() : possible CPUs for bound workqueues, 288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
323 * WORK_CPU_UNBOUND for unbound workqueues 289 * WORK_CPU_UNBOUND for unbound workqueues
324 */ 290 */
325#define for_each_gcwq_cpu(cpu) \ 291#define for_each_wq_cpu(cpu) \
326 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ 292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
327 (cpu) < WORK_CPU_NONE; \ 293 (cpu) < WORK_CPU_END; \
328 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) 294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
329 295
330#define for_each_online_gcwq_cpu(cpu) \ 296#define for_each_online_wq_cpu(cpu) \
331 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ 297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
332 (cpu) < WORK_CPU_NONE; \ 298 (cpu) < WORK_CPU_END; \
333 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) 299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
334 300
335#define for_each_cwq_cpu(cpu, wq) \ 301#define for_each_pwq_cpu(cpu, wq) \
336 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ 302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
337 (cpu) < WORK_CPU_NONE; \ 303 (cpu) < WORK_CPU_END; \
338 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
339 305
340#ifdef CONFIG_DEBUG_OBJECTS_WORK 306#ifdef CONFIG_DEBUG_OBJECTS_WORK
341 307
@@ -459,57 +425,70 @@ static LIST_HEAD(workqueues);
459static bool workqueue_freezing; /* W: have wqs started freezing? */ 425static bool workqueue_freezing; /* W: have wqs started freezing? */
460 426
461/* 427/*
462 * The almighty global cpu workqueues. nr_running is the only field 428 * The CPU and unbound standard worker pools. The unbound ones have
463 * which is expected to be used frequently by other cpus via 429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
464 * try_to_wake_up(). Put it in a separate cacheline.
465 */ 430 */
466static DEFINE_PER_CPU(struct global_cwq, global_cwq); 431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
467static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); 432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
468 434
469/* 435/* idr of all pools */
470 * Global cpu workqueue and nr_running counter for unbound gcwq. The 436static DEFINE_MUTEX(worker_pool_idr_mutex);
471 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its 437static DEFINE_IDR(worker_pool_idr);
472 * workers have WORKER_UNBOUND set.
473 */
474static struct global_cwq unbound_global_cwq;
475static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
476 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
477};
478 438
479static int worker_thread(void *__worker); 439static int worker_thread(void *__worker);
480 440
481static int worker_pool_pri(struct worker_pool *pool) 441static struct worker_pool *std_worker_pools(int cpu)
482{ 442{
483 return pool - pool->gcwq->pools; 443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
484} 447}
485 448
486static struct global_cwq *get_gcwq(unsigned int cpu) 449static int std_worker_pool_pri(struct worker_pool *pool)
487{ 450{
488 if (cpu != WORK_CPU_UNBOUND) 451 return pool - std_worker_pools(pool->cpu);
489 return &per_cpu(global_cwq, cpu);
490 else
491 return &unbound_global_cwq;
492} 452}
493 453
494static atomic_t *get_pool_nr_running(struct worker_pool *pool) 454/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool)
495{ 456{
496 int cpu = pool->gcwq->cpu; 457 int ret;
497 int idx = worker_pool_pri(pool);
498 458
499 if (cpu != WORK_CPU_UNBOUND) 459 mutex_lock(&worker_pool_idr_mutex);
500 return &per_cpu(pool_nr_running, cpu)[idx]; 460 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
501 else 461 if (ret >= 0)
502 return &unbound_pool_nr_running[idx]; 462 pool->id = ret;
463 mutex_unlock(&worker_pool_idr_mutex);
464
465 return ret < 0 ? ret : 0;
466}
467
468/*
469 * Lookup worker_pool by id. The idr currently is built during boot and
470 * never modified. Don't worry about locking for now.
471 */
472static struct worker_pool *worker_pool_by_id(int pool_id)
473{
474 return idr_find(&worker_pool_idr, pool_id);
503} 475}
504 476
505static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 477static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
506 struct workqueue_struct *wq) 478{
479 struct worker_pool *pools = std_worker_pools(cpu);
480
481 return &pools[highpri];
482}
483
484static struct pool_workqueue *get_pwq(unsigned int cpu,
485 struct workqueue_struct *wq)
507{ 486{
508 if (!(wq->flags & WQ_UNBOUND)) { 487 if (!(wq->flags & WQ_UNBOUND)) {
509 if (likely(cpu < nr_cpu_ids)) 488 if (likely(cpu < nr_cpu_ids))
510 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 489 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
511 } else if (likely(cpu == WORK_CPU_UNBOUND)) 490 } else if (likely(cpu == WORK_CPU_UNBOUND))
512 return wq->cpu_wq.single; 491 return wq->pool_wq.single;
513 return NULL; 492 return NULL;
514} 493}
515 494
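
The hunk above introduces worker_pool_idr so a pool can be identified by a small integer ID, allocated with idr_alloc() and resolved with idr_find(). A simplified standalone model of that registry follows, with a fixed-size table standing in for the kernel idr; it illustrates the lookup scheme, not the idr implementation.

/* Simplified model: hand out the lowest free integer ID for a pool and map
 * the ID back to the pool pointer later. */
#include <stdio.h>
#include <stddef.h>

struct worker_pool { int id; int cpu; };

#define MAX_POOLS 16
static struct worker_pool *pool_registry[MAX_POOLS];

/* allocate the lowest free ID and assign it to @pool; <0 on failure */
static int worker_pool_assign_id(struct worker_pool *pool)
{
	for (int id = 0; id < MAX_POOLS; id++) {
		if (!pool_registry[id]) {
			pool_registry[id] = pool;
			pool->id = id;
			return 0;
		}
	}
	return -1;
}

static struct worker_pool *worker_pool_by_id(int pool_id)
{
	if (pool_id < 0 || pool_id >= MAX_POOLS)
		return NULL;
	return pool_registry[pool_id];
}

int main(void)
{
	struct worker_pool normal = { .cpu = 0 }, highpri = { .cpu = 0 };

	worker_pool_assign_id(&normal);
	worker_pool_assign_id(&highpri);

	/* a work item can now carry just the small integer ID ... */
	int stored_id = highpri.id;

	/* ... and later be mapped back to the pool it last ran on */
	printf("pool %d lives at %p\n", stored_id,
	       (void *)worker_pool_by_id(stored_id));
	return 0;
}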
@@ -530,19 +509,19 @@ static int work_next_color(int color)
530} 509}
531 510
532/* 511/*
533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data 512 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
534 * contain the pointer to the queued cwq. Once execution starts, the flag 513 * contain the pointer to the queued pwq. Once execution starts, the flag
535 * is cleared and the high bits contain OFFQ flags and CPU number. 514 * is cleared and the high bits contain OFFQ flags and pool ID.
536 * 515 *
537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() 516 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
538 * and clear_work_data() can be used to set the cwq, cpu or clear 517 * and clear_work_data() can be used to set the pwq, pool or clear
539 * work->data. These functions should only be called while the work is 518 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set. 519 * owned - ie. while the PENDING bit is set.
541 * 520 *
542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to 521 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
543 * a work. gcwq is available once the work has been queued anywhere after 522 * corresponding to a work. Pool is available once the work has been
544 * initialization until it is sync canceled. cwq is available only while 523 * queued anywhere after initialization until it is sync canceled. pwq is
545 * the work item is queued. 524 * available only while the work item is queued.
546 * 525 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 526 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set 527 * canceled. While being canceled, a work item may have its PENDING set
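
The updated comment describes how work->data multiplexes a pool_workqueue pointer (while queued, with WORK_STRUCT_PWQ set) and an off-queue pool ID shifted above the flag bits. A rough userspace sketch of that packing follows; the bit positions and helper names here are invented for illustration and do not match include/linux/workqueue.h exactly.

/* One machine word: low bits are flags, the remaining bits hold either an
 * aligned pointer (while the PWQ flag is set) or a small pool ID shifted
 * past the flag bits. */
#include <stdio.h>
#include <assert.h>

#define WORK_STRUCT_PENDING	(1UL << 0)
#define WORK_STRUCT_PWQ		(1UL << 1)
#define WORK_FLAG_BITS		4		/* low bits reserved for flags */
#define WORK_DATA_MASK		(~((1UL << WORK_FLAG_BITS) - 1))
#define POOL_ID_SHIFT		WORK_FLAG_BITS

struct pool_workqueue { int dummy; };
struct work_struct { unsigned long data; };

static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq)
{
	/* pwq must be aligned so its low bits are free for the flags */
	assert(((unsigned long)pwq & ~WORK_DATA_MASK) == 0);
	work->data = (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ;
}

static void set_work_pool_and_clear_pending(struct work_struct *work, int pool_id)
{
	work->data = (unsigned long)pool_id << POOL_ID_SHIFT;
}

static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
	if (work->data & WORK_STRUCT_PWQ)
		return (struct pool_workqueue *)(work->data & WORK_DATA_MASK);
	return NULL;
}

static int get_work_pool_id(struct work_struct *work)
{
	if (work->data & WORK_STRUCT_PWQ)
		return -1;			/* real code chases pwq->pool->id here */
	return (int)(work->data >> POOL_ID_SHIFT);
}

int main(void)
{
	static struct pool_workqueue pwq __attribute__((aligned(16)));
	struct work_struct work = { 0 };

	set_work_pwq(&work, &pwq);			/* queued: data holds the pwq */
	printf("queued pwq: %p\n", (void *)get_work_pwq(&work));

	set_work_pool_and_clear_pending(&work, 3);	/* off queue: data holds the pool ID */
	printf("off-queue pool id: %d\n", get_work_pool_id(&work));
	return 0;
}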
@@ -556,16 +535,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,
556 atomic_long_set(&work->data, data | flags | work_static(work)); 535 atomic_long_set(&work->data, data | flags | work_static(work));
557} 536}
558 537
559static void set_work_cwq(struct work_struct *work, 538static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
560 struct cpu_workqueue_struct *cwq,
561 unsigned long extra_flags) 539 unsigned long extra_flags)
562{ 540{
563 set_work_data(work, (unsigned long)cwq, 541 set_work_data(work, (unsigned long)pwq,
564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 542 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
565} 543}
566 544
567static void set_work_cpu_and_clear_pending(struct work_struct *work, 545static void set_work_pool_and_keep_pending(struct work_struct *work,
568 unsigned int cpu) 546 int pool_id)
547{
548 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
549 WORK_STRUCT_PENDING);
550}
551
552static void set_work_pool_and_clear_pending(struct work_struct *work,
553 int pool_id)
569{ 554{
570 /* 555 /*
571 * The following wmb is paired with the implied mb in 556 * The following wmb is paired with the implied mb in
@@ -574,67 +559,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,
574 * owner. 559 * owner.
575 */ 560 */
576 smp_wmb(); 561 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); 562 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
578} 563}
579 564
580static void clear_work_data(struct work_struct *work) 565static void clear_work_data(struct work_struct *work)
581{ 566{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */ 567 smp_wmb(); /* see set_work_pool_and_clear_pending() */
583 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 568 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
584} 569}
585 570
586static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) 571static struct pool_workqueue *get_work_pwq(struct work_struct *work)
587{ 572{
588 unsigned long data = atomic_long_read(&work->data); 573 unsigned long data = atomic_long_read(&work->data);
589 574
590 if (data & WORK_STRUCT_CWQ) 575 if (data & WORK_STRUCT_PWQ)
591 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); 576 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
592 else 577 else
593 return NULL; 578 return NULL;
594} 579}
595 580
596static struct global_cwq *get_work_gcwq(struct work_struct *work) 581/**
582 * get_work_pool - return the worker_pool a given work was associated with
583 * @work: the work item of interest
584 *
585 * Return the worker_pool @work was last associated with. %NULL if none.
586 */
587static struct worker_pool *get_work_pool(struct work_struct *work)
597{ 588{
598 unsigned long data = atomic_long_read(&work->data); 589 unsigned long data = atomic_long_read(&work->data);
599 unsigned int cpu; 590 struct worker_pool *pool;
591 int pool_id;
600 592
601 if (data & WORK_STRUCT_CWQ) 593 if (data & WORK_STRUCT_PWQ)
602 return ((struct cpu_workqueue_struct *) 594 return ((struct pool_workqueue *)
603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 595 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
604 596
605 cpu = data >> WORK_OFFQ_CPU_SHIFT; 597 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
606 if (cpu == WORK_CPU_NONE) 598 if (pool_id == WORK_OFFQ_POOL_NONE)
607 return NULL; 599 return NULL;
608 600
609 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); 601 pool = worker_pool_by_id(pool_id);
610 return get_gcwq(cpu); 602 WARN_ON_ONCE(!pool);
603 return pool;
604}
605
606/**
607 * get_work_pool_id - return the worker pool ID a given work is associated with
608 * @work: the work item of interest
609 *
610 * Return the worker_pool ID @work was last associated with.
611 * %WORK_OFFQ_POOL_NONE if none.
612 */
613static int get_work_pool_id(struct work_struct *work)
614{
615 unsigned long data = atomic_long_read(&work->data);
616
617 if (data & WORK_STRUCT_PWQ)
618 return ((struct pool_workqueue *)
619 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
620
621 return data >> WORK_OFFQ_POOL_SHIFT;
611} 622}
612 623
613static void mark_work_canceling(struct work_struct *work) 624static void mark_work_canceling(struct work_struct *work)
614{ 625{
615 struct global_cwq *gcwq = get_work_gcwq(work); 626 unsigned long pool_id = get_work_pool_id(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617 627
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, 628 pool_id <<= WORK_OFFQ_POOL_SHIFT;
619 WORK_STRUCT_PENDING); 629 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
620} 630}
621 631
622static bool work_is_canceling(struct work_struct *work) 632static bool work_is_canceling(struct work_struct *work)
623{ 633{
624 unsigned long data = atomic_long_read(&work->data); 634 unsigned long data = atomic_long_read(&work->data);
625 635
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); 636 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
627} 637}
628 638
629/* 639/*
630 * Policy functions. These define the policies on how the global worker 640 * Policy functions. These define the policies on how the global worker
631 * pools are managed. Unless noted otherwise, these functions assume that 641 * pools are managed. Unless noted otherwise, these functions assume that
632 * they're being called with gcwq->lock held. 642 * they're being called with pool->lock held.
633 */ 643 */
634 644
635static bool __need_more_worker(struct worker_pool *pool) 645static bool __need_more_worker(struct worker_pool *pool)
636{ 646{
637 return !atomic_read(get_pool_nr_running(pool)); 647 return !atomic_read(&pool->nr_running);
638} 648}
639 649
640/* 650/*
@@ -642,7 +652,7 @@ static bool __need_more_worker(struct worker_pool *pool)
642 * running workers. 652 * running workers.
643 * 653 *
644 * Note that, because unbound workers never contribute to nr_running, this 654 * Note that, because unbound workers never contribute to nr_running, this
645 * function will always return %true for unbound gcwq as long as the 655 * function will always return %true for unbound pools as long as the
646 * worklist isn't empty. 656 * worklist isn't empty.
647 */ 657 */
648static bool need_more_worker(struct worker_pool *pool) 658static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +669,8 @@ static bool may_start_working(struct worker_pool *pool)
659/* Do I need to keep working? Called from currently running workers. */ 669/* Do I need to keep working? Called from currently running workers. */
660static bool keep_working(struct worker_pool *pool) 670static bool keep_working(struct worker_pool *pool)
661{ 671{
662 atomic_t *nr_running = get_pool_nr_running(pool); 672 return !list_empty(&pool->worklist) &&
663 673 atomic_read(&pool->nr_running) <= 1;
664 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
665} 674}
666 675
667/* Do we need a new worker? Called from manager. */ 676/* Do we need a new worker? Called from manager. */
@@ -714,7 +723,7 @@ static struct worker *first_worker(struct worker_pool *pool)
714 * Wake up the first idle worker of @pool. 723 * Wake up the first idle worker of @pool.
715 * 724 *
716 * CONTEXT: 725 * CONTEXT:
717 * spin_lock_irq(gcwq->lock). 726 * spin_lock_irq(pool->lock).
718 */ 727 */
719static void wake_up_worker(struct worker_pool *pool) 728static void wake_up_worker(struct worker_pool *pool)
720{ 729{
@@ -740,8 +749,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
740 struct worker *worker = kthread_data(task); 749 struct worker *worker = kthread_data(task);
741 750
742 if (!(worker->flags & WORKER_NOT_RUNNING)) { 751 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); 752 WARN_ON_ONCE(worker->pool->cpu != cpu);
744 atomic_inc(get_pool_nr_running(worker->pool)); 753 atomic_inc(&worker->pool->nr_running);
745 } 754 }
746} 755}
747 756
@@ -764,12 +773,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
764 unsigned int cpu) 773 unsigned int cpu)
765{ 774{
766 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 775 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
767 struct worker_pool *pool = worker->pool; 776 struct worker_pool *pool;
768 atomic_t *nr_running = get_pool_nr_running(pool);
769 777
778 /*
779 * Rescuers, which may not have all the fields set up like normal
780 * workers, also reach here, let's not access anything before
781 * checking NOT_RUNNING.
782 */
770 if (worker->flags & WORKER_NOT_RUNNING) 783 if (worker->flags & WORKER_NOT_RUNNING)
771 return NULL; 784 return NULL;
772 785
786 pool = worker->pool;
787
773 /* this can only happen on the local cpu */ 788 /* this can only happen on the local cpu */
774 BUG_ON(cpu != raw_smp_processor_id()); 789 BUG_ON(cpu != raw_smp_processor_id());
775 790
@@ -781,10 +796,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
781 * NOT_RUNNING is clear. This means that we're bound to and 796 * NOT_RUNNING is clear. This means that we're bound to and
782 * running on the local cpu w/ rq lock held and preemption 797 * running on the local cpu w/ rq lock held and preemption
783 * disabled, which in turn means that none else could be 798 * disabled, which in turn means that none else could be
784 * manipulating idle_list, so dereferencing idle_list without gcwq 799 * manipulating idle_list, so dereferencing idle_list without pool
785 * lock is safe. 800 * lock is safe.
786 */ 801 */
787 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) 802 if (atomic_dec_and_test(&pool->nr_running) &&
803 !list_empty(&pool->worklist))
788 to_wakeup = first_worker(pool); 804 to_wakeup = first_worker(pool);
789 return to_wakeup ? to_wakeup->task : NULL; 805 return to_wakeup ? to_wakeup->task : NULL;
790} 806}
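
wq_worker_sleeping()/wq_worker_waking_up() keep pool->nr_running as the pool's concurrency level: when the last running worker blocks while the worklist is still non-empty, the first idle worker is woken to take over. A minimal standalone model of that policy; real locking and the idle list are reduced to plain fields here, so this illustrates the accounting only.

#include <stdio.h>
#include <stdatomic.h>
#include <stdbool.h>

struct worker_pool {
	atomic_int nr_running;		/* concurrency level */
	int pending_work;		/* stand-in for pool->worklist */
	bool have_idle_worker;		/* stand-in for pool->idle_list */
};

static void wq_worker_waking_up(struct worker_pool *pool)
{
	atomic_fetch_add(&pool->nr_running, 1);
}

/* returns true if an idle worker should be woken to keep the pool busy */
static bool wq_worker_sleeping(struct worker_pool *pool)
{
	if (atomic_fetch_sub(&pool->nr_running, 1) == 1 &&
	    pool->pending_work > 0)
		return pool->have_idle_worker;
	return false;
}

int main(void)
{
	struct worker_pool pool = {
		.nr_running = 1, .pending_work = 2, .have_idle_worker = true,
	};

	/* the only running worker blocks while work is still queued ... */
	if (wq_worker_sleeping(&pool))
		printf("wake the first idle worker\n");

	/* ... and bumps the count again once it becomes runnable */
	wq_worker_waking_up(&pool);
	printf("nr_running = %d\n", atomic_load(&pool.nr_running));
	return 0;
}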
@@ -800,7 +816,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
800 * woken up. 816 * woken up.
801 * 817 *
802 * CONTEXT: 818 * CONTEXT:
803 * spin_lock_irq(gcwq->lock) 819 * spin_lock_irq(pool->lock)
804 */ 820 */
805static inline void worker_set_flags(struct worker *worker, unsigned int flags, 821static inline void worker_set_flags(struct worker *worker, unsigned int flags,
806 bool wakeup) 822 bool wakeup)
@@ -816,14 +832,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
816 */ 832 */
817 if ((flags & WORKER_NOT_RUNNING) && 833 if ((flags & WORKER_NOT_RUNNING) &&
818 !(worker->flags & WORKER_NOT_RUNNING)) { 834 !(worker->flags & WORKER_NOT_RUNNING)) {
819 atomic_t *nr_running = get_pool_nr_running(pool);
820
821 if (wakeup) { 835 if (wakeup) {
822 if (atomic_dec_and_test(nr_running) && 836 if (atomic_dec_and_test(&pool->nr_running) &&
823 !list_empty(&pool->worklist)) 837 !list_empty(&pool->worklist))
824 wake_up_worker(pool); 838 wake_up_worker(pool);
825 } else 839 } else
826 atomic_dec(nr_running); 840 atomic_dec(&pool->nr_running);
827 } 841 }
828 842
829 worker->flags |= flags; 843 worker->flags |= flags;
@@ -837,7 +851,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
837 * Clear @flags in @worker->flags and adjust nr_running accordingly. 851 * Clear @flags in @worker->flags and adjust nr_running accordingly.
838 * 852 *
839 * CONTEXT: 853 * CONTEXT:
840 * spin_lock_irq(gcwq->lock) 854 * spin_lock_irq(pool->lock)
841 */ 855 */
842static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 856static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
843{ 857{
@@ -855,87 +869,55 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
855 */ 869 */
856 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 870 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
857 if (!(worker->flags & WORKER_NOT_RUNNING)) 871 if (!(worker->flags & WORKER_NOT_RUNNING))
858 atomic_inc(get_pool_nr_running(pool)); 872 atomic_inc(&pool->nr_running);
859} 873}
860 874
861/** 875/**
862 * busy_worker_head - return the busy hash head for a work 876 * find_worker_executing_work - find worker which is executing a work
863 * @gcwq: gcwq of interest 877 * @pool: pool of interest
864 * @work: work to be hashed
865 *
866 * Return hash head of @gcwq for @work.
867 *
868 * CONTEXT:
869 * spin_lock_irq(gcwq->lock).
870 *
871 * RETURNS:
872 * Pointer to the hash head.
873 */
874static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
875 struct work_struct *work)
876{
877 const int base_shift = ilog2(sizeof(struct work_struct));
878 unsigned long v = (unsigned long)work;
879
880 /* simple shift and fold hash, do we need something better? */
881 v >>= base_shift;
882 v += v >> BUSY_WORKER_HASH_ORDER;
883 v &= BUSY_WORKER_HASH_MASK;
884
885 return &gcwq->busy_hash[v];
886}
887
888/**
889 * __find_worker_executing_work - find worker which is executing a work
890 * @gcwq: gcwq of interest
891 * @bwh: hash head as returned by busy_worker_head()
892 * @work: work to find worker for 878 * @work: work to find worker for
893 * 879 *
894 * Find a worker which is executing @work on @gcwq. @bwh should be 880 * Find a worker which is executing @work on @pool by searching
895 * the hash head obtained by calling busy_worker_head() with the same 881 * @pool->busy_hash which is keyed by the address of @work. For a worker
896 * work. 882 * to match, its current execution should match the address of @work and
883 * its work function. This is to avoid unwanted dependency between
884 * unrelated work executions through a work item being recycled while still
885 * being executed.
886 *
887 * This is a bit tricky. A work item may be freed once its execution
888 * starts and nothing prevents the freed area from being recycled for
889 * another work item. If the same work item address ends up being reused
890 * before the original execution finishes, workqueue will identify the
891 * recycled work item as currently executing and make it wait until the
892 * current execution finishes, introducing an unwanted dependency.
893 *
894 * This function checks the work item address, work function and workqueue
895 * to avoid false positives. Note that this isn't complete as one may
896 * construct a work function which can introduce dependency onto itself
897 * through a recycled work item. Well, if somebody wants to shoot oneself
898 * in the foot that badly, there's only so much we can do, and if such
899 * deadlock actually occurs, it should be easy to locate the culprit work
900 * function.
897 * 901 *
898 * CONTEXT: 902 * CONTEXT:
899 * spin_lock_irq(gcwq->lock). 903 * spin_lock_irq(pool->lock).
900 * 904 *
901 * RETURNS: 905 * RETURNS:
902 * Pointer to worker which is executing @work if found, NULL 906 * Pointer to worker which is executing @work if found, NULL
903 * otherwise. 907 * otherwise.
904 */ 908 */
905static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, 909static struct worker *find_worker_executing_work(struct worker_pool *pool,
906 struct hlist_head *bwh, 910 struct work_struct *work)
907 struct work_struct *work)
908{ 911{
909 struct worker *worker; 912 struct worker *worker;
910 struct hlist_node *tmp;
911 913
912 hlist_for_each_entry(worker, tmp, bwh, hentry) 914 hash_for_each_possible(pool->busy_hash, worker, hentry,
913 if (worker->current_work == work) 915 (unsigned long)work)
916 if (worker->current_work == work &&
917 worker->current_func == work->func)
914 return worker; 918 return worker;
915 return NULL;
916}
917 919
918/** 920 return NULL;
919 * find_worker_executing_work - find worker which is executing a work
920 * @gcwq: gcwq of interest
921 * @work: work to find worker for
922 *
923 * Find a worker which is executing @work on @gcwq. This function is
924 * identical to __find_worker_executing_work() except that this
925 * function calculates @bwh itself.
926 *
927 * CONTEXT:
928 * spin_lock_irq(gcwq->lock).
929 *
930 * RETURNS:
931 * Pointer to worker which is executing @work if found, NULL
932 * otherwise.
933 */
934static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
935 struct work_struct *work)
936{
937 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
938 work);
939} 921}
940 922
941/** 923/**
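
find_worker_executing_work() now looks workers up in a hashtable keyed by the work item's address and also requires the cached work function to match, so a recycled allocation at the same address is not mistaken for the earlier item. A small standalone sketch of that lookup, with a trivial open hash standing in for the kernel hashtable:

#include <stdio.h>
#include <stddef.h>

typedef void (*work_func_t)(void);

struct work_struct { work_func_t func; };

struct worker {
	struct work_struct *current_work;	/* item being executed */
	work_func_t current_func;		/* its function, cached at start */
	struct worker *hash_next;
};

#define HASH_SIZE 64
static struct worker *busy_hash[HASH_SIZE];

static unsigned int hash_work(struct work_struct *work)
{
	return ((unsigned long)work / sizeof(struct work_struct)) % HASH_SIZE;
}

static void mark_busy(struct worker *worker, struct work_struct *work)
{
	unsigned int b = hash_work(work);

	worker->current_work = work;
	worker->current_func = work->func;	/* cached: survives recycling */
	worker->hash_next = busy_hash[b];
	busy_hash[b] = worker;
}

static struct worker *find_worker_executing_work(struct work_struct *work)
{
	for (struct worker *w = busy_hash[hash_work(work)]; w; w = w->hash_next)
		if (w->current_work == work && w->current_func == work->func)
			return w;
	return NULL;
}

static void fn_a(void) { }
static void fn_b(void) { }

int main(void)
{
	struct work_struct work = { .func = fn_a };
	struct worker worker = { 0 };

	mark_busy(&worker, &work);
	printf("same item:     %p\n", (void *)find_worker_executing_work(&work));

	/* "recycle" the same address for an unrelated item: no false match */
	work.func = fn_b;
	printf("recycled item: %p\n", (void *)find_worker_executing_work(&work));
	return 0;
}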
@@ -953,7 +935,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
953 * nested inside outer list_for_each_entry_safe(). 935 * nested inside outer list_for_each_entry_safe().
954 * 936 *
955 * CONTEXT: 937 * CONTEXT:
956 * spin_lock_irq(gcwq->lock). 938 * spin_lock_irq(pool->lock).
957 */ 939 */
958static void move_linked_works(struct work_struct *work, struct list_head *head, 940static void move_linked_works(struct work_struct *work, struct list_head *head,
959 struct work_struct **nextp) 941 struct work_struct **nextp)
@@ -979,67 +961,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
979 *nextp = n; 961 *nextp = n;
980} 962}
981 963
982static void cwq_activate_delayed_work(struct work_struct *work) 964static void pwq_activate_delayed_work(struct work_struct *work)
983{ 965{
984 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 966 struct pool_workqueue *pwq = get_work_pwq(work);
985 967
986 trace_workqueue_activate_work(work); 968 trace_workqueue_activate_work(work);
987 move_linked_works(work, &cwq->pool->worklist, NULL); 969 move_linked_works(work, &pwq->pool->worklist, NULL);
988 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 970 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
989 cwq->nr_active++; 971 pwq->nr_active++;
990} 972}
991 973
992static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) 974static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
993{ 975{
994 struct work_struct *work = list_first_entry(&cwq->delayed_works, 976 struct work_struct *work = list_first_entry(&pwq->delayed_works,
995 struct work_struct, entry); 977 struct work_struct, entry);
996 978
997 cwq_activate_delayed_work(work); 979 pwq_activate_delayed_work(work);
998} 980}
999 981
1000/** 982/**
1001 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight 983 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1002 * @cwq: cwq of interest 984 * @pwq: pwq of interest
1003 * @color: color of work which left the queue 985 * @color: color of work which left the queue
1004 * 986 *
1005 * A work either has completed or is removed from pending queue, 987 * A work either has completed or is removed from pending queue,
1006 * decrement nr_in_flight of its cwq and handle workqueue flushing. 988 * decrement nr_in_flight of its pwq and handle workqueue flushing.
1007 * 989 *
1008 * CONTEXT: 990 * CONTEXT:
1009 * spin_lock_irq(gcwq->lock). 991 * spin_lock_irq(pool->lock).
1010 */ 992 */
1011static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) 993static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1012{ 994{
1013 /* ignore uncolored works */ 995 /* ignore uncolored works */
1014 if (color == WORK_NO_COLOR) 996 if (color == WORK_NO_COLOR)
1015 return; 997 return;
1016 998
1017 cwq->nr_in_flight[color]--; 999 pwq->nr_in_flight[color]--;
1018 1000
1019 cwq->nr_active--; 1001 pwq->nr_active--;
1020 if (!list_empty(&cwq->delayed_works)) { 1002 if (!list_empty(&pwq->delayed_works)) {
1021 /* one down, submit a delayed one */ 1003 /* one down, submit a delayed one */
1022 if (cwq->nr_active < cwq->max_active) 1004 if (pwq->nr_active < pwq->max_active)
1023 cwq_activate_first_delayed(cwq); 1005 pwq_activate_first_delayed(pwq);
1024 } 1006 }
1025 1007
1026 /* is flush in progress and are we at the flushing tip? */ 1008 /* is flush in progress and are we at the flushing tip? */
1027 if (likely(cwq->flush_color != color)) 1009 if (likely(pwq->flush_color != color))
1028 return; 1010 return;
1029 1011
1030 /* are there still in-flight works? */ 1012 /* are there still in-flight works? */
1031 if (cwq->nr_in_flight[color]) 1013 if (pwq->nr_in_flight[color])
1032 return; 1014 return;
1033 1015
1034 /* this cwq is done, clear flush_color */ 1016 /* this pwq is done, clear flush_color */
1035 cwq->flush_color = -1; 1017 pwq->flush_color = -1;
1036 1018
1037 /* 1019 /*
1038 * If this was the last cwq, wake up the first flusher. It 1020 * If this was the last pwq, wake up the first flusher. It
1039 * will handle the rest. 1021 * will handle the rest.
1040 */ 1022 */
1041 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) 1023 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1042 complete(&cwq->wq->first_flusher->done); 1024 complete(&pwq->wq->first_flusher->done);
1043} 1025}
1044 1026
1045/** 1027/**
@@ -1070,7 +1052,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1070static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 1052static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1071 unsigned long *flags) 1053 unsigned long *flags)
1072{ 1054{
1073 struct global_cwq *gcwq; 1055 struct worker_pool *pool;
1056 struct pool_workqueue *pwq;
1074 1057
1075 local_irq_save(*flags); 1058 local_irq_save(*flags);
1076 1059
@@ -1095,41 +1078,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1095 * The queueing is in progress, or it is already queued. Try to 1078 * The queueing is in progress, or it is already queued. Try to
1096 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 1079 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1097 */ 1080 */
1098 gcwq = get_work_gcwq(work); 1081 pool = get_work_pool(work);
1099 if (!gcwq) 1082 if (!pool)
1100 goto fail; 1083 goto fail;
1101 1084
1102 spin_lock(&gcwq->lock); 1085 spin_lock(&pool->lock);
1103 if (!list_empty(&work->entry)) { 1086 /*
1087 * work->data is guaranteed to point to pwq only while the work
1088 * item is queued on pwq->wq, and both updating work->data to point
1089 * to pwq on queueing and to pool on dequeueing are done under
1090 * pwq->pool->lock. This in turn guarantees that, if work->data
1091 * points to pwq which is associated with a locked pool, the work
1092 * item is currently queued on that pool.
1093 */
1094 pwq = get_work_pwq(work);
1095 if (pwq && pwq->pool == pool) {
1096 debug_work_deactivate(work);
1097
1104 /* 1098 /*
1105 * This work is queued, but perhaps we locked the wrong gcwq. 1099 * A delayed work item cannot be grabbed directly because
1106 * In that case we must see the new value after rmb(), see 1100 * it might have linked NO_COLOR work items which, if left
1107 * insert_work()->wmb(). 1101 * on the delayed_list, will confuse pwq->nr_active
1102 * management later on and cause stall. Make sure the work
1103 * item is activated before grabbing.
1108 */ 1104 */
1109 smp_rmb(); 1105 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1110 if (gcwq == get_work_gcwq(work)) { 1106 pwq_activate_delayed_work(work);
1111 debug_work_deactivate(work);
1112 1107
1113 /* 1108 list_del_init(&work->entry);
1114 * A delayed work item cannot be grabbed directly 1109 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
1115 * because it might have linked NO_COLOR work items
1116 * which, if left on the delayed_list, will confuse
1117 * cwq->nr_active management later on and cause
1118 * stall. Make sure the work item is activated
1119 * before grabbing.
1120 */
1121 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1122 cwq_activate_delayed_work(work);
1123 1110
1124 list_del_init(&work->entry); 1111 /* work->data points to pwq iff queued, point to pool */
1125 cwq_dec_nr_in_flight(get_work_cwq(work), 1112 set_work_pool_and_keep_pending(work, pool->id);
1126 get_work_color(work));
1127 1113
1128 spin_unlock(&gcwq->lock); 1114 spin_unlock(&pool->lock);
1129 return 1; 1115 return 1;
1130 }
1131 } 1116 }
1132 spin_unlock(&gcwq->lock); 1117 spin_unlock(&pool->lock);
1133fail: 1118fail:
1134 local_irq_restore(*flags); 1119 local_irq_restore(*flags);
1135 if (work_is_canceling(work)) 1120 if (work_is_canceling(work))
@@ -1139,33 +1124,25 @@ fail:
1139} 1124}
1140 1125
1141/** 1126/**
1142 * insert_work - insert a work into gcwq 1127 * insert_work - insert a work into a pool
1143 * @cwq: cwq @work belongs to 1128 * @pwq: pwq @work belongs to
1144 * @work: work to insert 1129 * @work: work to insert
1145 * @head: insertion point 1130 * @head: insertion point
1146 * @extra_flags: extra WORK_STRUCT_* flags to set 1131 * @extra_flags: extra WORK_STRUCT_* flags to set
1147 * 1132 *
1148 * Insert @work which belongs to @cwq into @gcwq after @head. 1133 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
1149 * @extra_flags is or'd to work_struct flags. 1134 * work_struct flags.
1150 * 1135 *
1151 * CONTEXT: 1136 * CONTEXT:
1152 * spin_lock_irq(gcwq->lock). 1137 * spin_lock_irq(pool->lock).
1153 */ 1138 */
1154static void insert_work(struct cpu_workqueue_struct *cwq, 1139static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1155 struct work_struct *work, struct list_head *head, 1140 struct list_head *head, unsigned int extra_flags)
1156 unsigned int extra_flags)
1157{ 1141{
1158 struct worker_pool *pool = cwq->pool; 1142 struct worker_pool *pool = pwq->pool;
1159 1143
1160 /* we own @work, set data and link */ 1144 /* we own @work, set data and link */
1161 set_work_cwq(work, cwq, extra_flags); 1145 set_work_pwq(work, pwq, extra_flags);
1162
1163 /*
1164 * Ensure that we get the right work->data if we see the
1165 * result of list_add() below, see try_to_grab_pending().
1166 */
1167 smp_wmb();
1168
1169 list_add_tail(&work->entry, head); 1146 list_add_tail(&work->entry, head);
1170 1147
1171 /* 1148 /*
@@ -1181,41 +1158,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1181 1158
1182/* 1159/*
1183 * Test whether @work is being queued from another work executing on the 1160 * Test whether @work is being queued from another work executing on the
1184 * same workqueue. This is rather expensive and should only be used from 1161 * same workqueue.
1185 * cold paths.
1186 */ 1162 */
1187static bool is_chained_work(struct workqueue_struct *wq) 1163static bool is_chained_work(struct workqueue_struct *wq)
1188{ 1164{
1189 unsigned long flags; 1165 struct worker *worker;
1190 unsigned int cpu;
1191
1192 for_each_gcwq_cpu(cpu) {
1193 struct global_cwq *gcwq = get_gcwq(cpu);
1194 struct worker *worker;
1195 struct hlist_node *pos;
1196 int i;
1197 1166
1198 spin_lock_irqsave(&gcwq->lock, flags); 1167 worker = current_wq_worker();
1199 for_each_busy_worker(worker, i, pos, gcwq) { 1168 /*
 1200 if (worker->task != current) 1169 * Return %true iff I'm a worker executing a work item on @wq. If
1201 continue; 1170 * I'm @worker, it's safe to dereference it without locking.
1202 spin_unlock_irqrestore(&gcwq->lock, flags); 1171 */
1203 /* 1172 return worker && worker->current_pwq->wq == wq;
1204 * I'm @worker, no locking necessary. See if @work
1205 * is headed to the same workqueue.
1206 */
1207 return worker->current_cwq->wq == wq;
1208 }
1209 spin_unlock_irqrestore(&gcwq->lock, flags);
1210 }
1211 return false;
1212} 1173}
1213 1174
1214static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1175static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1215 struct work_struct *work) 1176 struct work_struct *work)
1216{ 1177{
1217 struct global_cwq *gcwq; 1178 struct pool_workqueue *pwq;
1218 struct cpu_workqueue_struct *cwq;
1219 struct list_head *worklist; 1179 struct list_head *worklist;
1220 unsigned int work_flags; 1180 unsigned int work_flags;
1221 unsigned int req_cpu = cpu; 1181 unsigned int req_cpu = cpu;
@@ -1235,9 +1195,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1235 WARN_ON_ONCE(!is_chained_work(wq))) 1195 WARN_ON_ONCE(!is_chained_work(wq)))
1236 return; 1196 return;
1237 1197
1238 /* determine gcwq to use */ 1198 /* determine the pwq to use */
1239 if (!(wq->flags & WQ_UNBOUND)) { 1199 if (!(wq->flags & WQ_UNBOUND)) {
1240 struct global_cwq *last_gcwq; 1200 struct worker_pool *last_pool;
1241 1201
1242 if (cpu == WORK_CPU_UNBOUND) 1202 if (cpu == WORK_CPU_UNBOUND)
1243 cpu = raw_smp_processor_id(); 1203 cpu = raw_smp_processor_id();
@@ -1248,55 +1208,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1248 * work needs to be queued on that cpu to guarantee 1208 * work needs to be queued on that cpu to guarantee
1249 * non-reentrancy. 1209 * non-reentrancy.
1250 */ 1210 */
1251 gcwq = get_gcwq(cpu); 1211 pwq = get_pwq(cpu, wq);
1252 last_gcwq = get_work_gcwq(work); 1212 last_pool = get_work_pool(work);
1253 1213
1254 if (last_gcwq && last_gcwq != gcwq) { 1214 if (last_pool && last_pool != pwq->pool) {
1255 struct worker *worker; 1215 struct worker *worker;
1256 1216
1257 spin_lock(&last_gcwq->lock); 1217 spin_lock(&last_pool->lock);
1258 1218
1259 worker = find_worker_executing_work(last_gcwq, work); 1219 worker = find_worker_executing_work(last_pool, work);
1260 1220
1261 if (worker && worker->current_cwq->wq == wq) 1221 if (worker && worker->current_pwq->wq == wq) {
1262 gcwq = last_gcwq; 1222 pwq = get_pwq(last_pool->cpu, wq);
1263 else { 1223 } else {
1264 /* meh... not running there, queue here */ 1224 /* meh... not running there, queue here */
1265 spin_unlock(&last_gcwq->lock); 1225 spin_unlock(&last_pool->lock);
1266 spin_lock(&gcwq->lock); 1226 spin_lock(&pwq->pool->lock);
1267 } 1227 }
1268 } else { 1228 } else {
1269 spin_lock(&gcwq->lock); 1229 spin_lock(&pwq->pool->lock);
1270 } 1230 }
1271 } else { 1231 } else {
1272 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1232 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1273 spin_lock(&gcwq->lock); 1233 spin_lock(&pwq->pool->lock);
1274 } 1234 }
1275 1235
1276 /* gcwq determined, get cwq and queue */ 1236 /* pwq determined, queue */
1277 cwq = get_cwq(gcwq->cpu, wq); 1237 trace_workqueue_queue_work(req_cpu, pwq, work);
1278 trace_workqueue_queue_work(req_cpu, cwq, work);
1279 1238
1280 if (WARN_ON(!list_empty(&work->entry))) { 1239 if (WARN_ON(!list_empty(&work->entry))) {
1281 spin_unlock(&gcwq->lock); 1240 spin_unlock(&pwq->pool->lock);
1282 return; 1241 return;
1283 } 1242 }
1284 1243
1285 cwq->nr_in_flight[cwq->work_color]++; 1244 pwq->nr_in_flight[pwq->work_color]++;
1286 work_flags = work_color_to_flags(cwq->work_color); 1245 work_flags = work_color_to_flags(pwq->work_color);
1287 1246
1288 if (likely(cwq->nr_active < cwq->max_active)) { 1247 if (likely(pwq->nr_active < pwq->max_active)) {
1289 trace_workqueue_activate_work(work); 1248 trace_workqueue_activate_work(work);
1290 cwq->nr_active++; 1249 pwq->nr_active++;
1291 worklist = &cwq->pool->worklist; 1250 worklist = &pwq->pool->worklist;
1292 } else { 1251 } else {
1293 work_flags |= WORK_STRUCT_DELAYED; 1252 work_flags |= WORK_STRUCT_DELAYED;
1294 worklist = &cwq->delayed_works; 1253 worklist = &pwq->delayed_works;
1295 } 1254 }
1296 1255
1297 insert_work(cwq, work, worklist, work_flags); 1256 insert_work(pwq, work, worklist, work_flags);
1298 1257
1299 spin_unlock(&gcwq->lock); 1258 spin_unlock(&pwq->pool->lock);
1300} 1259}
1301 1260
1302/** 1261/**
@@ -1347,19 +1306,17 @@ EXPORT_SYMBOL_GPL(queue_work);
1347void delayed_work_timer_fn(unsigned long __data) 1306void delayed_work_timer_fn(unsigned long __data)
1348{ 1307{
1349 struct delayed_work *dwork = (struct delayed_work *)__data; 1308 struct delayed_work *dwork = (struct delayed_work *)__data;
1350 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1351 1309
1352 /* should have been called from irqsafe timer with irq already off */ 1310 /* should have been called from irqsafe timer with irq already off */
1353 __queue_work(dwork->cpu, cwq->wq, &dwork->work); 1311 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1354} 1312}
1355EXPORT_SYMBOL_GPL(delayed_work_timer_fn); 1313EXPORT_SYMBOL(delayed_work_timer_fn);
1356 1314
1357static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 1315static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1358 struct delayed_work *dwork, unsigned long delay) 1316 struct delayed_work *dwork, unsigned long delay)
1359{ 1317{
1360 struct timer_list *timer = &dwork->timer; 1318 struct timer_list *timer = &dwork->timer;
1361 struct work_struct *work = &dwork->work; 1319 struct work_struct *work = &dwork->work;
1362 unsigned int lcpu;
1363 1320
1364 WARN_ON_ONCE(timer->function != delayed_work_timer_fn || 1321 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1365 timer->data != (unsigned long)dwork); 1322 timer->data != (unsigned long)dwork);
@@ -1379,30 +1336,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1379 1336
1380 timer_stats_timer_set_start_info(&dwork->timer); 1337 timer_stats_timer_set_start_info(&dwork->timer);
1381 1338
1382 /* 1339 dwork->wq = wq;
1383 * This stores cwq for the moment, for the timer_fn. Note that the
1384 * work's gcwq is preserved to allow reentrance detection for
1385 * delayed works.
1386 */
1387 if (!(wq->flags & WQ_UNBOUND)) {
1388 struct global_cwq *gcwq = get_work_gcwq(work);
1389
1390 /*
1391 * If we cannot get the last gcwq from @work directly,
1392 * select the last CPU such that it avoids unnecessarily
1393 * triggering non-reentrancy check in __queue_work().
1394 */
1395 lcpu = cpu;
1396 if (gcwq)
1397 lcpu = gcwq->cpu;
1398 if (lcpu == WORK_CPU_UNBOUND)
1399 lcpu = raw_smp_processor_id();
1400 } else {
1401 lcpu = WORK_CPU_UNBOUND;
1402 }
1403
1404 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1405
1406 dwork->cpu = cpu; 1340 dwork->cpu = cpu;
1407 timer->expires = jiffies + delay; 1341 timer->expires = jiffies + delay;
1408 1342
@@ -1519,12 +1453,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
1519 * necessary. 1453 * necessary.
1520 * 1454 *
1521 * LOCKING: 1455 * LOCKING:
1522 * spin_lock_irq(gcwq->lock). 1456 * spin_lock_irq(pool->lock).
1523 */ 1457 */
1524static void worker_enter_idle(struct worker *worker) 1458static void worker_enter_idle(struct worker *worker)
1525{ 1459{
1526 struct worker_pool *pool = worker->pool; 1460 struct worker_pool *pool = worker->pool;
1527 struct global_cwq *gcwq = pool->gcwq;
1528 1461
1529 BUG_ON(worker->flags & WORKER_IDLE); 1462 BUG_ON(worker->flags & WORKER_IDLE);
1530 BUG_ON(!list_empty(&worker->entry) && 1463 BUG_ON(!list_empty(&worker->entry) &&
@@ -1542,14 +1475,14 @@ static void worker_enter_idle(struct worker *worker)
1542 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1475 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1543 1476
1544 /* 1477 /*
1545 * Sanity check nr_running. Because gcwq_unbind_fn() releases 1478 * Sanity check nr_running. Because wq_unbind_fn() releases
1546 * gcwq->lock between setting %WORKER_UNBOUND and zapping 1479 * pool->lock between setting %WORKER_UNBOUND and zapping
1547 * nr_running, the warning may trigger spuriously. Check iff 1480 * nr_running, the warning may trigger spuriously. Check iff
1548 * unbind is not in progress. 1481 * unbind is not in progress.
1549 */ 1482 */
1550 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && 1483 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1551 pool->nr_workers == pool->nr_idle && 1484 pool->nr_workers == pool->nr_idle &&
1552 atomic_read(get_pool_nr_running(pool))); 1485 atomic_read(&pool->nr_running));
1553} 1486}
1554 1487
1555/** 1488/**
@@ -1559,7 +1492,7 @@ static void worker_enter_idle(struct worker *worker)
1559 * @worker is leaving idle state. Update stats. 1492 * @worker is leaving idle state. Update stats.
1560 * 1493 *
1561 * LOCKING: 1494 * LOCKING:
1562 * spin_lock_irq(gcwq->lock). 1495 * spin_lock_irq(pool->lock).
1563 */ 1496 */
1564static void worker_leave_idle(struct worker *worker) 1497static void worker_leave_idle(struct worker *worker)
1565{ 1498{
@@ -1572,7 +1505,7 @@ static void worker_leave_idle(struct worker *worker)
1572} 1505}
1573 1506
1574/** 1507/**
1575 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq 1508 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
1576 * @worker: self 1509 * @worker: self
1577 * 1510 *
1578 * Works which are scheduled while the cpu is online must at least be 1511 * Works which are scheduled while the cpu is online must at least be
@@ -1584,27 +1517,27 @@ static void worker_leave_idle(struct worker *worker)
1584 * themselves to the target cpu and may race with cpu going down or 1517 * themselves to the target cpu and may race with cpu going down or
1585 * coming online. kthread_bind() can't be used because it may put the 1518 * coming online. kthread_bind() can't be used because it may put the
1586 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1519 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1587 * verbatim as it's best effort and blocking and gcwq may be 1520 * verbatim as it's best effort and blocking and pool may be
1588 * [dis]associated in the meantime. 1521 * [dis]associated in the meantime.
1589 * 1522 *
1590 * This function tries set_cpus_allowed() and locks gcwq and verifies the 1523 * This function tries set_cpus_allowed() and locks pool and verifies the
1591 * binding against %GCWQ_DISASSOCIATED which is set during 1524 * binding against %POOL_DISASSOCIATED which is set during
1592 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker 1525 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1593 * enters idle state or fetches works without dropping lock, it can 1526 * enters idle state or fetches works without dropping lock, it can
1594 * guarantee the scheduling requirement described in the first paragraph. 1527 * guarantee the scheduling requirement described in the first paragraph.
1595 * 1528 *
1596 * CONTEXT: 1529 * CONTEXT:
1597 * Might sleep. Called without any lock but returns with gcwq->lock 1530 * Might sleep. Called without any lock but returns with pool->lock
1598 * held. 1531 * held.
1599 * 1532 *
1600 * RETURNS: 1533 * RETURNS:
1601 * %true if the associated gcwq is online (@worker is successfully 1534 * %true if the associated pool is online (@worker is successfully
1602 * bound), %false if offline. 1535 * bound), %false if offline.
1603 */ 1536 */
1604static bool worker_maybe_bind_and_lock(struct worker *worker) 1537static bool worker_maybe_bind_and_lock(struct worker *worker)
1605__acquires(&gcwq->lock) 1538__acquires(&pool->lock)
1606{ 1539{
1607 struct global_cwq *gcwq = worker->pool->gcwq; 1540 struct worker_pool *pool = worker->pool;
1608 struct task_struct *task = worker->task; 1541 struct task_struct *task = worker->task;
1609 1542
1610 while (true) { 1543 while (true) {
@@ -1612,19 +1545,19 @@ __acquires(&gcwq->lock)
1612 * The following call may fail, succeed or succeed 1545 * The following call may fail, succeed or succeed
1613 * without actually migrating the task to the cpu if 1546 * without actually migrating the task to the cpu if
1614 * it races with cpu hotunplug operation. Verify 1547 * it races with cpu hotunplug operation. Verify
1615 * against GCWQ_DISASSOCIATED. 1548 * against POOL_DISASSOCIATED.
1616 */ 1549 */
1617 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) 1550 if (!(pool->flags & POOL_DISASSOCIATED))
1618 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); 1551 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
1619 1552
1620 spin_lock_irq(&gcwq->lock); 1553 spin_lock_irq(&pool->lock);
1621 if (gcwq->flags & GCWQ_DISASSOCIATED) 1554 if (pool->flags & POOL_DISASSOCIATED)
1622 return false; 1555 return false;
1623 if (task_cpu(task) == gcwq->cpu && 1556 if (task_cpu(task) == pool->cpu &&
1624 cpumask_equal(&current->cpus_allowed, 1557 cpumask_equal(&current->cpus_allowed,
1625 get_cpu_mask(gcwq->cpu))) 1558 get_cpu_mask(pool->cpu)))
1626 return true; 1559 return true;
1627 spin_unlock_irq(&gcwq->lock); 1560 spin_unlock_irq(&pool->lock);
1628 1561
1629 /* 1562 /*
1630 * We've raced with CPU hot[un]plug. Give it a breather 1563 * We've raced with CPU hot[un]plug. Give it a breather
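Reviewer note on the retry loop above: worker_maybe_bind_and_lock() follows an "act, lock, re-verify, retry" shape, since set_cpus_allowed_ptr() is only best effort against concurrent hotplug and the binding can only be trusted once pool->lock is held and POOL_DISASSOCIATED is seen clear. A minimal userspace sketch of the same shape, with a pthread mutex and a plain flag standing in for pool->lock and POOL_DISASSOCIATED (all names below are invented for illustration; none of this is the kernel API):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <unistd.h>

struct toy_pool {
        pthread_mutex_t lock;          /* stands in for pool->lock */
        bool disassociated;            /* stands in for POOL_DISASSOCIATED */
        int cpu;
};

/* Returns true with the lock held if the binding is valid; false with the
 * lock still held if the pool is disassociated (mirroring the kernel helper). */
static bool toy_maybe_bind_and_lock(struct toy_pool *pool)
{
        for (;;) {
                /* unlocked peek, re-checked under the lock below */
                if (!pool->disassociated) {
                        cpu_set_t set;

                        CPU_ZERO(&set);
                        CPU_SET(pool->cpu, &set);
                        /* best effort: may race with "hotplug" flipping the flag */
                        sched_setaffinity(0, sizeof(set), &set);
                }

                pthread_mutex_lock(&pool->lock);
                if (pool->disassociated)
                        return false;                  /* offline, lock still held */
                if (sched_getcpu() == pool->cpu)
                        return true;                   /* bound and locked */
                pthread_mutex_unlock(&pool->lock);

                usleep(1000);                          /* breather, then retry */
        }
}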
@@ -1643,15 +1576,13 @@ __acquires(&gcwq->lock)
1643 */ 1576 */
1644static void idle_worker_rebind(struct worker *worker) 1577static void idle_worker_rebind(struct worker *worker)
1645{ 1578{
1646 struct global_cwq *gcwq = worker->pool->gcwq;
1647
1648 /* CPU may go down again in between, clear UNBOUND only on success */ 1579 /* CPU may go down again in between, clear UNBOUND only on success */
1649 if (worker_maybe_bind_and_lock(worker)) 1580 if (worker_maybe_bind_and_lock(worker))
1650 worker_clr_flags(worker, WORKER_UNBOUND); 1581 worker_clr_flags(worker, WORKER_UNBOUND);
1651 1582
1652 /* rebind complete, become available again */ 1583 /* rebind complete, become available again */
1653 list_add(&worker->entry, &worker->pool->idle_list); 1584 list_add(&worker->entry, &worker->pool->idle_list);
1654 spin_unlock_irq(&gcwq->lock); 1585 spin_unlock_irq(&worker->pool->lock);
1655} 1586}
1656 1587
1657/* 1588/*
@@ -1663,19 +1594,18 @@ static void idle_worker_rebind(struct worker *worker)
1663static void busy_worker_rebind_fn(struct work_struct *work) 1594static void busy_worker_rebind_fn(struct work_struct *work)
1664{ 1595{
1665 struct worker *worker = container_of(work, struct worker, rebind_work); 1596 struct worker *worker = container_of(work, struct worker, rebind_work);
1666 struct global_cwq *gcwq = worker->pool->gcwq;
1667 1597
1668 if (worker_maybe_bind_and_lock(worker)) 1598 if (worker_maybe_bind_and_lock(worker))
1669 worker_clr_flags(worker, WORKER_UNBOUND); 1599 worker_clr_flags(worker, WORKER_UNBOUND);
1670 1600
1671 spin_unlock_irq(&gcwq->lock); 1601 spin_unlock_irq(&worker->pool->lock);
1672} 1602}
1673 1603
1674/** 1604/**
1675 * rebind_workers - rebind all workers of a gcwq to the associated CPU 1605 * rebind_workers - rebind all workers of a pool to the associated CPU
1676 * @gcwq: gcwq of interest 1606 * @pool: pool of interest
1677 * 1607 *
1678 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1608 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1679 * is different for idle and busy ones. 1609 * is different for idle and busy ones.
1680 * 1610 *
1681 * Idle ones will be removed from the idle_list and woken up. They will 1611 * Idle ones will be removed from the idle_list and woken up. They will
@@ -1693,38 +1623,31 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1693 * including the manager will not appear on @idle_list until rebind is 1623 * including the manager will not appear on @idle_list until rebind is
1694 * complete, making local wake-ups safe. 1624 * complete, making local wake-ups safe.
1695 */ 1625 */
1696static void rebind_workers(struct global_cwq *gcwq) 1626static void rebind_workers(struct worker_pool *pool)
1697{ 1627{
1698 struct worker_pool *pool;
1699 struct worker *worker, *n; 1628 struct worker *worker, *n;
1700 struct hlist_node *pos;
1701 int i; 1629 int i;
1702 1630
1703 lockdep_assert_held(&gcwq->lock); 1631 lockdep_assert_held(&pool->assoc_mutex);
1704 1632 lockdep_assert_held(&pool->lock);
1705 for_each_worker_pool(pool, gcwq)
1706 lockdep_assert_held(&pool->assoc_mutex);
1707 1633
1708 /* dequeue and kick idle ones */ 1634 /* dequeue and kick idle ones */
1709 for_each_worker_pool(pool, gcwq) { 1635 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1710 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { 1636 /*
1711 /* 1637 * idle workers should be off @pool->idle_list until rebind
1712 * idle workers should be off @pool->idle_list 1638 * is complete to avoid receiving premature local wake-ups.
1713 * until rebind is complete to avoid receiving 1639 */
1714 * premature local wake-ups. 1640 list_del_init(&worker->entry);
1715 */
1716 list_del_init(&worker->entry);
1717 1641
1718 /* 1642 /*
1719 * worker_thread() will see the above dequeuing 1643 * worker_thread() will see the above dequeuing and call
1720 * and call idle_worker_rebind(). 1644 * idle_worker_rebind().
1721 */ 1645 */
1722 wake_up_process(worker->task); 1646 wake_up_process(worker->task);
1723 }
1724 } 1647 }
1725 1648
1726 /* rebind busy workers */ 1649 /* rebind busy workers */
1727 for_each_busy_worker(worker, i, pos, gcwq) { 1650 for_each_busy_worker(worker, i, pool) {
1728 struct work_struct *rebind_work = &worker->rebind_work; 1651 struct work_struct *rebind_work = &worker->rebind_work;
1729 struct workqueue_struct *wq; 1652 struct workqueue_struct *wq;
1730 1653
@@ -1736,16 +1659,16 @@ static void rebind_workers(struct global_cwq *gcwq)
1736 1659
1737 /* 1660 /*
1738 * wq doesn't really matter but let's keep @worker->pool 1661 * wq doesn't really matter but let's keep @worker->pool
1739 * and @cwq->pool consistent for sanity. 1662 * and @pwq->pool consistent for sanity.
1740 */ 1663 */
1741 if (worker_pool_pri(worker->pool)) 1664 if (std_worker_pool_pri(worker->pool))
1742 wq = system_highpri_wq; 1665 wq = system_highpri_wq;
1743 else 1666 else
1744 wq = system_wq; 1667 wq = system_wq;
1745 1668
1746 insert_work(get_cwq(gcwq->cpu, wq), rebind_work, 1669 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1747 worker->scheduled.next, 1670 worker->scheduled.next,
1748 work_color_to_flags(WORK_NO_COLOR)); 1671 work_color_to_flags(WORK_NO_COLOR));
1749 } 1672 }
1750} 1673}
1751 1674
@@ -1780,19 +1703,18 @@ static struct worker *alloc_worker(void)
1780 */ 1703 */
1781static struct worker *create_worker(struct worker_pool *pool) 1704static struct worker *create_worker(struct worker_pool *pool)
1782{ 1705{
1783 struct global_cwq *gcwq = pool->gcwq; 1706 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1784 const char *pri = worker_pool_pri(pool) ? "H" : "";
1785 struct worker *worker = NULL; 1707 struct worker *worker = NULL;
1786 int id = -1; 1708 int id = -1;
1787 1709
1788 spin_lock_irq(&gcwq->lock); 1710 spin_lock_irq(&pool->lock);
1789 while (ida_get_new(&pool->worker_ida, &id)) { 1711 while (ida_get_new(&pool->worker_ida, &id)) {
1790 spin_unlock_irq(&gcwq->lock); 1712 spin_unlock_irq(&pool->lock);
1791 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1713 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1792 goto fail; 1714 goto fail;
1793 spin_lock_irq(&gcwq->lock); 1715 spin_lock_irq(&pool->lock);
1794 } 1716 }
1795 spin_unlock_irq(&gcwq->lock); 1717 spin_unlock_irq(&pool->lock);
1796 1718
1797 worker = alloc_worker(); 1719 worker = alloc_worker();
1798 if (!worker) 1720 if (!worker)
@@ -1801,30 +1723,30 @@ static struct worker *create_worker(struct worker_pool *pool)
1801 worker->pool = pool; 1723 worker->pool = pool;
1802 worker->id = id; 1724 worker->id = id;
1803 1725
1804 if (gcwq->cpu != WORK_CPU_UNBOUND) 1726 if (pool->cpu != WORK_CPU_UNBOUND)
1805 worker->task = kthread_create_on_node(worker_thread, 1727 worker->task = kthread_create_on_node(worker_thread,
1806 worker, cpu_to_node(gcwq->cpu), 1728 worker, cpu_to_node(pool->cpu),
1807 "kworker/%u:%d%s", gcwq->cpu, id, pri); 1729 "kworker/%u:%d%s", pool->cpu, id, pri);
1808 else 1730 else
1809 worker->task = kthread_create(worker_thread, worker, 1731 worker->task = kthread_create(worker_thread, worker,
1810 "kworker/u:%d%s", id, pri); 1732 "kworker/u:%d%s", id, pri);
1811 if (IS_ERR(worker->task)) 1733 if (IS_ERR(worker->task))
1812 goto fail; 1734 goto fail;
1813 1735
1814 if (worker_pool_pri(pool)) 1736 if (std_worker_pool_pri(pool))
1815 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1737 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1816 1738
1817 /* 1739 /*
1818 * Determine CPU binding of the new worker depending on 1740 * Determine CPU binding of the new worker depending on
1819 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the 1741 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
1820 * flag remains stable across this function. See the comments 1742 * flag remains stable across this function. See the comments
1821 * above the flag definition for details. 1743 * above the flag definition for details.
1822 * 1744 *
1823 * As an unbound worker may later become a regular one if CPU comes 1745 * As an unbound worker may later become a regular one if CPU comes
1824 * online, make sure every worker has %PF_THREAD_BOUND set. 1746 * online, make sure every worker has %PF_THREAD_BOUND set.
1825 */ 1747 */
1826 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { 1748 if (!(pool->flags & POOL_DISASSOCIATED)) {
1827 kthread_bind(worker->task, gcwq->cpu); 1749 kthread_bind(worker->task, pool->cpu);
1828 } else { 1750 } else {
1829 worker->task->flags |= PF_THREAD_BOUND; 1751 worker->task->flags |= PF_THREAD_BOUND;
1830 worker->flags |= WORKER_UNBOUND; 1752 worker->flags |= WORKER_UNBOUND;
@@ -1833,9 +1755,9 @@ static struct worker *create_worker(struct worker_pool *pool)
1833 return worker; 1755 return worker;
1834fail: 1756fail:
1835 if (id >= 0) { 1757 if (id >= 0) {
1836 spin_lock_irq(&gcwq->lock); 1758 spin_lock_irq(&pool->lock);
1837 ida_remove(&pool->worker_ida, id); 1759 ida_remove(&pool->worker_ida, id);
1838 spin_unlock_irq(&gcwq->lock); 1760 spin_unlock_irq(&pool->lock);
1839 } 1761 }
1840 kfree(worker); 1762 kfree(worker);
1841 return NULL; 1763 return NULL;
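Reviewer note on the id allocation above: ida_get_new() is attempted with pool->lock held, and whenever it needs memory the lock is dropped, ida_pre_get() preallocates with GFP_KERNEL, and the loop retries under the lock. The same "allocate outside the lock, retry inside it" shape in a self-contained sketch, with a toy allocator in place of the IDA (names and the cache mechanism are invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static int next_id;                     /* protected by pool_lock */

/* Fast path: only succeeds if the caller already preallocated @*cache. */
static bool toy_ida_get_new(int *id, void **cache)
{
        if (!*cache)
                return false;           /* would need to allocate: caller must drop the lock */
        *id = next_id++;
        free(*cache);                   /* the toy "consumes" the preallocated node */
        *cache = NULL;
        return true;
}

static int toy_alloc_worker_id(void)
{
        void *cache = NULL;             /* like ida_pre_get()'s cached node */
        int id;

        pthread_mutex_lock(&pool_lock);
        while (!toy_ida_get_new(&id, &cache)) {
                pthread_mutex_unlock(&pool_lock);
                cache = malloc(64);     /* the blocking allocation happens unlocked */
                if (!cache)
                        return -1;
                pthread_mutex_lock(&pool_lock);
        }
        pthread_mutex_unlock(&pool_lock);
        return id;
}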
@@ -1845,10 +1767,10 @@ fail:
1845 * start_worker - start a newly created worker 1767 * start_worker - start a newly created worker
1846 * @worker: worker to start 1768 * @worker: worker to start
1847 * 1769 *
1848 * Make the gcwq aware of @worker and start it. 1770 * Make the pool aware of @worker and start it.
1849 * 1771 *
1850 * CONTEXT: 1772 * CONTEXT:
1851 * spin_lock_irq(gcwq->lock). 1773 * spin_lock_irq(pool->lock).
1852 */ 1774 */
1853static void start_worker(struct worker *worker) 1775static void start_worker(struct worker *worker)
1854{ 1776{
@@ -1862,15 +1784,14 @@ static void start_worker(struct worker *worker)
1862 * destroy_worker - destroy a workqueue worker 1784 * destroy_worker - destroy a workqueue worker
1863 * @worker: worker to be destroyed 1785 * @worker: worker to be destroyed
1864 * 1786 *
1865 * Destroy @worker and adjust @gcwq stats accordingly. 1787 * Destroy @worker and adjust @pool stats accordingly.
1866 * 1788 *
1867 * CONTEXT: 1789 * CONTEXT:
1868 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 1790 * spin_lock_irq(pool->lock) which is released and regrabbed.
1869 */ 1791 */
1870static void destroy_worker(struct worker *worker) 1792static void destroy_worker(struct worker *worker)
1871{ 1793{
1872 struct worker_pool *pool = worker->pool; 1794 struct worker_pool *pool = worker->pool;
1873 struct global_cwq *gcwq = pool->gcwq;
1874 int id = worker->id; 1795 int id = worker->id;
1875 1796
1876 /* sanity check frenzy */ 1797 /* sanity check frenzy */
@@ -1885,21 +1806,20 @@ static void destroy_worker(struct worker *worker)
1885 list_del_init(&worker->entry); 1806 list_del_init(&worker->entry);
1886 worker->flags |= WORKER_DIE; 1807 worker->flags |= WORKER_DIE;
1887 1808
1888 spin_unlock_irq(&gcwq->lock); 1809 spin_unlock_irq(&pool->lock);
1889 1810
1890 kthread_stop(worker->task); 1811 kthread_stop(worker->task);
1891 kfree(worker); 1812 kfree(worker);
1892 1813
1893 spin_lock_irq(&gcwq->lock); 1814 spin_lock_irq(&pool->lock);
1894 ida_remove(&pool->worker_ida, id); 1815 ida_remove(&pool->worker_ida, id);
1895} 1816}
1896 1817
1897static void idle_worker_timeout(unsigned long __pool) 1818static void idle_worker_timeout(unsigned long __pool)
1898{ 1819{
1899 struct worker_pool *pool = (void *)__pool; 1820 struct worker_pool *pool = (void *)__pool;
1900 struct global_cwq *gcwq = pool->gcwq;
1901 1821
1902 spin_lock_irq(&gcwq->lock); 1822 spin_lock_irq(&pool->lock);
1903 1823
1904 if (too_many_workers(pool)) { 1824 if (too_many_workers(pool)) {
1905 struct worker *worker; 1825 struct worker *worker;
@@ -1918,20 +1838,20 @@ static void idle_worker_timeout(unsigned long __pool)
1918 } 1838 }
1919 } 1839 }
1920 1840
1921 spin_unlock_irq(&gcwq->lock); 1841 spin_unlock_irq(&pool->lock);
1922} 1842}
1923 1843
1924static bool send_mayday(struct work_struct *work) 1844static bool send_mayday(struct work_struct *work)
1925{ 1845{
1926 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1846 struct pool_workqueue *pwq = get_work_pwq(work);
1927 struct workqueue_struct *wq = cwq->wq; 1847 struct workqueue_struct *wq = pwq->wq;
1928 unsigned int cpu; 1848 unsigned int cpu;
1929 1849
1930 if (!(wq->flags & WQ_RESCUER)) 1850 if (!(wq->flags & WQ_RESCUER))
1931 return false; 1851 return false;
1932 1852
1933 /* mayday mayday mayday */ 1853 /* mayday mayday mayday */
1934 cpu = cwq->pool->gcwq->cpu; 1854 cpu = pwq->pool->cpu;
1935 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1855 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1936 if (cpu == WORK_CPU_UNBOUND) 1856 if (cpu == WORK_CPU_UNBOUND)
1937 cpu = 0; 1857 cpu = 0;
@@ -1940,13 +1860,12 @@ static bool send_mayday(struct work_struct *work)
1940 return true; 1860 return true;
1941} 1861}
1942 1862
1943static void gcwq_mayday_timeout(unsigned long __pool) 1863static void pool_mayday_timeout(unsigned long __pool)
1944{ 1864{
1945 struct worker_pool *pool = (void *)__pool; 1865 struct worker_pool *pool = (void *)__pool;
1946 struct global_cwq *gcwq = pool->gcwq;
1947 struct work_struct *work; 1866 struct work_struct *work;
1948 1867
1949 spin_lock_irq(&gcwq->lock); 1868 spin_lock_irq(&pool->lock);
1950 1869
1951 if (need_to_create_worker(pool)) { 1870 if (need_to_create_worker(pool)) {
1952 /* 1871 /*
@@ -1959,7 +1878,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1959 send_mayday(work); 1878 send_mayday(work);
1960 } 1879 }
1961 1880
1962 spin_unlock_irq(&gcwq->lock); 1881 spin_unlock_irq(&pool->lock);
1963 1882
1964 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1883 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1965} 1884}
@@ -1978,24 +1897,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1978 * may_start_working() true. 1897 * may_start_working() true.
1979 * 1898 *
1980 * LOCKING: 1899 * LOCKING:
1981 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1900 * spin_lock_irq(pool->lock) which may be released and regrabbed
1982 * multiple times. Does GFP_KERNEL allocations. Called only from 1901 * multiple times. Does GFP_KERNEL allocations. Called only from
1983 * manager. 1902 * manager.
1984 * 1903 *
1985 * RETURNS: 1904 * RETURNS:
1986 * false if no action was taken and gcwq->lock stayed locked, true 1905 * false if no action was taken and pool->lock stayed locked, true
1987 * otherwise. 1906 * otherwise.
1988 */ 1907 */
1989static bool maybe_create_worker(struct worker_pool *pool) 1908static bool maybe_create_worker(struct worker_pool *pool)
1990__releases(&gcwq->lock) 1909__releases(&pool->lock)
1991__acquires(&gcwq->lock) 1910__acquires(&pool->lock)
1992{ 1911{
1993 struct global_cwq *gcwq = pool->gcwq;
1994
1995 if (!need_to_create_worker(pool)) 1912 if (!need_to_create_worker(pool))
1996 return false; 1913 return false;
1997restart: 1914restart:
1998 spin_unlock_irq(&gcwq->lock); 1915 spin_unlock_irq(&pool->lock);
1999 1916
2000 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1917 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
2001 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1918 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2006,7 +1923,7 @@ restart:
2006 worker = create_worker(pool); 1923 worker = create_worker(pool);
2007 if (worker) { 1924 if (worker) {
2008 del_timer_sync(&pool->mayday_timer); 1925 del_timer_sync(&pool->mayday_timer);
2009 spin_lock_irq(&gcwq->lock); 1926 spin_lock_irq(&pool->lock);
2010 start_worker(worker); 1927 start_worker(worker);
2011 BUG_ON(need_to_create_worker(pool)); 1928 BUG_ON(need_to_create_worker(pool));
2012 return true; 1929 return true;
@@ -2023,7 +1940,7 @@ restart:
2023 } 1940 }
2024 1941
2025 del_timer_sync(&pool->mayday_timer); 1942 del_timer_sync(&pool->mayday_timer);
2026 spin_lock_irq(&gcwq->lock); 1943 spin_lock_irq(&pool->lock);
2027 if (need_to_create_worker(pool)) 1944 if (need_to_create_worker(pool))
2028 goto restart; 1945 goto restart;
2029 return true; 1946 return true;
@@ -2037,11 +1954,11 @@ restart:
2037 * IDLE_WORKER_TIMEOUT. 1954 * IDLE_WORKER_TIMEOUT.
2038 * 1955 *
2039 * LOCKING: 1956 * LOCKING:
2040 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1957 * spin_lock_irq(pool->lock) which may be released and regrabbed
2041 * multiple times. Called only from manager. 1958 * multiple times. Called only from manager.
2042 * 1959 *
2043 * RETURNS: 1960 * RETURNS:
2044 * false if no action was taken and gcwq->lock stayed locked, true 1961 * false if no action was taken and pool->lock stayed locked, true
2045 * otherwise. 1962 * otherwise.
2046 */ 1963 */
2047static bool maybe_destroy_workers(struct worker_pool *pool) 1964static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2071,21 +1988,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2071 * manage_workers - manage worker pool 1988 * manage_workers - manage worker pool
2072 * @worker: self 1989 * @worker: self
2073 * 1990 *
2074 * Assume the manager role and manage gcwq worker pool @worker belongs 1991 * Assume the manager role and manage the worker pool @worker belongs
2075 * to. At any given time, there can be only zero or one manager per 1992 * to. At any given time, there can be only zero or one manager per
2076 * gcwq. The exclusion is handled automatically by this function. 1993 * pool. The exclusion is handled automatically by this function.
2077 * 1994 *
2078 * The caller can safely start processing works on false return. On 1995 * The caller can safely start processing works on false return. On
2079 * true return, it's guaranteed that need_to_create_worker() is false 1996 * true return, it's guaranteed that need_to_create_worker() is false
2080 * and may_start_working() is true. 1997 * and may_start_working() is true.
2081 * 1998 *
2082 * CONTEXT: 1999 * CONTEXT:
2083 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2000 * spin_lock_irq(pool->lock) which may be released and regrabbed
2084 * multiple times. Does GFP_KERNEL allocations. 2001 * multiple times. Does GFP_KERNEL allocations.
2085 * 2002 *
2086 * RETURNS: 2003 * RETURNS:
2087 * false if no action was taken and gcwq->lock stayed locked, true if 2004 * false if no action was taken and pool->lock stayed locked, true if
2088 * some action was taken. 2005 * some action was taken.
2089 */ 2006 */
2090static bool manage_workers(struct worker *worker) 2007static bool manage_workers(struct worker *worker)
2091{ 2008{
@@ -2107,20 +2024,20 @@ static bool manage_workers(struct worker *worker)
2107 * manager against CPU hotplug. 2024 * manager against CPU hotplug.
2108 * 2025 *
2109 * assoc_mutex would always be free unless CPU hotplug is in 2026 * assoc_mutex would always be free unless CPU hotplug is in
2110 * progress. trylock first without dropping @gcwq->lock. 2027 * progress. trylock first without dropping @pool->lock.
2111 */ 2028 */
2112 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2029 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2113 spin_unlock_irq(&pool->gcwq->lock); 2030 spin_unlock_irq(&pool->lock);
2114 mutex_lock(&pool->assoc_mutex); 2031 mutex_lock(&pool->assoc_mutex);
2115 /* 2032 /*
2116 * CPU hotplug could have happened while we were waiting 2033 * CPU hotplug could have happened while we were waiting
2117 * for assoc_mutex. Hotplug itself can't handle us 2034 * for assoc_mutex. Hotplug itself can't handle us
2118 * because manager isn't either on idle or busy list, and 2035 * because manager isn't either on idle or busy list, and
2119 * @gcwq's state and ours could have deviated. 2036 * @pool's state and ours could have deviated.
2120 * 2037 *
2121 * As hotplug is now excluded via assoc_mutex, we can 2038 * As hotplug is now excluded via assoc_mutex, we can
2122 * simply try to bind. It will succeed or fail depending 2039 * simply try to bind. It will succeed or fail depending
2123 * on @gcwq's current state. Try it and adjust 2040 * on @pool's current state. Try it and adjust
2124 * %WORKER_UNBOUND accordingly. 2041 * %WORKER_UNBOUND accordingly.
2125 */ 2042 */
2126 if (worker_maybe_bind_and_lock(worker)) 2043 if (worker_maybe_bind_and_lock(worker))
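Reviewer note on the assoc_mutex handling above: taking a sleeping mutex while holding a spinlock is done the usual way, trylock first, and only on failure drop the spinlock, block on the mutex, re-take the spinlock, and re-validate whatever the spinlock protected. A compact sketch with two pthread mutexes standing in for pool->lock and pool->assoc_mutex (names invented, not the kernel API):

#include <pthread.h>
#include <stdbool.h>

struct toy_pool {
        pthread_mutex_t lock;           /* the "spinlock": never held across blocking */
        pthread_mutex_t assoc_mutex;    /* the sleeping lock */
};

/*
 * Called with pool->lock held; returns with both locks held. The return
 * value tells the caller whether pool->lock was dropped on the way, in
 * which case earlier reads must be re-validated (manage_workers() does
 * that by re-trying the CPU binding).
 */
static bool toy_grab_assoc_mutex(struct toy_pool *pool)
{
        if (pthread_mutex_trylock(&pool->assoc_mutex) == 0)
                return false;                           /* fast path: nothing dropped */

        pthread_mutex_unlock(&pool->lock);
        pthread_mutex_lock(&pool->assoc_mutex);         /* may sleep */
        pthread_mutex_lock(&pool->lock);
        return true;
}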
@@ -2157,18 +2074,15 @@ static bool manage_workers(struct worker *worker)
2157 * call this function to process a work. 2074 * call this function to process a work.
2158 * 2075 *
2159 * CONTEXT: 2076 * CONTEXT:
2160 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 2077 * spin_lock_irq(pool->lock) which is released and regrabbed.
2161 */ 2078 */
2162static void process_one_work(struct worker *worker, struct work_struct *work) 2079static void process_one_work(struct worker *worker, struct work_struct *work)
2163__releases(&gcwq->lock) 2080__releases(&pool->lock)
2164__acquires(&gcwq->lock) 2081__acquires(&pool->lock)
2165{ 2082{
2166 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 2083 struct pool_workqueue *pwq = get_work_pwq(work);
2167 struct worker_pool *pool = worker->pool; 2084 struct worker_pool *pool = worker->pool;
2168 struct global_cwq *gcwq = pool->gcwq; 2085 bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2169 struct hlist_head *bwh = busy_worker_head(gcwq, work);
2170 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2171 work_func_t f = work->func;
2172 int work_color; 2086 int work_color;
2173 struct worker *collision; 2087 struct worker *collision;
2174#ifdef CONFIG_LOCKDEP 2088#ifdef CONFIG_LOCKDEP
@@ -2186,11 +2100,11 @@ __acquires(&gcwq->lock)
2186 /* 2100 /*
2187 * Ensure we're on the correct CPU. DISASSOCIATED test is 2101 * Ensure we're on the correct CPU. DISASSOCIATED test is
2188 * necessary to avoid spurious warnings from rescuers servicing the 2102 * necessary to avoid spurious warnings from rescuers servicing the
2189 * unbound or a disassociated gcwq. 2103 * unbound or a disassociated pool.
2190 */ 2104 */
2191 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && 2105 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2192 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2106 !(pool->flags & POOL_DISASSOCIATED) &&
2193 raw_smp_processor_id() != gcwq->cpu); 2107 raw_smp_processor_id() != pool->cpu);
2194 2108
2195 /* 2109 /*
2196 * A single work shouldn't be executed concurrently by 2110 * A single work shouldn't be executed concurrently by
@@ -2198,7 +2112,7 @@ __acquires(&gcwq->lock)
2198 * already processing the work. If so, defer the work to the 2112 * already processing the work. If so, defer the work to the
2199 * currently executing one. 2113 * currently executing one.
2200 */ 2114 */
2201 collision = __find_worker_executing_work(gcwq, bwh, work); 2115 collision = find_worker_executing_work(pool, work);
2202 if (unlikely(collision)) { 2116 if (unlikely(collision)) {
2203 move_linked_works(work, &collision->scheduled, NULL); 2117 move_linked_works(work, &collision->scheduled, NULL);
2204 return; 2118 return;
@@ -2206,9 +2120,10 @@ __acquires(&gcwq->lock)
2206 2120
2207 /* claim and dequeue */ 2121 /* claim and dequeue */
2208 debug_work_deactivate(work); 2122 debug_work_deactivate(work);
2209 hlist_add_head(&worker->hentry, bwh); 2123 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
2210 worker->current_work = work; 2124 worker->current_work = work;
2211 worker->current_cwq = cwq; 2125 worker->current_func = work->func;
2126 worker->current_pwq = pwq;
2212 work_color = get_work_color(work); 2127 work_color = get_work_color(work);
2213 2128
2214 list_del_init(&work->entry); 2129 list_del_init(&work->entry);
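Reviewer note on the hash_add() above: with global_cwq gone, non-reentrancy is tracked per pool via pool->busy_hash, keyed by the work item's address, so a worker about to execute an item can detect that another worker in the same pool is already running it and defer instead. A small model of that lookup-then-claim step (fixed-size chained hash, all types invented):

#include <stddef.h>
#include <stdint.h>

#define TOY_BUSY_HASH_SIZE 64

struct toy_work;

struct toy_worker {
        struct toy_work *current_work;          /* what this worker is executing */
        struct toy_worker *hash_next;           /* chaining inside the busy hash */
};

static struct toy_worker *toy_busy_hash[TOY_BUSY_HASH_SIZE];

static unsigned int toy_hash_work(struct toy_work *work)
{
        return ((uintptr_t)work >> 4) % TOY_BUSY_HASH_SIZE;
}

/* Returns the worker already executing @work, or NULL. */
static struct toy_worker *toy_find_worker_executing(struct toy_work *work)
{
        struct toy_worker *w;

        for (w = toy_busy_hash[toy_hash_work(work)]; w; w = w->hash_next)
                if (w->current_work == work)
                        return w;
        return NULL;
}

/* Claim @work: -1 means defer to the worker already running it. */
static int toy_claim_work(struct toy_worker *self, struct toy_work *work)
{
        unsigned int b = toy_hash_work(work);

        if (toy_find_worker_executing(work))
                return -1;

        self->current_work = work;
        self->hash_next = toy_busy_hash[b];
        toy_busy_hash[b] = self;
        return 0;
}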
@@ -2221,53 +2136,55 @@ __acquires(&gcwq->lock)
2221 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 2136 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2222 2137
2223 /* 2138 /*
2224 * Unbound gcwq isn't concurrency managed and work items should be 2139 * Unbound pool isn't concurrency managed and work items should be
2225 * executed ASAP. Wake up another worker if necessary. 2140 * executed ASAP. Wake up another worker if necessary.
2226 */ 2141 */
2227 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2142 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2228 wake_up_worker(pool); 2143 wake_up_worker(pool);
2229 2144
2230 /* 2145 /*
2231 * Record the last CPU and clear PENDING which should be the last 2146 * Record the last pool and clear PENDING which should be the last
2232 * update to @work. Also, do this inside @gcwq->lock so that 2147 * update to @work. Also, do this inside @pool->lock so that
2233 * PENDING and queued state changes happen together while IRQ is 2148 * PENDING and queued state changes happen together while IRQ is
2234 * disabled. 2149 * disabled.
2235 */ 2150 */
2236 set_work_cpu_and_clear_pending(work, gcwq->cpu); 2151 set_work_pool_and_clear_pending(work, pool->id);
2237 2152
2238 spin_unlock_irq(&gcwq->lock); 2153 spin_unlock_irq(&pool->lock);
2239 2154
2240 lock_map_acquire_read(&cwq->wq->lockdep_map); 2155 lock_map_acquire_read(&pwq->wq->lockdep_map);
2241 lock_map_acquire(&lockdep_map); 2156 lock_map_acquire(&lockdep_map);
2242 trace_workqueue_execute_start(work); 2157 trace_workqueue_execute_start(work);
2243 f(work); 2158 worker->current_func(work);
2244 /* 2159 /*
2245 * While we must be careful to not use "work" after this, the trace 2160 * While we must be careful to not use "work" after this, the trace
2246 * point will only record its address. 2161 * point will only record its address.
2247 */ 2162 */
2248 trace_workqueue_execute_end(work); 2163 trace_workqueue_execute_end(work);
2249 lock_map_release(&lockdep_map); 2164 lock_map_release(&lockdep_map);
2250 lock_map_release(&cwq->wq->lockdep_map); 2165 lock_map_release(&pwq->wq->lockdep_map);
2251 2166
2252 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2167 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2253 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2168 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2254 " last function: %pf\n", 2169 " last function: %pf\n",
2255 current->comm, preempt_count(), task_pid_nr(current), f); 2170 current->comm, preempt_count(), task_pid_nr(current),
2171 worker->current_func);
2256 debug_show_held_locks(current); 2172 debug_show_held_locks(current);
2257 dump_stack(); 2173 dump_stack();
2258 } 2174 }
2259 2175
2260 spin_lock_irq(&gcwq->lock); 2176 spin_lock_irq(&pool->lock);
2261 2177
2262 /* clear cpu intensive status */ 2178 /* clear cpu intensive status */
2263 if (unlikely(cpu_intensive)) 2179 if (unlikely(cpu_intensive))
2264 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2180 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2265 2181
2266 /* we're done with it, release */ 2182 /* we're done with it, release */
2267 hlist_del_init(&worker->hentry); 2183 hash_del(&worker->hentry);
2268 worker->current_work = NULL; 2184 worker->current_work = NULL;
2269 worker->current_cwq = NULL; 2185 worker->current_func = NULL;
2270 cwq_dec_nr_in_flight(cwq, work_color); 2186 worker->current_pwq = NULL;
2187 pwq_dec_nr_in_flight(pwq, work_color);
2271} 2188}
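Reviewer note on worker->current_func above: the function pointer is cached before the callback runs because the work item may free or re-queue itself from its own callback, so the "leaked lock or atomic" report afterwards must rely only on state held in the worker. The same discipline in miniature (hypothetical types):

#include <stdio.h>

struct toy_work {
        void (*func)(struct toy_work *work);
};

static void toy_run_one(struct toy_work *work)
{
        /* cache everything needed for reporting *before* the call */
        void (*current_func)(struct toy_work *) = work->func;

        current_func(work);

        /*
         * @work must be treated as gone here: the callback may have freed
         * or re-queued it. Report through the cached pointer only.
         */
        fprintf(stderr, "last function: %p\n", (void *)current_func);
}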
2272 2189
2273/** 2190/**
@@ -2279,7 +2196,7 @@ __acquires(&gcwq->lock)
2279 * fetches a work from the top and executes it. 2196 * fetches a work from the top and executes it.
2280 * 2197 *
2281 * CONTEXT: 2198 * CONTEXT:
2282 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2199 * spin_lock_irq(pool->lock) which may be released and regrabbed
2283 * multiple times. 2200 * multiple times.
2284 */ 2201 */
2285static void process_scheduled_works(struct worker *worker) 2202static void process_scheduled_works(struct worker *worker)
@@ -2295,8 +2212,8 @@ static void process_scheduled_works(struct worker *worker)
2295 * worker_thread - the worker thread function 2212 * worker_thread - the worker thread function
2296 * @__worker: self 2213 * @__worker: self
2297 * 2214 *
2298 * The gcwq worker thread function. There's a single dynamic pool of 2215 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
2299 * these per cpu. These workers process all works regardless of 2216 * of these per cpu. These workers process all works regardless of
2300 * their specific target workqueue. The only exception is works which 2217 * their specific target workqueue. The only exception is works which
2301 * belong to workqueues with a rescuer which will be explained in 2218 * belong to workqueues with a rescuer which will be explained in
2302 * rescuer_thread(). 2219 * rescuer_thread().
@@ -2305,16 +2222,15 @@ static int worker_thread(void *__worker)
2305{ 2222{
2306 struct worker *worker = __worker; 2223 struct worker *worker = __worker;
2307 struct worker_pool *pool = worker->pool; 2224 struct worker_pool *pool = worker->pool;
2308 struct global_cwq *gcwq = pool->gcwq;
2309 2225
2310 /* tell the scheduler that this is a workqueue worker */ 2226 /* tell the scheduler that this is a workqueue worker */
2311 worker->task->flags |= PF_WQ_WORKER; 2227 worker->task->flags |= PF_WQ_WORKER;
2312woke_up: 2228woke_up:
2313 spin_lock_irq(&gcwq->lock); 2229 spin_lock_irq(&pool->lock);
2314 2230
2315 /* we are off idle list if destruction or rebind is requested */ 2231 /* we are off idle list if destruction or rebind is requested */
2316 if (unlikely(list_empty(&worker->entry))) { 2232 if (unlikely(list_empty(&worker->entry))) {
2317 spin_unlock_irq(&gcwq->lock); 2233 spin_unlock_irq(&pool->lock);
2318 2234
2319 /* if DIE is set, destruction is requested */ 2235 /* if DIE is set, destruction is requested */
2320 if (worker->flags & WORKER_DIE) { 2236 if (worker->flags & WORKER_DIE) {
@@ -2373,52 +2289,59 @@ sleep:
2373 goto recheck; 2289 goto recheck;
2374 2290
2375 /* 2291 /*
2376 * gcwq->lock is held and there's no work to process and no 2292 * pool->lock is held and there's no work to process and no need to
2377 * need to manage, sleep. Workers are woken up only while 2293 * manage, sleep. Workers are woken up only while holding
2378 * holding gcwq->lock or from local cpu, so setting the 2294 * pool->lock or from local cpu, so setting the current state
2379 * current state before releasing gcwq->lock is enough to 2295 * before releasing pool->lock is enough to prevent losing any
2380 * prevent losing any event. 2296 * event.
2381 */ 2297 */
2382 worker_enter_idle(worker); 2298 worker_enter_idle(worker);
2383 __set_current_state(TASK_INTERRUPTIBLE); 2299 __set_current_state(TASK_INTERRUPTIBLE);
2384 spin_unlock_irq(&gcwq->lock); 2300 spin_unlock_irq(&pool->lock);
2385 schedule(); 2301 schedule();
2386 goto woke_up; 2302 goto woke_up;
2387} 2303}
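Reviewer note on the sleep path above: the worker sets TASK_INTERRUPTIBLE while still holding pool->lock and only then unlocks and calls schedule(), so a wakeup issued under the same lock cannot slip in between the check for work and the sleep. The userspace analogue of that lost-wakeup discipline is to re-check the predicate under the mutex and let the condition variable drop it atomically, roughly:

#include <pthread.h>
#include <stdbool.h>

struct toy_pool {
        pthread_mutex_t lock;
        pthread_cond_t more_work;
        bool work_pending;              /* stands in for "worklist not empty" */
};

static void toy_worker_sleep(struct toy_pool *pool)
{
        pthread_mutex_lock(&pool->lock);
        /*
         * Check and wait under the same lock the waker uses;
         * pthread_cond_wait() releases it atomically, so a wakeup
         * between the check and the sleep cannot be lost.
         */
        while (!pool->work_pending)
                pthread_cond_wait(&pool->more_work, &pool->lock);
        pthread_mutex_unlock(&pool->lock);
}

static void toy_queue_work(struct toy_pool *pool)
{
        pthread_mutex_lock(&pool->lock);
        pool->work_pending = true;
        pthread_cond_signal(&pool->more_work);
        pthread_mutex_unlock(&pool->lock);
}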
2388 2304
2389/** 2305/**
2390 * rescuer_thread - the rescuer thread function 2306 * rescuer_thread - the rescuer thread function
2391 * @__wq: the associated workqueue 2307 * @__rescuer: self
2392 * 2308 *
2393 * Workqueue rescuer thread function. There's one rescuer for each 2309 * Workqueue rescuer thread function. There's one rescuer for each
2394 * workqueue which has WQ_RESCUER set. 2310 * workqueue which has WQ_RESCUER set.
2395 * 2311 *
2396 * Regular work processing on a gcwq may block trying to create a new 2312 * Regular work processing on a pool may block trying to create a new
2397 * worker which uses GFP_KERNEL allocation which has slight chance of 2313 * worker which uses GFP_KERNEL allocation which has slight chance of
2398 * developing into deadlock if some works currently on the same queue 2314 * developing into deadlock if some works currently on the same queue
2399 * need to be processed to satisfy the GFP_KERNEL allocation. This is 2315 * need to be processed to satisfy the GFP_KERNEL allocation. This is
2400 * the problem rescuer solves. 2316 * the problem rescuer solves.
2401 * 2317 *
2402 * When such condition is possible, the gcwq summons rescuers of all 2318 * When such condition is possible, the pool summons rescuers of all
2403 * workqueues which have works queued on the gcwq and let them process 2319 * workqueues which have works queued on the pool and let them process
2404 * those works so that forward progress can be guaranteed. 2320 * those works so that forward progress can be guaranteed.
2405 * 2321 *
2406 * This should happen rarely. 2322 * This should happen rarely.
2407 */ 2323 */
2408static int rescuer_thread(void *__wq) 2324static int rescuer_thread(void *__rescuer)
2409{ 2325{
2410 struct workqueue_struct *wq = __wq; 2326 struct worker *rescuer = __rescuer;
2411 struct worker *rescuer = wq->rescuer; 2327 struct workqueue_struct *wq = rescuer->rescue_wq;
2412 struct list_head *scheduled = &rescuer->scheduled; 2328 struct list_head *scheduled = &rescuer->scheduled;
2413 bool is_unbound = wq->flags & WQ_UNBOUND; 2329 bool is_unbound = wq->flags & WQ_UNBOUND;
2414 unsigned int cpu; 2330 unsigned int cpu;
2415 2331
2416 set_user_nice(current, RESCUER_NICE_LEVEL); 2332 set_user_nice(current, RESCUER_NICE_LEVEL);
2333
2334 /*
2335 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
2336 * doesn't participate in concurrency management.
2337 */
2338 rescuer->task->flags |= PF_WQ_WORKER;
2417repeat: 2339repeat:
2418 set_current_state(TASK_INTERRUPTIBLE); 2340 set_current_state(TASK_INTERRUPTIBLE);
2419 2341
2420 if (kthread_should_stop()) { 2342 if (kthread_should_stop()) {
2421 __set_current_state(TASK_RUNNING); 2343 __set_current_state(TASK_RUNNING);
2344 rescuer->task->flags &= ~PF_WQ_WORKER;
2422 return 0; 2345 return 0;
2423 } 2346 }
2424 2347
@@ -2428,9 +2351,8 @@ repeat:
2428 */ 2351 */
2429 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2352 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2430 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2353 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2431 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2354 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2432 struct worker_pool *pool = cwq->pool; 2355 struct worker_pool *pool = pwq->pool;
2433 struct global_cwq *gcwq = pool->gcwq;
2434 struct work_struct *work, *n; 2356 struct work_struct *work, *n;
2435 2357
2436 __set_current_state(TASK_RUNNING); 2358 __set_current_state(TASK_RUNNING);
@@ -2446,22 +2368,24 @@ repeat:
2446 */ 2368 */
2447 BUG_ON(!list_empty(&rescuer->scheduled)); 2369 BUG_ON(!list_empty(&rescuer->scheduled));
2448 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2370 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2449 if (get_work_cwq(work) == cwq) 2371 if (get_work_pwq(work) == pwq)
2450 move_linked_works(work, scheduled, &n); 2372 move_linked_works(work, scheduled, &n);
2451 2373
2452 process_scheduled_works(rescuer); 2374 process_scheduled_works(rescuer);
2453 2375
2454 /* 2376 /*
2455 * Leave this gcwq. If keep_working() is %true, notify a 2377 * Leave this pool. If keep_working() is %true, notify a
2456 * regular worker; otherwise, we end up with 0 concurrency 2378 * regular worker; otherwise, we end up with 0 concurrency
2457 * and stalling the execution. 2379 * and stalling the execution.
2458 */ 2380 */
2459 if (keep_working(pool)) 2381 if (keep_working(pool))
2460 wake_up_worker(pool); 2382 wake_up_worker(pool);
2461 2383
2462 spin_unlock_irq(&gcwq->lock); 2384 spin_unlock_irq(&pool->lock);
2463 } 2385 }
2464 2386
2387 /* rescuers should never participate in concurrency management */
2388 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2465 schedule(); 2389 schedule();
2466 goto repeat; 2390 goto repeat;
2467} 2391}
@@ -2479,7 +2403,7 @@ static void wq_barrier_func(struct work_struct *work)
2479 2403
2480/** 2404/**
2481 * insert_wq_barrier - insert a barrier work 2405 * insert_wq_barrier - insert a barrier work
2482 * @cwq: cwq to insert barrier into 2406 * @pwq: pwq to insert barrier into
2483 * @barr: wq_barrier to insert 2407 * @barr: wq_barrier to insert
2484 * @target: target work to attach @barr to 2408 * @target: target work to attach @barr to
2485 * @worker: worker currently executing @target, NULL if @target is not executing 2409 * @worker: worker currently executing @target, NULL if @target is not executing
@@ -2496,12 +2420,12 @@ static void wq_barrier_func(struct work_struct *work)
2496 * after a work with LINKED flag set. 2420 * after a work with LINKED flag set.
2497 * 2421 *
2498 * Note that when @worker is non-NULL, @target may be modified 2422 * Note that when @worker is non-NULL, @target may be modified
2499 * underneath us, so we can't reliably determine cwq from @target. 2423 * underneath us, so we can't reliably determine pwq from @target.
2500 * 2424 *
2501 * CONTEXT: 2425 * CONTEXT:
2502 * spin_lock_irq(gcwq->lock). 2426 * spin_lock_irq(pool->lock).
2503 */ 2427 */
2504static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2428static void insert_wq_barrier(struct pool_workqueue *pwq,
2505 struct wq_barrier *barr, 2429 struct wq_barrier *barr,
2506 struct work_struct *target, struct worker *worker) 2430 struct work_struct *target, struct worker *worker)
2507{ 2431{
@@ -2509,7 +2433,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2509 unsigned int linked = 0; 2433 unsigned int linked = 0;
2510 2434
2511 /* 2435 /*
2512 * debugobject calls are safe here even with gcwq->lock locked 2436 * debugobject calls are safe here even with pool->lock locked
2513 * as we know for sure that this will not trigger any of the 2437 * as we know for sure that this will not trigger any of the
2514 * checks and call back into the fixup functions where we 2438 * checks and call back into the fixup functions where we
2515 * might deadlock. 2439 * might deadlock.
@@ -2534,23 +2458,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2534 } 2458 }
2535 2459
2536 debug_work_activate(&barr->work); 2460 debug_work_activate(&barr->work);
2537 insert_work(cwq, &barr->work, head, 2461 insert_work(pwq, &barr->work, head,
2538 work_color_to_flags(WORK_NO_COLOR) | linked); 2462 work_color_to_flags(WORK_NO_COLOR) | linked);
2539} 2463}
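Reviewer note on insert_wq_barrier() above: flushing works by queueing a dummy barrier work right behind the target (or onto the executing worker's ->scheduled list) and letting the flusher wait on its completion; once the barrier runs, everything that was ahead of it has finished. The shape of that mechanism with a pthread-based completion (struct and function names are illustrative only):

#include <pthread.h>
#include <stdbool.h>

struct toy_work {
        void (*func)(struct toy_work *work);
};

struct toy_barrier {
        struct toy_work work;           /* queued like any other work item */
        pthread_mutex_t lock;
        pthread_cond_t cond;
        bool done;
};

static void toy_barrier_func(struct toy_work *work)
{
        /* valid because 'work' is the first member of toy_barrier */
        struct toy_barrier *barr = (struct toy_barrier *)work;

        pthread_mutex_lock(&barr->lock);
        barr->done = true;              /* everything queued before us has run */
        pthread_cond_signal(&barr->cond);
        pthread_mutex_unlock(&barr->lock);
}

static void toy_barrier_init(struct toy_barrier *barr)
{
        barr->work.func = toy_barrier_func;
        pthread_mutex_init(&barr->lock, NULL);
        pthread_cond_init(&barr->cond, NULL);
        barr->done = false;
}

static void toy_barrier_wait(struct toy_barrier *barr)
{
        pthread_mutex_lock(&barr->lock);
        while (!barr->done)
                pthread_cond_wait(&barr->cond, &barr->lock);
        pthread_mutex_unlock(&barr->lock);
}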
2540 2464
2541/** 2465/**
2542 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing 2466 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2543 * @wq: workqueue being flushed 2467 * @wq: workqueue being flushed
2544 * @flush_color: new flush color, < 0 for no-op 2468 * @flush_color: new flush color, < 0 for no-op
2545 * @work_color: new work color, < 0 for no-op 2469 * @work_color: new work color, < 0 for no-op
2546 * 2470 *
2547 * Prepare cwqs for workqueue flushing. 2471 * Prepare pwqs for workqueue flushing.
2548 * 2472 *
2549 * If @flush_color is non-negative, flush_color on all cwqs should be 2473 * If @flush_color is non-negative, flush_color on all pwqs should be
2550 * -1. If no cwq has in-flight commands at the specified color, all 2474 * -1. If no pwq has in-flight commands at the specified color, all
2551 * cwq->flush_color's stay at -1 and %false is returned. If any cwq 2475 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
2552 * has in flight commands, its cwq->flush_color is set to 2476 * has in flight commands, its pwq->flush_color is set to
2553 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq 2477 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2554 * wakeup logic is armed and %true is returned. 2478 * wakeup logic is armed and %true is returned.
2555 * 2479 *
2556 * The caller should have initialized @wq->first_flusher prior to 2480 * The caller should have initialized @wq->first_flusher prior to
@@ -2558,7 +2482,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2558 * @flush_color is negative, no flush color update is done and %false 2482 * @flush_color is negative, no flush color update is done and %false
2559 * is returned. 2483 * is returned.
2560 * 2484 *
2561 * If @work_color is non-negative, all cwqs should have the same 2485 * If @work_color is non-negative, all pwqs should have the same
2562 * work_color which is previous to @work_color and all will be 2486 * work_color which is previous to @work_color and all will be
2563 * advanced to @work_color. 2487 * advanced to @work_color.
2564 * 2488 *
@@ -2569,42 +2493,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2569 * %true if @flush_color >= 0 and there's something to flush. %false 2493 * %true if @flush_color >= 0 and there's something to flush. %false
2570 * otherwise. 2494 * otherwise.
2571 */ 2495 */
2572static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, 2496static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2573 int flush_color, int work_color) 2497 int flush_color, int work_color)
2574{ 2498{
2575 bool wait = false; 2499 bool wait = false;
2576 unsigned int cpu; 2500 unsigned int cpu;
2577 2501
2578 if (flush_color >= 0) { 2502 if (flush_color >= 0) {
2579 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); 2503 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
2580 atomic_set(&wq->nr_cwqs_to_flush, 1); 2504 atomic_set(&wq->nr_pwqs_to_flush, 1);
2581 } 2505 }
2582 2506
2583 for_each_cwq_cpu(cpu, wq) { 2507 for_each_pwq_cpu(cpu, wq) {
2584 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2508 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2585 struct global_cwq *gcwq = cwq->pool->gcwq; 2509 struct worker_pool *pool = pwq->pool;
2586 2510
2587 spin_lock_irq(&gcwq->lock); 2511 spin_lock_irq(&pool->lock);
2588 2512
2589 if (flush_color >= 0) { 2513 if (flush_color >= 0) {
2590 BUG_ON(cwq->flush_color != -1); 2514 BUG_ON(pwq->flush_color != -1);
2591 2515
2592 if (cwq->nr_in_flight[flush_color]) { 2516 if (pwq->nr_in_flight[flush_color]) {
2593 cwq->flush_color = flush_color; 2517 pwq->flush_color = flush_color;
2594 atomic_inc(&wq->nr_cwqs_to_flush); 2518 atomic_inc(&wq->nr_pwqs_to_flush);
2595 wait = true; 2519 wait = true;
2596 } 2520 }
2597 } 2521 }
2598 2522
2599 if (work_color >= 0) { 2523 if (work_color >= 0) {
2600 BUG_ON(work_color != work_next_color(cwq->work_color)); 2524 BUG_ON(work_color != work_next_color(pwq->work_color));
2601 cwq->work_color = work_color; 2525 pwq->work_color = work_color;
2602 } 2526 }
2603 2527
2604 spin_unlock_irq(&gcwq->lock); 2528 spin_unlock_irq(&pool->lock);
2605 } 2529 }
2606 2530
2607 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) 2531 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2608 complete(&wq->first_flusher->done); 2532 complete(&wq->first_flusher->done);
2609 2533
2610 return wait; 2534 return wait;
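Reviewer note on the nr_pwqs_to_flush handling above: the counter is biased to 1 before the scan, each pwq with in-flight work at the flush color adds one, and the completion fires only on the final decrement, either the trailing atomic_dec_and_test() when nothing was in flight or the last pwq retiring its colored works. The bias pattern on its own, with C11 atomics (helper names invented):

#include <stdatomic.h>

static atomic_int nr_to_flush;

static void toy_flusher_complete(void)
{
        /* wake whoever is waiting for the flush, e.g. signal a completion */
}

/* Flusher, before walking the pwqs: hold one reference ourselves. */
static void toy_flush_prep_start(void)
{
        atomic_store(&nr_to_flush, 1);
}

/* Per pwq that still has works of the flush color in flight. */
static void toy_flush_prep_arm_one(void)
{
        atomic_fetch_add(&nr_to_flush, 1);
}

/* Called when a pwq retires its last work of the flush color... */
static void toy_pwq_color_done(void)
{
        if (atomic_fetch_sub(&nr_to_flush, 1) == 1)
                toy_flusher_complete();
}

/* ...and once by the flusher after the scan, dropping the bias. */
static void toy_flush_prep_finish(void)
{
        if (atomic_fetch_sub(&nr_to_flush, 1) == 1)
                toy_flusher_complete();         /* nothing was in flight */
}

Without the bias, an early pwq finishing its colored works could complete the flusher while later pwqs were still being armed.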
@@ -2655,7 +2579,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2655 2579
2656 wq->first_flusher = &this_flusher; 2580 wq->first_flusher = &this_flusher;
2657 2581
2658 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, 2582 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2659 wq->work_color)) { 2583 wq->work_color)) {
2660 /* nothing to flush, done */ 2584 /* nothing to flush, done */
2661 wq->flush_color = next_color; 2585 wq->flush_color = next_color;
@@ -2666,7 +2590,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2666 /* wait in queue */ 2590 /* wait in queue */
2667 BUG_ON(wq->flush_color == this_flusher.flush_color); 2591 BUG_ON(wq->flush_color == this_flusher.flush_color);
2668 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2592 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2669 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2593 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2670 } 2594 }
2671 } else { 2595 } else {
2672 /* 2596 /*
@@ -2733,7 +2657,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2733 2657
2734 list_splice_tail_init(&wq->flusher_overflow, 2658 list_splice_tail_init(&wq->flusher_overflow,
2735 &wq->flusher_queue); 2659 &wq->flusher_queue);
2736 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2660 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2737 } 2661 }
2738 2662
2739 if (list_empty(&wq->flusher_queue)) { 2663 if (list_empty(&wq->flusher_queue)) {
@@ -2743,7 +2667,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2743 2667
2744 /* 2668 /*
2745 * Need to flush more colors. Make the next flusher 2669 * Need to flush more colors. Make the next flusher
2746 * the new first flusher and arm cwqs. 2670 * the new first flusher and arm pwqs.
2747 */ 2671 */
2748 BUG_ON(wq->flush_color == wq->work_color); 2672 BUG_ON(wq->flush_color == wq->work_color);
2749 BUG_ON(wq->flush_color != next->flush_color); 2673 BUG_ON(wq->flush_color != next->flush_color);
@@ -2751,7 +2675,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2751 list_del_init(&next->list); 2675 list_del_init(&next->list);
2752 wq->first_flusher = next; 2676 wq->first_flusher = next;
2753 2677
2754 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) 2678 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2755 break; 2679 break;
2756 2680
2757 /* 2681 /*
@@ -2794,13 +2718,13 @@ void drain_workqueue(struct workqueue_struct *wq)
2794reflush: 2718reflush:
2795 flush_workqueue(wq); 2719 flush_workqueue(wq);
2796 2720
2797 for_each_cwq_cpu(cpu, wq) { 2721 for_each_pwq_cpu(cpu, wq) {
2798 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2722 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2799 bool drained; 2723 bool drained;
2800 2724
2801 spin_lock_irq(&cwq->pool->gcwq->lock); 2725 spin_lock_irq(&pwq->pool->lock);
2802 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2726 drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
2803 spin_unlock_irq(&cwq->pool->gcwq->lock); 2727 spin_unlock_irq(&pwq->pool->lock);
2804 2728
2805 if (drained) 2729 if (drained)
2806 continue; 2730 continue;
@@ -2822,34 +2746,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);
2822static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) 2746static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2823{ 2747{
2824 struct worker *worker = NULL; 2748 struct worker *worker = NULL;
2825 struct global_cwq *gcwq; 2749 struct worker_pool *pool;
2826 struct cpu_workqueue_struct *cwq; 2750 struct pool_workqueue *pwq;
2827 2751
2828 might_sleep(); 2752 might_sleep();
2829 gcwq = get_work_gcwq(work); 2753 pool = get_work_pool(work);
2830 if (!gcwq) 2754 if (!pool)
2831 return false; 2755 return false;
2832 2756
2833 spin_lock_irq(&gcwq->lock); 2757 spin_lock_irq(&pool->lock);
2834 if (!list_empty(&work->entry)) { 2758 /* see the comment in try_to_grab_pending() with the same code */
2835 /* 2759 pwq = get_work_pwq(work);
2836 * See the comment near try_to_grab_pending()->smp_rmb(). 2760 if (pwq) {
2837 * If it was re-queued to a different gcwq under us, we 2761 if (unlikely(pwq->pool != pool))
2838 * are not going to wait.
2839 */
2840 smp_rmb();
2841 cwq = get_work_cwq(work);
2842 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2843 goto already_gone; 2762 goto already_gone;
2844 } else { 2763 } else {
2845 worker = find_worker_executing_work(gcwq, work); 2764 worker = find_worker_executing_work(pool, work);
2846 if (!worker) 2765 if (!worker)
2847 goto already_gone; 2766 goto already_gone;
2848 cwq = worker->current_cwq; 2767 pwq = worker->current_pwq;
2849 } 2768 }
2850 2769
2851 insert_wq_barrier(cwq, barr, work, worker); 2770 insert_wq_barrier(pwq, barr, work, worker);
2852 spin_unlock_irq(&gcwq->lock); 2771 spin_unlock_irq(&pool->lock);
2853 2772
2854 /* 2773 /*
2855 * If @max_active is 1 or rescuer is in use, flushing another work 2774 * If @max_active is 1 or rescuer is in use, flushing another work
@@ -2857,15 +2776,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2857 * flusher is not running on the same workqueue by verifying write 2776 * flusher is not running on the same workqueue by verifying write
2858 * access. 2777 * access.
2859 */ 2778 */
2860 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) 2779 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
2861 lock_map_acquire(&cwq->wq->lockdep_map); 2780 lock_map_acquire(&pwq->wq->lockdep_map);
2862 else 2781 else
2863 lock_map_acquire_read(&cwq->wq->lockdep_map); 2782 lock_map_acquire_read(&pwq->wq->lockdep_map);
2864 lock_map_release(&cwq->wq->lockdep_map); 2783 lock_map_release(&pwq->wq->lockdep_map);
2865 2784
2866 return true; 2785 return true;
2867already_gone: 2786already_gone:
2868 spin_unlock_irq(&gcwq->lock); 2787 spin_unlock_irq(&pool->lock);
2869 return false; 2788 return false;
2870} 2789}
2871 2790
@@ -2961,8 +2880,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
2961{ 2880{
2962 local_irq_disable(); 2881 local_irq_disable();
2963 if (del_timer_sync(&dwork->timer)) 2882 if (del_timer_sync(&dwork->timer))
2964 __queue_work(dwork->cpu, 2883 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
2965 get_work_cwq(&dwork->work)->wq, &dwork->work);
2966 local_irq_enable(); 2884 local_irq_enable();
2967 return flush_work(&dwork->work); 2885 return flush_work(&dwork->work);
2968} 2886}
@@ -2992,7 +2910,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)
2992 if (unlikely(ret < 0)) 2910 if (unlikely(ret < 0))
2993 return false; 2911 return false;
2994 2912
2995 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); 2913 set_work_pool_and_clear_pending(&dwork->work,
2914 get_work_pool_id(&dwork->work));
2996 local_irq_restore(flags); 2915 local_irq_restore(flags);
2997 return ret; 2916 return ret;
2998} 2917}
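Reviewer note on set_work_pool_and_clear_pending() above: work->data is a tagged field: while an item is queued it carries the pool_workqueue pointer (whose alignment leaves the low bits free for flags such as PENDING), and once idle it is rewritten to the id of the pool it last ran on, which get_work_pool_id() reads back. A self-contained sketch of that kind of tagged word; the bit positions and names below are made up and do not match the kernel's WORK_STRUCT_* layout:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PENDING     0x1UL           /* flag bits live in the low bits */
#define TOY_IS_ID       0x2UL           /* set when the word holds an id, not a pointer */
#define TOY_FLAG_MASK   0x3UL
#define TOY_ID_SHIFT    2

struct toy_pwq { int dummy; } __attribute__((aligned(4)));

static uintptr_t toy_pack_pwq(struct toy_pwq *pwq, uintptr_t flags)
{
        assert(((uintptr_t)pwq & TOY_FLAG_MASK) == 0);  /* alignment frees the low bits */
        return (uintptr_t)pwq | flags;
}

static uintptr_t toy_pack_pool_id(unsigned long pool_id)
{
        return (pool_id << TOY_ID_SHIFT) | TOY_IS_ID;
}

static struct toy_pwq *toy_unpack_pwq(uintptr_t data)
{
        if (data & TOY_IS_ID)
                return NULL;                            /* idle: only the pool id remains */
        return (struct toy_pwq *)(data & ~TOY_FLAG_MASK);
}

static unsigned long toy_unpack_pool_id(uintptr_t data)
{
        return (data & TOY_IS_ID) ? data >> TOY_ID_SHIFT : 0;
}

int main(void)
{
        static struct toy_pwq pwq;
        uintptr_t data = toy_pack_pwq(&pwq, TOY_PENDING);       /* queued: pointer + flags */

        printf("queued on pwq %p\n", (void *)toy_unpack_pwq(data));
        data = toy_pack_pool_id(7);                             /* idle: remember last pool */
        printf("last pool id %lu\n", toy_unpack_pool_id(data));
        return 0;
}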
@@ -3171,46 +3090,46 @@ int keventd_up(void)
3171 return system_wq != NULL; 3090 return system_wq != NULL;
3172} 3091}
3173 3092
3174static int alloc_cwqs(struct workqueue_struct *wq) 3093static int alloc_pwqs(struct workqueue_struct *wq)
3175{ 3094{
3176 /* 3095 /*
3177 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3096 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
3178 * Make sure that the alignment isn't lower than that of 3097 * Make sure that the alignment isn't lower than that of
3179 * unsigned long long. 3098 * unsigned long long.
3180 */ 3099 */
3181 const size_t size = sizeof(struct cpu_workqueue_struct); 3100 const size_t size = sizeof(struct pool_workqueue);
3182 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3101 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3183 __alignof__(unsigned long long)); 3102 __alignof__(unsigned long long));
3184 3103
3185 if (!(wq->flags & WQ_UNBOUND)) 3104 if (!(wq->flags & WQ_UNBOUND))
3186 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 3105 wq->pool_wq.pcpu = __alloc_percpu(size, align);
3187 else { 3106 else {
3188 void *ptr; 3107 void *ptr;
3189 3108
3190 /* 3109 /*
3191 * Allocate enough room to align cwq and put an extra 3110 * Allocate enough room to align pwq and put an extra
3192 * pointer at the end pointing back to the originally 3111 * pointer at the end pointing back to the originally
3193 * allocated pointer which will be used for free. 3112 * allocated pointer which will be used for free.
3194 */ 3113 */
3195 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3114 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
3196 if (ptr) { 3115 if (ptr) {
3197 wq->cpu_wq.single = PTR_ALIGN(ptr, align); 3116 wq->pool_wq.single = PTR_ALIGN(ptr, align);
3198 *(void **)(wq->cpu_wq.single + 1) = ptr; 3117 *(void **)(wq->pool_wq.single + 1) = ptr;
3199 } 3118 }
3200 } 3119 }
3201 3120
3202 /* just in case, make sure it's actually aligned */ 3121 /* just in case, make sure it's actually aligned */
3203 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 3122 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
3204 return wq->cpu_wq.v ? 0 : -ENOMEM; 3123 return wq->pool_wq.v ? 0 : -ENOMEM;
3205} 3124}
3206 3125
3207static void free_cwqs(struct workqueue_struct *wq) 3126static void free_pwqs(struct workqueue_struct *wq)
3208{ 3127{
3209 if (!(wq->flags & WQ_UNBOUND)) 3128 if (!(wq->flags & WQ_UNBOUND))
3210 free_percpu(wq->cpu_wq.pcpu); 3129 free_percpu(wq->pool_wq.pcpu);
3211 else if (wq->cpu_wq.single) { 3130 else if (wq->pool_wq.single) {
3212 /* the pointer to free is stored right after the cwq */ 3131 /* the pointer to free is stored right after the pwq */
3213 kfree(*(void **)(wq->cpu_wq.single + 1)); 3132 kfree(*(void **)(wq->pool_wq.single + 1));
3214 } 3133 }
3215} 3134}
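Reviewer note on alloc_pwqs()/free_pwqs() above: because the low WORK_STRUCT_FLAG_BITS of a pwq pointer double as flag bits, the structure must be sufficiently aligned, so the unbound path over-allocates, aligns the returned pointer up, and stashes the original allocation pointer right behind the aligned object for the eventual free. The same trick in plain C with malloc (sizes and names are illustrative):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Round @p up to the next multiple of @align (align must be a power of two). */
static void *toy_ptr_align(void *p, size_t align)
{
        return (void *)(((uintptr_t)p + align - 1) & ~(uintptr_t)(align - 1));
}

/* Allocate one zeroed object of @size aligned to @align. */
static void *toy_alloc_aligned(size_t size, size_t align)
{
        void *raw = calloc(1, size + align + sizeof(void *));
        void *obj;

        if (!raw)
                return NULL;
        obj = toy_ptr_align(raw, align);
        /* the pointer to free is stored right after the aligned object */
        memcpy((char *)obj + size, &raw, sizeof(raw));
        return obj;
}

static void toy_free_aligned(void *obj, size_t size)
{
        void *raw;

        if (!obj)
                return;
        memcpy(&raw, (char *)obj + size, sizeof(raw));
        free(raw);
}

The extra align + sizeof(void *) bytes guarantee there is always room for both the alignment slack and the stashed pointer.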
3216 3135
@@ -3264,27 +3183,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3264 wq->flags = flags; 3183 wq->flags = flags;
3265 wq->saved_max_active = max_active; 3184 wq->saved_max_active = max_active;
3266 mutex_init(&wq->flush_mutex); 3185 mutex_init(&wq->flush_mutex);
3267 atomic_set(&wq->nr_cwqs_to_flush, 0); 3186 atomic_set(&wq->nr_pwqs_to_flush, 0);
3268 INIT_LIST_HEAD(&wq->flusher_queue); 3187 INIT_LIST_HEAD(&wq->flusher_queue);
3269 INIT_LIST_HEAD(&wq->flusher_overflow); 3188 INIT_LIST_HEAD(&wq->flusher_overflow);
3270 3189
3271 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3190 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3272 INIT_LIST_HEAD(&wq->list); 3191 INIT_LIST_HEAD(&wq->list);
3273 3192
3274 if (alloc_cwqs(wq) < 0) 3193 if (alloc_pwqs(wq) < 0)
3275 goto err; 3194 goto err;
3276 3195
3277 for_each_cwq_cpu(cpu, wq) { 3196 for_each_pwq_cpu(cpu, wq) {
3278 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3197 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3279 struct global_cwq *gcwq = get_gcwq(cpu); 3198
3280 int pool_idx = (bool)(flags & WQ_HIGHPRI); 3199 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3281 3200 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3282 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3201 pwq->wq = wq;
3283 cwq->pool = &gcwq->pools[pool_idx]; 3202 pwq->flush_color = -1;
3284 cwq->wq = wq; 3203 pwq->max_active = max_active;
3285 cwq->flush_color = -1; 3204 INIT_LIST_HEAD(&pwq->delayed_works);
3286 cwq->max_active = max_active;
3287 INIT_LIST_HEAD(&cwq->delayed_works);
3288 } 3205 }
3289 3206
3290 if (flags & WQ_RESCUER) { 3207 if (flags & WQ_RESCUER) {
@@ -3297,7 +3214,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3297 if (!rescuer) 3214 if (!rescuer)
3298 goto err; 3215 goto err;
3299 3216
3300 rescuer->task = kthread_create(rescuer_thread, wq, "%s", 3217 rescuer->rescue_wq = wq;
3218 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3301 wq->name); 3219 wq->name);
3302 if (IS_ERR(rescuer->task)) 3220 if (IS_ERR(rescuer->task))
3303 goto err; 3221 goto err;
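With the hunk above, the rescuer kthread receives the rescuer worker itself as its argument and reaches the workqueue through the new rescue_wq back-pointer instead of being handed the workqueue directly. A hedged sketch of the resulting data flow inside the thread function (the real rescuer_thread() does much more; only the argument handling is shown):

static int rescuer_thread(void *__rescuer)
{
	struct worker *rescuer = __rescuer;
	struct workqueue_struct *wq = rescuer->rescue_wq;

	/* ... service mayday requests for wq using this rescuer worker ... */
	return 0;
}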
@@ -3314,8 +3232,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3314 spin_lock(&workqueue_lock); 3232 spin_lock(&workqueue_lock);
3315 3233
3316 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 3234 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3317 for_each_cwq_cpu(cpu, wq) 3235 for_each_pwq_cpu(cpu, wq)
3318 get_cwq(cpu, wq)->max_active = 0; 3236 get_pwq(cpu, wq)->max_active = 0;
3319 3237
3320 list_add(&wq->list, &workqueues); 3238 list_add(&wq->list, &workqueues);
3321 3239
@@ -3324,7 +3242,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3324 return wq; 3242 return wq;
3325err: 3243err:
3326 if (wq) { 3244 if (wq) {
3327 free_cwqs(wq); 3245 free_pwqs(wq);
3328 free_mayday_mask(wq->mayday_mask); 3246 free_mayday_mask(wq->mayday_mask);
3329 kfree(wq->rescuer); 3247 kfree(wq->rescuer);
3330 kfree(wq); 3248 kfree(wq);
@@ -3355,14 +3273,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
3355 spin_unlock(&workqueue_lock); 3273 spin_unlock(&workqueue_lock);
3356 3274
3357 /* sanity check */ 3275 /* sanity check */
3358 for_each_cwq_cpu(cpu, wq) { 3276 for_each_pwq_cpu(cpu, wq) {
3359 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3277 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3360 int i; 3278 int i;
3361 3279
3362 for (i = 0; i < WORK_NR_COLORS; i++) 3280 for (i = 0; i < WORK_NR_COLORS; i++)
3363 BUG_ON(cwq->nr_in_flight[i]); 3281 BUG_ON(pwq->nr_in_flight[i]);
3364 BUG_ON(cwq->nr_active); 3282 BUG_ON(pwq->nr_active);
3365 BUG_ON(!list_empty(&cwq->delayed_works)); 3283 BUG_ON(!list_empty(&pwq->delayed_works));
3366 } 3284 }
3367 3285
3368 if (wq->flags & WQ_RESCUER) { 3286 if (wq->flags & WQ_RESCUER) {
@@ -3371,29 +3289,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
3371 kfree(wq->rescuer); 3289 kfree(wq->rescuer);
3372 } 3290 }
3373 3291
3374 free_cwqs(wq); 3292 free_pwqs(wq);
3375 kfree(wq); 3293 kfree(wq);
3376} 3294}
3377EXPORT_SYMBOL_GPL(destroy_workqueue); 3295EXPORT_SYMBOL_GPL(destroy_workqueue);
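destroy_workqueue() above insists that every pwq has fully drained (no in-flight colors, no nr_active, no delayed works), so callers are expected to stop producing and cancel or flush their work items first. A minimal driver-style lifecycle sketch, assuming a hypothetical example_work handler:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
	/* deferred processing goes here */
}

static int example_init(void)
{
	example_wq = alloc_workqueue("example", WQ_FREEZABLE, 0);
	if (!example_wq)
		return -ENOMEM;
	INIT_WORK(&example_work, example_work_fn);
	queue_work(example_wq, &example_work);
	return 0;
}

static void example_exit(void)
{
	/* nothing may be pending or running when the wq is destroyed */
	cancel_work_sync(&example_work);
	destroy_workqueue(example_wq);
}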
3378 3296
3379/** 3297/**
3380 * cwq_set_max_active - adjust max_active of a cwq 3298 * pwq_set_max_active - adjust max_active of a pwq
3381 * @cwq: target cpu_workqueue_struct 3299 * @pwq: target pool_workqueue
3382 * @max_active: new max_active value. 3300 * @max_active: new max_active value.
3383 * 3301 *
3384 * Set @cwq->max_active to @max_active and activate delayed works if 3302 * Set @pwq->max_active to @max_active and activate delayed works if
3385 * increased. 3303 * increased.
3386 * 3304 *
3387 * CONTEXT: 3305 * CONTEXT:
3388 * spin_lock_irq(gcwq->lock). 3306 * spin_lock_irq(pool->lock).
3389 */ 3307 */
3390static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) 3308static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3391{ 3309{
3392 cwq->max_active = max_active; 3310 pwq->max_active = max_active;
3393 3311
3394 while (!list_empty(&cwq->delayed_works) && 3312 while (!list_empty(&pwq->delayed_works) &&
3395 cwq->nr_active < cwq->max_active) 3313 pwq->nr_active < pwq->max_active)
3396 cwq_activate_first_delayed(cwq); 3314 pwq_activate_first_delayed(pwq);
3397} 3315}
3398 3316
3399/** 3317/**
@@ -3416,16 +3334,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3416 3334
3417 wq->saved_max_active = max_active; 3335 wq->saved_max_active = max_active;
3418 3336
3419 for_each_cwq_cpu(cpu, wq) { 3337 for_each_pwq_cpu(cpu, wq) {
3420 struct global_cwq *gcwq = get_gcwq(cpu); 3338 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3339 struct worker_pool *pool = pwq->pool;
3421 3340
3422 spin_lock_irq(&gcwq->lock); 3341 spin_lock_irq(&pool->lock);
3423 3342
3424 if (!(wq->flags & WQ_FREEZABLE) || 3343 if (!(wq->flags & WQ_FREEZABLE) ||
3425 !(gcwq->flags & GCWQ_FREEZING)) 3344 !(pool->flags & POOL_FREEZING))
3426 cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); 3345 pwq_set_max_active(pwq, max_active);
3427 3346
3428 spin_unlock_irq(&gcwq->lock); 3347 spin_unlock_irq(&pool->lock);
3429 } 3348 }
3430 3349
3431 spin_unlock(&workqueue_lock); 3350 spin_unlock(&workqueue_lock);
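workqueue_set_max_active() stores the new limit in saved_max_active and, unless the pool is frozen, pushes it down to each pwq via pwq_set_max_active(), which also releases delayed works when the limit grows. A short usage sketch (the workqueue name and the value 16 are arbitrary examples):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_io_wq;

static int example_io_init(void)
{
	/* start conservatively with one in-flight item per CPU */
	example_io_wq = alloc_workqueue("example_io", 0, 1);
	if (!example_io_wq)
		return -ENOMEM;

	/* later, once more parallelism is acceptable */
	workqueue_set_max_active(example_io_wq, 16);
	return 0;
}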
@@ -3446,57 +3365,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3446 */ 3365 */
3447bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 3366bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3448{ 3367{
3449 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3368 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3450 3369
3451 return !list_empty(&cwq->delayed_works); 3370 return !list_empty(&pwq->delayed_works);
3452} 3371}
3453EXPORT_SYMBOL_GPL(workqueue_congested); 3372EXPORT_SYMBOL_GPL(workqueue_congested);
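Since workqueue_congested() only reports whether the chosen CPU's pwq currently has delayed (over-limit) work, it is an unsynchronized hint rather than a guarantee. A hedged sketch of using it as soft backpressure for optional work (example_maybe_queue is a hypothetical helper):

#include <linux/workqueue.h>

/* queue low-value work only if the target CPU's pwq is not backed up */
static void example_maybe_queue(struct workqueue_struct *wq,
				struct work_struct *work, int cpu)
{
	if (workqueue_congested(cpu, wq))
		return;			/* drop or retry the optional work later */
	queue_work_on(cpu, wq, work);
}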
3454 3373
3455/** 3374/**
3456 * work_cpu - return the last known associated cpu for @work
3457 * @work: the work of interest
3458 *
3459 * RETURNS:
3460 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
3461 */
3462unsigned int work_cpu(struct work_struct *work)
3463{
3464 struct global_cwq *gcwq = get_work_gcwq(work);
3465
3466 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3467}
3468EXPORT_SYMBOL_GPL(work_cpu);
3469
3470/**
3471 * work_busy - test whether a work is currently pending or running 3375 * work_busy - test whether a work is currently pending or running
3472 * @work: the work to be tested 3376 * @work: the work to be tested
3473 * 3377 *
3474 * Test whether @work is currently pending or running. There is no 3378 * Test whether @work is currently pending or running. There is no
3475 * synchronization around this function and the test result is 3379 * synchronization around this function and the test result is
3476 * unreliable and only useful as advisory hints or for debugging. 3380 * unreliable and only useful as advisory hints or for debugging.
3477 * Especially for reentrant wqs, the pending state might hide the
3478 * running state.
3479 * 3381 *
3480 * RETURNS: 3382 * RETURNS:
3481 * OR'd bitmask of WORK_BUSY_* bits. 3383 * OR'd bitmask of WORK_BUSY_* bits.
3482 */ 3384 */
3483unsigned int work_busy(struct work_struct *work) 3385unsigned int work_busy(struct work_struct *work)
3484{ 3386{
3485 struct global_cwq *gcwq = get_work_gcwq(work); 3387 struct worker_pool *pool = get_work_pool(work);
3486 unsigned long flags; 3388 unsigned long flags;
3487 unsigned int ret = 0; 3389 unsigned int ret = 0;
3488 3390
3489 if (!gcwq)
3490 return 0;
3491
3492 spin_lock_irqsave(&gcwq->lock, flags);
3493
3494 if (work_pending(work)) 3391 if (work_pending(work))
3495 ret |= WORK_BUSY_PENDING; 3392 ret |= WORK_BUSY_PENDING;
3496 if (find_worker_executing_work(gcwq, work))
3497 ret |= WORK_BUSY_RUNNING;
3498 3393
3499 spin_unlock_irqrestore(&gcwq->lock, flags); 3394 if (pool) {
3395 spin_lock_irqsave(&pool->lock, flags);
3396 if (find_worker_executing_work(pool, work))
3397 ret |= WORK_BUSY_RUNNING;
3398 spin_unlock_irqrestore(&pool->lock, flags);
3399 }
3500 3400
3501 return ret; 3401 return ret;
3502} 3402}
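As the comment above stresses, work_busy() takes no lasting synchronization, so its result is only advisory. A small debugging-style sketch that decodes the returned bitmask (example_report is hypothetical):

#include <linux/printk.h>
#include <linux/workqueue.h>

static void example_report(struct work_struct *work)
{
	unsigned int busy = work_busy(work);

	pr_debug("work %p: %spending, %srunning\n", work,
		 (busy & WORK_BUSY_PENDING) ? "" : "not ",
		 (busy & WORK_BUSY_RUNNING) ? "" : "not ");
}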
@@ -3506,65 +3406,48 @@ EXPORT_SYMBOL_GPL(work_busy);
3506 * CPU hotplug. 3406 * CPU hotplug.
3507 * 3407 *
3508 * There are two challenges in supporting CPU hotplug. Firstly, there 3408 * There are two challenges in supporting CPU hotplug. Firstly, there
3509 * are a lot of assumptions on strong associations among work, cwq and 3409 * are a lot of assumptions on strong associations among work, pwq and
3510 * gcwq which make migrating pending and scheduled works very 3410 * pool which make migrating pending and scheduled works very
3511 * difficult to implement without impacting hot paths. Secondly, 3411 * difficult to implement without impacting hot paths. Secondly,
3512 * gcwqs serve mix of short, long and very long running works making 3412 * worker pools serve mix of short, long and very long running works making
3513 * blocked draining impractical. 3413 * blocked draining impractical.
3514 * 3414 *
3515 * This is solved by allowing a gcwq to be disassociated from the CPU 3415 * This is solved by allowing the pools to be disassociated from the CPU
3516 * running as an unbound one and allowing it to be reattached later if the 3416 * running as an unbound one and allowing it to be reattached later if the
3517 * cpu comes back online. 3417 * cpu comes back online.
3518 */ 3418 */
3519 3419
3520/* claim manager positions of all pools */ 3420static void wq_unbind_fn(struct work_struct *work)
3521static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
3522{ 3421{
3523 struct worker_pool *pool; 3422 int cpu = smp_processor_id();
3524
3525 for_each_worker_pool(pool, gcwq)
3526 mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
3527 spin_lock_irq(&gcwq->lock);
3528}
3529
3530/* release manager positions */
3531static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
3532{
3533 struct worker_pool *pool;
3534
3535 spin_unlock_irq(&gcwq->lock);
3536 for_each_worker_pool(pool, gcwq)
3537 mutex_unlock(&pool->assoc_mutex);
3538}
3539
3540static void gcwq_unbind_fn(struct work_struct *work)
3541{
3542 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3543 struct worker_pool *pool; 3423 struct worker_pool *pool;
3544 struct worker *worker; 3424 struct worker *worker;
3545 struct hlist_node *pos;
3546 int i; 3425 int i;
3547 3426
3548 BUG_ON(gcwq->cpu != smp_processor_id()); 3427 for_each_std_worker_pool(pool, cpu) {
3428 BUG_ON(cpu != smp_processor_id());
3549 3429
3550 gcwq_claim_assoc_and_lock(gcwq); 3430 mutex_lock(&pool->assoc_mutex);
3431 spin_lock_irq(&pool->lock);
3551 3432
3552 /* 3433 /*
3553 * We've claimed all manager positions. Make all workers unbound 3434 * We've claimed all manager positions. Make all workers
3554 * and set DISASSOCIATED. Before this, all workers except for the 3435 * unbound and set DISASSOCIATED. Before this, all workers
3555 * ones which are still executing works from before the last CPU 3436 * except for the ones which are still executing works from
3556 * down must be on the cpu. After this, they may become diasporas. 3437 * before the last CPU down must be on the cpu. After
3557 */ 3438 * this, they may become diasporas.
3558 for_each_worker_pool(pool, gcwq) 3439 */
3559 list_for_each_entry(worker, &pool->idle_list, entry) 3440 list_for_each_entry(worker, &pool->idle_list, entry)
3560 worker->flags |= WORKER_UNBOUND; 3441 worker->flags |= WORKER_UNBOUND;
3561 3442
3562 for_each_busy_worker(worker, i, pos, gcwq) 3443 for_each_busy_worker(worker, i, pool)
3563 worker->flags |= WORKER_UNBOUND; 3444 worker->flags |= WORKER_UNBOUND;
3564 3445
3565 gcwq->flags |= GCWQ_DISASSOCIATED; 3446 pool->flags |= POOL_DISASSOCIATED;
3566 3447
3567 gcwq_release_assoc_and_unlock(gcwq); 3448 spin_unlock_irq(&pool->lock);
3449 mutex_unlock(&pool->assoc_mutex);
3450 }
3568 3451
3569 /* 3452 /*
3570 * Call schedule() so that we cross rq->lock and thus can guarantee 3453 * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3576,16 +3459,16 @@ static void gcwq_unbind_fn(struct work_struct *work)
3576 /* 3459 /*
3577 * Sched callbacks are disabled now. Zap nr_running. After this, 3460 * Sched callbacks are disabled now. Zap nr_running. After this,
3578 * nr_running stays zero and need_more_worker() and keep_working() 3461 * nr_running stays zero and need_more_worker() and keep_working()
3579 * are always true as long as the worklist is not empty. @gcwq now 3462 * are always true as long as the worklist is not empty. Pools on
3580 * behaves as unbound (in terms of concurrency management) gcwq 3463 * @cpu now behave as unbound (in terms of concurrency management)
3581 * which is served by workers tied to the CPU. 3464 * pools which are served by workers tied to the CPU.
3582 * 3465 *
3583 * On return from this function, the current worker would trigger 3466 * On return from this function, the current worker would trigger
3584 * unbound chain execution of pending work items if other workers 3467 * unbound chain execution of pending work items if other workers
3585 * didn't already. 3468 * didn't already.
3586 */ 3469 */
3587 for_each_worker_pool(pool, gcwq) 3470 for_each_std_worker_pool(pool, cpu)
3588 atomic_set(get_pool_nr_running(pool), 0); 3471 atomic_set(&pool->nr_running, 0);
3589} 3472}
3590 3473
3591/* 3474/*
@@ -3597,12 +3480,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3597 void *hcpu) 3480 void *hcpu)
3598{ 3481{
3599 unsigned int cpu = (unsigned long)hcpu; 3482 unsigned int cpu = (unsigned long)hcpu;
3600 struct global_cwq *gcwq = get_gcwq(cpu);
3601 struct worker_pool *pool; 3483 struct worker_pool *pool;
3602 3484
3603 switch (action & ~CPU_TASKS_FROZEN) { 3485 switch (action & ~CPU_TASKS_FROZEN) {
3604 case CPU_UP_PREPARE: 3486 case CPU_UP_PREPARE:
3605 for_each_worker_pool(pool, gcwq) { 3487 for_each_std_worker_pool(pool, cpu) {
3606 struct worker *worker; 3488 struct worker *worker;
3607 3489
3608 if (pool->nr_workers) 3490 if (pool->nr_workers)
@@ -3612,18 +3494,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3612 if (!worker) 3494 if (!worker)
3613 return NOTIFY_BAD; 3495 return NOTIFY_BAD;
3614 3496
3615 spin_lock_irq(&gcwq->lock); 3497 spin_lock_irq(&pool->lock);
3616 start_worker(worker); 3498 start_worker(worker);
3617 spin_unlock_irq(&gcwq->lock); 3499 spin_unlock_irq(&pool->lock);
3618 } 3500 }
3619 break; 3501 break;
3620 3502
3621 case CPU_DOWN_FAILED: 3503 case CPU_DOWN_FAILED:
3622 case CPU_ONLINE: 3504 case CPU_ONLINE:
3623 gcwq_claim_assoc_and_lock(gcwq); 3505 for_each_std_worker_pool(pool, cpu) {
3624 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3506 mutex_lock(&pool->assoc_mutex);
3625 rebind_workers(gcwq); 3507 spin_lock_irq(&pool->lock);
3626 gcwq_release_assoc_and_unlock(gcwq); 3508
3509 pool->flags &= ~POOL_DISASSOCIATED;
3510 rebind_workers(pool);
3511
3512 spin_unlock_irq(&pool->lock);
3513 mutex_unlock(&pool->assoc_mutex);
3514 }
3627 break; 3515 break;
3628 } 3516 }
3629 return NOTIFY_OK; 3517 return NOTIFY_OK;
@@ -3643,7 +3531,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3643 switch (action & ~CPU_TASKS_FROZEN) { 3531 switch (action & ~CPU_TASKS_FROZEN) {
3644 case CPU_DOWN_PREPARE: 3532 case CPU_DOWN_PREPARE:
3645 /* unbinding should happen on the local CPU */ 3533 /* unbinding should happen on the local CPU */
3646 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); 3534 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3647 queue_work_on(cpu, system_highpri_wq, &unbind_work); 3535 queue_work_on(cpu, system_highpri_wq, &unbind_work);
3648 flush_work(&unbind_work); 3536 flush_work(&unbind_work);
3649 break; 3537 break;
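The CPU_DOWN_PREPARE branch above uses a common idiom: initialize a work item on the stack, queue it on the CPU of interest, and flush it so the caller blocks until the function has run there. A generic sketch of the same idiom on system_wq (example_fn and example_run_on are illustrative names):

#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void example_fn(struct work_struct *work)
{
	/* raw_ variant: a work item is preemptible, but it was queued to this CPU */
	pr_info("running on CPU %d\n", raw_smp_processor_id());
}

static void example_run_on(int cpu)
{
	struct work_struct w;

	INIT_WORK_ONSTACK(&w, example_fn);
	queue_work_on(cpu, system_wq, &w);
	flush_work(&w);		/* wait until example_fn has executed on cpu */
}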
@@ -3696,10 +3584,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3696 * 3584 *
3697 * Start freezing workqueues. After this function returns, all freezable 3585 * Start freezing workqueues. After this function returns, all freezable
3698 * workqueues will queue new works to their frozen_works list instead of 3586 * workqueues will queue new works to their frozen_works list instead of
3699 * gcwq->worklist. 3587 * pool->worklist.
3700 * 3588 *
3701 * CONTEXT: 3589 * CONTEXT:
3702 * Grabs and releases workqueue_lock and gcwq->lock's. 3590 * Grabs and releases workqueue_lock and pool->lock's.
3703 */ 3591 */
3704void freeze_workqueues_begin(void) 3592void freeze_workqueues_begin(void)
3705{ 3593{
@@ -3710,23 +3598,26 @@ void freeze_workqueues_begin(void)
3710 BUG_ON(workqueue_freezing); 3598 BUG_ON(workqueue_freezing);
3711 workqueue_freezing = true; 3599 workqueue_freezing = true;
3712 3600
3713 for_each_gcwq_cpu(cpu) { 3601 for_each_wq_cpu(cpu) {
3714 struct global_cwq *gcwq = get_gcwq(cpu); 3602 struct worker_pool *pool;
3715 struct workqueue_struct *wq; 3603 struct workqueue_struct *wq;
3716 3604
3717 spin_lock_irq(&gcwq->lock); 3605 for_each_std_worker_pool(pool, cpu) {
3606 spin_lock_irq(&pool->lock);
3718 3607
3719 BUG_ON(gcwq->flags & GCWQ_FREEZING); 3608 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3720 gcwq->flags |= GCWQ_FREEZING; 3609 pool->flags |= POOL_FREEZING;
3721 3610
3722 list_for_each_entry(wq, &workqueues, list) { 3611 list_for_each_entry(wq, &workqueues, list) {
3723 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3612 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3724 3613
3725 if (cwq && wq->flags & WQ_FREEZABLE) 3614 if (pwq && pwq->pool == pool &&
3726 cwq->max_active = 0; 3615 (wq->flags & WQ_FREEZABLE))
3727 } 3616 pwq->max_active = 0;
3617 }
3728 3618
3729 spin_unlock_irq(&gcwq->lock); 3619 spin_unlock_irq(&pool->lock);
3620 }
3730 } 3621 }
3731 3622
3732 spin_unlock(&workqueue_lock); 3623 spin_unlock(&workqueue_lock);
@@ -3754,20 +3645,20 @@ bool freeze_workqueues_busy(void)
3754 3645
3755 BUG_ON(!workqueue_freezing); 3646 BUG_ON(!workqueue_freezing);
3756 3647
3757 for_each_gcwq_cpu(cpu) { 3648 for_each_wq_cpu(cpu) {
3758 struct workqueue_struct *wq; 3649 struct workqueue_struct *wq;
3759 /* 3650 /*
3760 * nr_active is monotonically decreasing. It's safe 3651 * nr_active is monotonically decreasing. It's safe
3761 * to peek without lock. 3652 * to peek without lock.
3762 */ 3653 */
3763 list_for_each_entry(wq, &workqueues, list) { 3654 list_for_each_entry(wq, &workqueues, list) {
3764 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3655 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3765 3656
3766 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3657 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3767 continue; 3658 continue;
3768 3659
3769 BUG_ON(cwq->nr_active < 0); 3660 BUG_ON(pwq->nr_active < 0);
3770 if (cwq->nr_active) { 3661 if (pwq->nr_active) {
3771 busy = true; 3662 busy = true;
3772 goto out_unlock; 3663 goto out_unlock;
3773 } 3664 }
@@ -3782,10 +3673,10 @@ out_unlock:
3782 * thaw_workqueues - thaw workqueues 3673 * thaw_workqueues - thaw workqueues
3783 * 3674 *
3784 * Thaw workqueues. Normal queueing is restored and all collected 3675 * Thaw workqueues. Normal queueing is restored and all collected
3785 * frozen works are transferred to their respective gcwq worklists. 3676 * frozen works are transferred to their respective pool worklists.
3786 * 3677 *
3787 * CONTEXT: 3678 * CONTEXT:
3788 * Grabs and releases workqueue_lock and gcwq->lock's. 3679 * Grabs and releases workqueue_lock and pool->lock's.
3789 */ 3680 */
3790void thaw_workqueues(void) 3681void thaw_workqueues(void)
3791{ 3682{
@@ -3796,30 +3687,31 @@ void thaw_workqueues(void)
3796 if (!workqueue_freezing) 3687 if (!workqueue_freezing)
3797 goto out_unlock; 3688 goto out_unlock;
3798 3689
3799 for_each_gcwq_cpu(cpu) { 3690 for_each_wq_cpu(cpu) {
3800 struct global_cwq *gcwq = get_gcwq(cpu);
3801 struct worker_pool *pool; 3691 struct worker_pool *pool;
3802 struct workqueue_struct *wq; 3692 struct workqueue_struct *wq;
3803 3693
3804 spin_lock_irq(&gcwq->lock); 3694 for_each_std_worker_pool(pool, cpu) {
3695 spin_lock_irq(&pool->lock);
3805 3696
3806 BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); 3697 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
3807 gcwq->flags &= ~GCWQ_FREEZING; 3698 pool->flags &= ~POOL_FREEZING;
3808 3699
3809 list_for_each_entry(wq, &workqueues, list) { 3700 list_for_each_entry(wq, &workqueues, list) {
3810 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3701 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3811 3702
3812 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3703 if (!pwq || pwq->pool != pool ||
3813 continue; 3704 !(wq->flags & WQ_FREEZABLE))
3705 continue;
3814 3706
3815 /* restore max_active and repopulate worklist */ 3707 /* restore max_active and repopulate worklist */
3816 cwq_set_max_active(cwq, wq->saved_max_active); 3708 pwq_set_max_active(pwq, wq->saved_max_active);
3817 } 3709 }
3818 3710
3819 for_each_worker_pool(pool, gcwq)
3820 wake_up_worker(pool); 3711 wake_up_worker(pool);
3821 3712
3822 spin_unlock_irq(&gcwq->lock); 3713 spin_unlock_irq(&pool->lock);
3714 }
3823 } 3715 }
3824 3716
3825 workqueue_freezing = false; 3717 workqueue_freezing = false;
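Taken together, freeze_workqueues_begin(), freeze_workqueues_busy() and thaw_workqueues() give the freezer a three-step protocol: clamp max_active of freezable workqueues to zero, poll until in-flight work drains, and undo everything on resume or failure. A simplified, hedged sketch of a caller driving that sequence (the real suspend-time freezer also freezes tasks and reports which items are stuck):

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static int example_freeze_system_workqueues(void)
{
	unsigned long timeout = jiffies + HZ;

	freeze_workqueues_begin();

	while (freeze_workqueues_busy()) {
		if (time_after(jiffies, timeout)) {
			thaw_workqueues();	/* give up and undo the clamp */
			return -EBUSY;
		}
		msleep(10);
	}
	return 0;	/* thaw_workqueues() runs again on resume */
}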
@@ -3831,60 +3723,56 @@ out_unlock:
3831static int __init init_workqueues(void) 3723static int __init init_workqueues(void)
3832{ 3724{
3833 unsigned int cpu; 3725 unsigned int cpu;
3834 int i;
3835 3726
3836 /* make sure we have enough bits for OFFQ CPU number */ 3727 /* make sure we have enough bits for OFFQ pool ID */
3837 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < 3728 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3838 WORK_CPU_LAST); 3729 WORK_CPU_END * NR_STD_WORKER_POOLS);
3839 3730
3840 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 3731 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3841 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 3732 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3842 3733
3843 /* initialize gcwqs */ 3734 /* initialize CPU pools */
3844 for_each_gcwq_cpu(cpu) { 3735 for_each_wq_cpu(cpu) {
3845 struct global_cwq *gcwq = get_gcwq(cpu);
3846 struct worker_pool *pool; 3736 struct worker_pool *pool;
3847 3737
3848 spin_lock_init(&gcwq->lock); 3738 for_each_std_worker_pool(pool, cpu) {
3849 gcwq->cpu = cpu; 3739 spin_lock_init(&pool->lock);
3850 gcwq->flags |= GCWQ_DISASSOCIATED; 3740 pool->cpu = cpu;
3851 3741 pool->flags |= POOL_DISASSOCIATED;
3852 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3853 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3854
3855 for_each_worker_pool(pool, gcwq) {
3856 pool->gcwq = gcwq;
3857 INIT_LIST_HEAD(&pool->worklist); 3742 INIT_LIST_HEAD(&pool->worklist);
3858 INIT_LIST_HEAD(&pool->idle_list); 3743 INIT_LIST_HEAD(&pool->idle_list);
3744 hash_init(pool->busy_hash);
3859 3745
3860 init_timer_deferrable(&pool->idle_timer); 3746 init_timer_deferrable(&pool->idle_timer);
3861 pool->idle_timer.function = idle_worker_timeout; 3747 pool->idle_timer.function = idle_worker_timeout;
3862 pool->idle_timer.data = (unsigned long)pool; 3748 pool->idle_timer.data = (unsigned long)pool;
3863 3749
3864 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, 3750 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3865 (unsigned long)pool); 3751 (unsigned long)pool);
3866 3752
3867 mutex_init(&pool->assoc_mutex); 3753 mutex_init(&pool->assoc_mutex);
3868 ida_init(&pool->worker_ida); 3754 ida_init(&pool->worker_ida);
3755
3756 /* alloc pool ID */
3757 BUG_ON(worker_pool_assign_id(pool));
3869 } 3758 }
3870 } 3759 }
3871 3760
3872 /* create the initial worker */ 3761 /* create the initial worker */
3873 for_each_online_gcwq_cpu(cpu) { 3762 for_each_online_wq_cpu(cpu) {
3874 struct global_cwq *gcwq = get_gcwq(cpu);
3875 struct worker_pool *pool; 3763 struct worker_pool *pool;
3876 3764
3877 if (cpu != WORK_CPU_UNBOUND) 3765 for_each_std_worker_pool(pool, cpu) {
3878 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3879
3880 for_each_worker_pool(pool, gcwq) {
3881 struct worker *worker; 3766 struct worker *worker;
3882 3767
3768 if (cpu != WORK_CPU_UNBOUND)
3769 pool->flags &= ~POOL_DISASSOCIATED;
3770
3883 worker = create_worker(pool); 3771 worker = create_worker(pool);
3884 BUG_ON(!worker); 3772 BUG_ON(!worker);
3885 spin_lock_irq(&gcwq->lock); 3773 spin_lock_irq(&pool->lock);
3886 start_worker(worker); 3774 start_worker(worker);
3887 spin_unlock_irq(&gcwq->lock); 3775 spin_unlock_irq(&pool->lock);
3888 } 3776 }
3889 } 3777 }
3890 3778
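The BUILD_BUG_ON near the top of init_workqueues() guards the new off-queue encoding of work_struct->data: once a work item is no longer queued, the bits above WORK_OFFQ_POOL_SHIFT are expected to record the ID of the worker pool it last ran on (previously a CPU number), with the remaining low bits left for flags. A rough illustration of that packing with hypothetical helpers (the real constants and accessors live in include/linux/workqueue.h and workqueue.c):

/* pack a pool ID above WORK_OFFQ_POOL_SHIFT, keeping the low bits for flags */
static unsigned long example_pack_offq(int pool_id, unsigned long flag_bits)
{
	return ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | flag_bits;
}

static int example_unpack_pool_id(unsigned long data)
{
	return data >> WORK_OFFQ_POOL_SHIFT;
}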
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000000..07650264ec15
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
1/*
2 * kernel/workqueue_internal.h
3 *
4 * Workqueue internal header file. Only to be included by workqueue and
5 * core kernel subsystems.
6 */
7#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
8#define _KERNEL_WORKQUEUE_INTERNAL_H
9
10#include <linux/workqueue.h>
11#include <linux/kthread.h>
12
13struct worker_pool;
14
15/*
16 * The poor guys doing the actual heavy lifting. All on-duty workers are
17 * either serving the manager role, on idle list or on busy hash. For
18 * details on the locking annotation (L, I, X...), refer to workqueue.c.
19 *
20 * Only to be used in workqueue and async.
21 */
22struct worker {
23 /* on idle list while idle, on busy hash table while busy */
24 union {
25 struct list_head entry; /* L: while idle */
26 struct hlist_node hentry; /* L: while busy */
27 };
28
29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 struct list_head scheduled; /* L: scheduled works */
33 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */
39
40 /* for rebinding worker to CPU */
41 struct work_struct rebind_work; /* L: for busy worker */
42
43 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
45};
46
47/**
48 * current_wq_worker - return struct worker if %current is a workqueue worker
49 */
50static inline struct worker *current_wq_worker(void)
51{
52 if (current->flags & PF_WQ_WORKER)
53 return kthread_data(current);
54 return NULL;
55}
56
57/*
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c.
60 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task,
63 unsigned int cpu);
64
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
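The two declarations at the bottom of the new header are the scheduler's half of concurrency management: roughly, wq_worker_sleeping() is called when a PF_WQ_WORKER task is about to block (and may hand back another worker to wake so the pool keeps making progress), while wq_worker_waking_up() is called when such a task wakes. A simplified, hedged sketch of that call pattern; the real hooks sit in kernel/sched/core.c under the runqueue lock and use a lock-aware wakeup rather than wake_up_process():

/* illustrative only; not the scheduler's actual code */
static void example_on_block(struct task_struct *prev, unsigned int cpu)
{
	if (prev->flags & PF_WQ_WORKER) {
		struct task_struct *to_wakeup = wq_worker_sleeping(prev, cpu);

		if (to_wakeup)
			wake_up_process(to_wakeup);
	}
}

static void example_on_wakeup(struct task_struct *p, unsigned int cpu)
{
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu);
}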
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc79..000000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);