Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 44
-rw-r--r--  kernel/acct.c | 8
-rw-r--r--  kernel/async.c | 159
-rw-r--r--  kernel/audit.c | 40
-rw-r--r--  kernel/audit_tree.c | 26
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 20
-rw-r--r--  kernel/cgroup.c | 325
-rw-r--r--  kernel/compat.c | 97
-rw-r--r--  kernel/context_tracking.c | 114
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 884
-rw-r--r--  kernel/debug/debug_core.c | 1
-rw-r--r--  kernel/debug/gdbstub.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/events/core.c | 43
-rw-r--r--  kernel/events/hw_breakpoint.c | 2
-rw-r--r--  kernel/events/uprobes.c | 466
-rw-r--r--  kernel/exit.c | 16
-rw-r--r--  kernel/fork.c | 21
-rw-r--r--  kernel/futex.c | 3
-rw-r--r--  kernel/futex_compat.c | 21
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 38
-rw-r--r--  kernel/irq/chip.c | 30
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq/spurious.c | 7
-rw-r--r--  kernel/irq_work.c | 150
-rw-r--r--  kernel/kexec.c | 78
-rw-r--r--  kernel/kfifo.c | 609
-rw-r--r--  kernel/kmod.c | 9
-rw-r--r--  kernel/kprobes.c | 66
-rw-r--r--  kernel/lockdep.c | 32
-rw-r--r--  kernel/module.c | 257
-rw-r--r--  kernel/mutex.c | 1
-rw-r--r--  kernel/nsproxy.c | 5
-rw-r--r--  kernel/panic.c | 34
-rw-r--r--  kernel/pid.c | 5
-rw-r--r--  kernel/posix-cpu-timers.c | 51
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/autosleep.c | 2
-rw-r--r--  kernel/power/main.c | 29
-rw-r--r--  kernel/power/process.c | 4
-rw-r--r--  kernel/power/qos.c | 9
-rw-r--r--  kernel/power/suspend.c | 69
-rw-r--r--  kernel/power/suspend_test.c | 11
-rw-r--r--  kernel/printk.c | 36
-rw-r--r--  kernel/profile.c | 24
-rw-r--r--  kernel/ptrace.c | 80
-rw-r--r--  kernel/rcu.h | 7
-rw-r--r--  kernel/rcupdate.c | 60
-rw-r--r--  kernel/rcutiny.c | 8
-rw-r--r--  kernel/rcutiny_plugin.h | 56
-rw-r--r--  kernel/rcutorture.c | 66
-rw-r--r--  kernel/rcutree.c | 260
-rw-r--r--  kernel/rcutree.h | 11
-rw-r--r--  kernel/rcutree_plugin.h | 13
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 1
-rw-r--r--  kernel/rtmutex.c | 1
-rw-r--r--  kernel/rwsem.c | 10
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/core.c | 159
-rw-r--r--  kernel/sched/cpupri.c | 2
-rw-r--r--  kernel/sched/cputime.c | 314
-rw-r--r--  kernel/sched/debug.c | 101
-rw-r--r--  kernel/sched/fair.c | 29
-rw-r--r--  kernel/sched/rt.c | 28
-rw-r--r--  kernel/sched/sched.h | 2
-rw-r--r--  kernel/sched/stats.c | 79
-rw-r--r--  kernel/signal.c | 380
-rw-r--r--  kernel/smp.c | 192
-rw-r--r--  kernel/smpboot.c | 7
-rw-r--r--  kernel/softirq.c | 23
-rw-r--r--  kernel/srcu.c | 37
-rw-r--r--  kernel/stop_machine.c | 156
-rw-r--r--  kernel/sys.c | 311
-rw-r--r--  kernel/sysctl.c | 22
-rw-r--r--  kernel/sysctl_binary.c | 43
-rw-r--r--  kernel/time.c | 12
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/ntp.c | 48
-rw-r--r--  kernel/time/tick-broadcast.c | 38
-rw-r--r--  kernel/time/tick-sched.c | 12
-rw-r--r--  kernel/time/timekeeping.c | 71
-rw-r--r--  kernel/timeconst.bc | 108
-rw-r--r--  kernel/timeconst.pl | 378
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 33
-rw-r--r--  kernel/trace/blktrace.c | 2
-rw-r--r--  kernel/trace/ftrace.c | 160
-rw-r--r--  kernel/trace/power-traces.c | 3
-rw-r--r--  kernel/trace/ring_buffer.c | 108
-rw-r--r--  kernel/trace/trace.c | 270
-rw-r--r--  kernel/trace/trace.h | 134
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 1
-rw-r--r--  kernel/trace/trace_functions.c | 61
-rw-r--r--  kernel/trace/trace_functions_graph.c | 68
-rw-r--r--  kernel/trace/trace_output.c | 3
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_selftest.c | 21
-rw-r--r--  kernel/trace/trace_syscalls.c | 61
-rw-r--r--  kernel/trace/trace_uprobe.c | 217
-rw-r--r--  kernel/tracepoint.c | 6
-rw-r--r--  kernel/tsacct.c | 44
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user.c | 7
-rw-r--r--  kernel/user_namespace.c | 62
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 3
-rw-r--r--  kernel/watchdog.c | 11
-rw-r--r--  kernel/workqueue.c | 1533
-rw-r--r--  kernel/workqueue_internal.h | 65
-rw-r--r--  kernel/workqueue_sched.h | 9
121 files changed, 5528 insertions, 4412 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da239..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/ 26obj-y += power/
27 27
28ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) 28obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
29obj-$(CONFIG_X86) += kcmp.o
30endif
31obj-$(CONFIG_FREEZER) += freezer.o 29obj-$(CONFIG_FREEZER) += freezer.o
32obj-$(CONFIG_PROFILING) += profile.o 30obj-$(CONFIG_PROFILING) += profile.o
33obj-$(CONFIG_STACKTRACE) += stacktrace.o 31obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -127,11 +125,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
127 125
128$(obj)/time.o: $(obj)/timeconst.h 126$(obj)/time.o: $(obj)/timeconst.h
129 127
130quiet_cmd_timeconst = TIMEC $@ 128quiet_cmd_hzfile = HZFILE $@
131 cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ 129 cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
130
131targets += hz.bc
132$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
133 $(call if_changed,hzfile)
134
135quiet_cmd_bc = BC $@
136 cmd_bc = bc -q $(filter-out FORCE,$^) > $@
137
132targets += timeconst.h 138targets += timeconst.h
133$(obj)/timeconst.h: $(src)/timeconst.pl FORCE 139$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
134 $(call if_changed,timeconst) 140 $(call if_changed,bc)
135 141
136ifeq ($(CONFIG_MODULE_SIG),y) 142ifeq ($(CONFIG_MODULE_SIG),y)
137# 143#
@@ -153,23 +159,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates
153# fail and that the kernel may be used afterwards. 159# fail and that the kernel may be used afterwards.
154# 160#
155############################################################################### 161###############################################################################
156sign_key_with_hash := 162ifndef CONFIG_MODULE_SIG_HASH
157ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
158sign_key_with_hash := -sha1
159endif
160ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
161sign_key_with_hash := -sha224
162endif
163ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
164sign_key_with_hash := -sha256
165endif
166ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
167sign_key_with_hash := -sha384
168endif
169ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
170sign_key_with_hash := -sha512
171endif
172ifeq ($(sign_key_with_hash),)
173$(error Could not determine digest type to use from kernel config) 163$(error Could not determine digest type to use from kernel config)
174endif 164endif
175 165
@@ -182,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
182 @echo "### needs to be run as root, and uses a hardware random" 172 @echo "### needs to be run as root, and uses a hardware random"
183 @echo "### number generator if one is available." 173 @echo "### number generator if one is available."
184 @echo "###" 174 @echo "###"
185 openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ 175 openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
186 -x509 -config x509.genkey \ 176 -batch -x509 -config x509.genkey \
187 -outform DER -out signing_key.x509 \ 177 -outform DER -out signing_key.x509 \
188 -keyout signing_key.priv 178 -keyout signing_key.priv
189 @echo "###" 179 @echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
205 if (IS_ERR(file)) 205 if (IS_ERR(file))
206 return PTR_ERR(file); 206 return PTR_ERR(file);
207 207
208 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { 208 if (!S_ISREG(file_inode(file)->i_mode)) {
209 filp_close(file, NULL); 209 filp_close(file, NULL);
210 return -EACCES; 210 return -EACCES;
211 } 211 }
@@ -566,6 +566,7 @@ out:
566void acct_collect(long exitcode, int group_dead) 566void acct_collect(long exitcode, int group_dead)
567{ 567{
568 struct pacct_struct *pacct = &current->signal->pacct; 568 struct pacct_struct *pacct = &current->signal->pacct;
569 cputime_t utime, stime;
569 unsigned long vsize = 0; 570 unsigned long vsize = 0;
570 571
571 if (group_dead && current->mm) { 572 if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
593 pacct->ac_flag |= ACORE; 594 pacct->ac_flag |= ACORE;
594 if (current->flags & PF_SIGNALED) 595 if (current->flags & PF_SIGNALED)
595 pacct->ac_flag |= AXSIG; 596 pacct->ac_flag |= AXSIG;
596 pacct->ac_utime += current->utime; 597 task_cputime(current, &utime, &stime);
597 pacct->ac_stime += current->stime; 598 pacct->ac_utime += utime;
599 pacct->ac_stime += stime;
598 pacct->ac_minflt += current->min_flt; 600 pacct->ac_minflt += current->min_flt;
599 pacct->ac_majflt += current->maj_flt; 601 pacct->ac_majflt += current->maj_flt;
600 spin_unlock_irq(&current->sighand->siglock); 602 spin_unlock_irq(&current->sighand->siglock);
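An aside, not part of the patch: the acct.c hunk above stops reading current->utime/->stime directly and goes through the task_cputime() accessor instead, which hides how those counters are maintained under the different CONFIG_VIRT_CPU_ACCOUNTING options. A minimal sketch of that pattern, assuming the 3.8-era cputime_t types and a hypothetical helper name:

/*
 * Illustrative sketch only (hypothetical helper, not in the patch):
 * read a task's CPU times through the accessor rather than touching
 * the ->utime/->stime fields directly.
 */
#include <linux/sched.h>
#include <linux/printk.h>

static void report_self_cputime(void)
{
	cputime_t utime, stime;

	task_cputime(current, &utime, &stime);
	pr_info("utime=%llu stime=%llu (cputime_t units)\n",
		(unsigned long long)utime, (unsigned long long)stime);
}
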
diff --git a/kernel/async.c b/kernel/async.c
index 9d3118384858..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,56 +57,52 @@ asynchronous and synchronous parts of the kernel.
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/workqueue.h> 58#include <linux/workqueue.h>
59 59
60#include "workqueue_internal.h"
61
60static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
61 63
62#define MAX_WORK 32768 64#define MAX_WORK 32768
65#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
63 66
64static LIST_HEAD(async_pending); 67static LIST_HEAD(async_global_pending); /* pending from all registered doms */
65static ASYNC_DOMAIN(async_running); 68static ASYNC_DOMAIN(async_dfl_domain);
66static LIST_HEAD(async_domains);
67static DEFINE_SPINLOCK(async_lock); 69static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
69 70
70struct async_entry { 71struct async_entry {
71 struct list_head list; 72 struct list_head domain_list;
73 struct list_head global_list;
72 struct work_struct work; 74 struct work_struct work;
73 async_cookie_t cookie; 75 async_cookie_t cookie;
74 async_func_ptr *func; 76 async_func_ptr *func;
75 void *data; 77 void *data;
76 struct async_domain *running; 78 struct async_domain *domain;
77}; 79};
78 80
79static DECLARE_WAIT_QUEUE_HEAD(async_done); 81static DECLARE_WAIT_QUEUE_HEAD(async_done);
80 82
81static atomic_t entry_count; 83static atomic_t entry_count;
82 84
83 85static async_cookie_t lowest_in_progress(struct async_domain *domain)
84/*
85 * MUST be called with the lock held!
86 */
87static async_cookie_t __lowest_in_progress(struct async_domain *running)
88{ 86{
89 struct async_entry *entry; 87 struct async_entry *first = NULL;
90 88 async_cookie_t ret = ASYNC_COOKIE_MAX;
91 if (!list_empty(&running->domain)) { 89 unsigned long flags;
92 entry = list_first_entry(&running->domain, typeof(*entry), list);
93 return entry->cookie;
94 }
95 90
96 list_for_each_entry(entry, &async_pending, list) 91 spin_lock_irqsave(&async_lock, flags);
97 if (entry->running == running)
98 return entry->cookie;
99 92
100 return next_cookie; /* "infinity" value */ 93 if (domain) {
101} 94 if (!list_empty(&domain->pending))
95 first = list_first_entry(&domain->pending,
96 struct async_entry, domain_list);
97 } else {
98 if (!list_empty(&async_global_pending))
99 first = list_first_entry(&async_global_pending,
100 struct async_entry, global_list);
101 }
102 102
103static async_cookie_t lowest_in_progress(struct async_domain *running) 103 if (first)
104{ 104 ret = first->cookie;
105 unsigned long flags;
106 async_cookie_t ret;
107 105
108 spin_lock_irqsave(&async_lock, flags);
109 ret = __lowest_in_progress(running);
110 spin_unlock_irqrestore(&async_lock, flags); 106 spin_unlock_irqrestore(&async_lock, flags);
111 return ret; 107 return ret;
112} 108}
@@ -120,14 +116,8 @@ static void async_run_entry_fn(struct work_struct *work)
120 container_of(work, struct async_entry, work); 116 container_of(work, struct async_entry, work);
121 unsigned long flags; 117 unsigned long flags;
122 ktime_t uninitialized_var(calltime), delta, rettime; 118 ktime_t uninitialized_var(calltime), delta, rettime;
123 struct async_domain *running = entry->running;
124 119
125 /* 1) move self to the running queue */ 120 /* 1) run (and print duration) */
126 spin_lock_irqsave(&async_lock, flags);
127 list_move_tail(&entry->list, &running->domain);
128 spin_unlock_irqrestore(&async_lock, flags);
129
130 /* 2) run (and print duration) */
131 if (initcall_debug && system_state == SYSTEM_BOOTING) { 121 if (initcall_debug && system_state == SYSTEM_BOOTING) {
132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 122 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie, 123 (long long)entry->cookie,
@@ -144,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
144 (long long)ktime_to_ns(delta) >> 10); 134 (long long)ktime_to_ns(delta) >> 10);
145 } 135 }
146 136
147 /* 3) remove self from the running queue */ 137 /* 2) remove self from the pending queues */
148 spin_lock_irqsave(&async_lock, flags); 138 spin_lock_irqsave(&async_lock, flags);
149 list_del(&entry->list); 139 list_del_init(&entry->domain_list);
150 if (running->registered && --running->count == 0) 140 list_del_init(&entry->global_list);
151 list_del_init(&running->node);
152 141
153 /* 4) free the entry */ 142 /* 3) free the entry */
154 kfree(entry); 143 kfree(entry);
155 atomic_dec(&entry_count); 144 atomic_dec(&entry_count);
156 145
157 spin_unlock_irqrestore(&async_lock, flags); 146 spin_unlock_irqrestore(&async_lock, flags);
158 147
159 /* 5) wake up any waiters */ 148 /* 4) wake up any waiters */
160 wake_up(&async_done); 149 wake_up(&async_done);
161} 150}
162 151
163static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) 152static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
164{ 153{
165 struct async_entry *entry; 154 struct async_entry *entry;
166 unsigned long flags; 155 unsigned long flags;
@@ -183,19 +172,28 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
183 ptr(data, newcookie); 172 ptr(data, newcookie);
184 return newcookie; 173 return newcookie;
185 } 174 }
175 INIT_LIST_HEAD(&entry->domain_list);
176 INIT_LIST_HEAD(&entry->global_list);
186 INIT_WORK(&entry->work, async_run_entry_fn); 177 INIT_WORK(&entry->work, async_run_entry_fn);
187 entry->func = ptr; 178 entry->func = ptr;
188 entry->data = data; 179 entry->data = data;
189 entry->running = running; 180 entry->domain = domain;
190 181
191 spin_lock_irqsave(&async_lock, flags); 182 spin_lock_irqsave(&async_lock, flags);
183
184 /* allocate cookie and queue */
192 newcookie = entry->cookie = next_cookie++; 185 newcookie = entry->cookie = next_cookie++;
193 list_add_tail(&entry->list, &async_pending); 186
194 if (running->registered && running->count++ == 0) 187 list_add_tail(&entry->domain_list, &domain->pending);
195 list_add_tail(&running->node, &async_domains); 188 if (domain->registered)
189 list_add_tail(&entry->global_list, &async_global_pending);
190
196 atomic_inc(&entry_count); 191 atomic_inc(&entry_count);
197 spin_unlock_irqrestore(&async_lock, flags); 192 spin_unlock_irqrestore(&async_lock, flags);
198 193
194 /* mark that this task has queued an async job, used by module init */
195 current->flags |= PF_USED_ASYNC;
196
199 /* schedule for execution */ 197 /* schedule for execution */
200 queue_work(system_unbound_wq, &entry->work); 198 queue_work(system_unbound_wq, &entry->work);
201 199
@@ -212,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
212 */ 210 */
213async_cookie_t async_schedule(async_func_ptr *ptr, void *data) 211async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
214{ 212{
215 return __async_schedule(ptr, data, &async_running); 213 return __async_schedule(ptr, data, &async_dfl_domain);
216} 214}
217EXPORT_SYMBOL_GPL(async_schedule); 215EXPORT_SYMBOL_GPL(async_schedule);
218 216
@@ -220,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
220 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain 218 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
221 * @ptr: function to execute asynchronously 219 * @ptr: function to execute asynchronously
222 * @data: data pointer to pass to the function 220 * @data: data pointer to pass to the function
223 * @running: running list for the domain 221 * @domain: the domain
224 * 222 *
225 * Returns an async_cookie_t that may be used for checkpointing later. 223 * Returns an async_cookie_t that may be used for checkpointing later.
226 * @running may be used in the async_synchronize_*_domain() functions 224 * @domain may be used in the async_synchronize_*_domain() functions to
227 * to wait within a certain synchronization domain rather than globally. 225 * wait within a certain synchronization domain rather than globally. A
228 * A synchronization domain is specified via the running queue @running to use. 226 * synchronization domain is specified via @domain. Note: This function
229 * Note: This function may be called from atomic or non-atomic contexts. 227 * may be called from atomic or non-atomic contexts.
230 */ 228 */
231async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 229async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
232 struct async_domain *running) 230 struct async_domain *domain)
233{ 231{
234 return __async_schedule(ptr, data, running); 232 return __async_schedule(ptr, data, domain);
235} 233}
236EXPORT_SYMBOL_GPL(async_schedule_domain); 234EXPORT_SYMBOL_GPL(async_schedule_domain);
237 235
@@ -242,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
242 */ 240 */
243void async_synchronize_full(void) 241void async_synchronize_full(void)
244{ 242{
245 mutex_lock(&async_register_mutex); 243 async_synchronize_full_domain(NULL);
246 do {
247 struct async_domain *domain = NULL;
248
249 spin_lock_irq(&async_lock);
250 if (!list_empty(&async_domains))
251 domain = list_first_entry(&async_domains, typeof(*domain), node);
252 spin_unlock_irq(&async_lock);
253
254 async_synchronize_cookie_domain(next_cookie, domain);
255 } while (!list_empty(&async_domains));
256 mutex_unlock(&async_register_mutex);
257} 244}
258EXPORT_SYMBOL_GPL(async_synchronize_full); 245EXPORT_SYMBOL_GPL(async_synchronize_full);
259 246
@@ -268,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
268 */ 255 */
269void async_unregister_domain(struct async_domain *domain) 256void async_unregister_domain(struct async_domain *domain)
270{ 257{
271 mutex_lock(&async_register_mutex);
272 spin_lock_irq(&async_lock); 258 spin_lock_irq(&async_lock);
273 WARN_ON(!domain->registered || !list_empty(&domain->node) || 259 WARN_ON(!domain->registered || !list_empty(&domain->pending));
274 !list_empty(&domain->domain));
275 domain->registered = 0; 260 domain->registered = 0;
276 spin_unlock_irq(&async_lock); 261 spin_unlock_irq(&async_lock);
277 mutex_unlock(&async_register_mutex);
278} 262}
279EXPORT_SYMBOL_GPL(async_unregister_domain); 263EXPORT_SYMBOL_GPL(async_unregister_domain);
280 264
281/** 265/**
282 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 266 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
283 * @domain: running list to synchronize on 267 * @domain: the domain to synchronize
284 * 268 *
285 * This function waits until all asynchronous function calls for the 269 * This function waits until all asynchronous function calls for the
286 * synchronization domain specified by the running list @domain have been done. 270 * synchronization domain specified by @domain have been done.
287 */ 271 */
288void async_synchronize_full_domain(struct async_domain *domain) 272void async_synchronize_full_domain(struct async_domain *domain)
289{ 273{
290 async_synchronize_cookie_domain(next_cookie, domain); 274 async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
291} 275}
292EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 276EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
293 277
294/** 278/**
295 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing 279 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
296 * @cookie: async_cookie_t to use as checkpoint 280 * @cookie: async_cookie_t to use as checkpoint
297 * @running: running list to synchronize on 281 * @domain: the domain to synchronize (%NULL for all registered domains)
298 * 282 *
299 * This function waits until all asynchronous function calls for the 283 * This function waits until all asynchronous function calls for the
300 * synchronization domain specified by running list @running submitted 284 * synchronization domain specified by @domain submitted prior to @cookie
301 * prior to @cookie have been done. 285 * have been done.
302 */ 286 */
303void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) 287void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
304{ 288{
305 ktime_t uninitialized_var(starttime), delta, endtime; 289 ktime_t uninitialized_var(starttime), delta, endtime;
306 290
307 if (!running)
308 return;
309
310 if (initcall_debug && system_state == SYSTEM_BOOTING) { 291 if (initcall_debug && system_state == SYSTEM_BOOTING) {
311 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 292 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
312 starttime = ktime_get(); 293 starttime = ktime_get();
313 } 294 }
314 295
315 wait_event(async_done, lowest_in_progress(running) >= cookie); 296 wait_event(async_done, lowest_in_progress(domain) >= cookie);
316 297
317 if (initcall_debug && system_state == SYSTEM_BOOTING) { 298 if (initcall_debug && system_state == SYSTEM_BOOTING) {
318 endtime = ktime_get(); 299 endtime = ktime_get();
@@ -334,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
334 */ 315 */
335void async_synchronize_cookie(async_cookie_t cookie) 316void async_synchronize_cookie(async_cookie_t cookie)
336{ 317{
337 async_synchronize_cookie_domain(cookie, &async_running); 318 async_synchronize_cookie_domain(cookie, &async_dfl_domain);
338} 319}
339EXPORT_SYMBOL_GPL(async_synchronize_cookie); 320EXPORT_SYMBOL_GPL(async_synchronize_cookie);
321
322/**
323 * current_is_async - is %current an async worker task?
324 *
325 * Returns %true if %current is an async worker task.
326 */
327bool current_is_async(void)
328{
329 struct worker *worker = current_wq_worker();
330
331 return worker && worker->current_func == async_run_entry_fn;
332}
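An aside, not part of the patch: the reworked async.c keeps a per-domain pending list plus a global one, reached through the async domain API touched in the hunks above. A minimal usage sketch, assuming the 3.8-era async_func_ptr prototype and a hypothetical driver-side domain:

/*
 * Illustrative sketch only: queue two jobs on a private async domain and
 * wait for that domain alone, instead of using async_synchronize_full().
 */
#include <linux/async.h>
#include <linux/printk.h>

static ASYNC_DOMAIN(example_probe_domain);	/* hypothetical domain */

static void example_async_probe(void *data, async_cookie_t cookie)
{
	pr_info("async probe for %s (cookie %llu)\n",
		(char *)data, (unsigned long long)cookie);
}

static void example_kick_probes(void)
{
	async_schedule_domain(example_async_probe, "dev0", &example_probe_domain);
	async_schedule_domain(example_async_probe, "dev1", &example_probe_domain);

	/* Waits only for entries queued on example_probe_domain. */
	async_synchronize_full_domain(&example_probe_domain);
}
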
diff --git a/kernel/audit.c b/kernel/audit.c
index 40414e9143db..d596e5355f15 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
272 int rc = 0; 272 int rc = 0;
273 273
274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
275 if (unlikely(!ab))
276 return rc;
275 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 277 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
276 old, from_kuid(&init_user_ns, loginuid), sessionid); 278 old, from_kuid(&init_user_ns, loginuid), sessionid);
277 if (sid) { 279 if (sid) {
@@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
619 } 621 }
620 622
621 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 623 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
624 if (unlikely(!*ab))
625 return rc;
622 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", 626 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
623 task_tgid_vnr(current), 627 task_tgid_vnr(current),
624 from_kuid(&init_user_ns, current_uid()), 628 from_kuid(&init_user_ns, current_uid()),
@@ -1097,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1097 } 1101 }
1098} 1102}
1099 1103
1104/*
1105 * Wait for auditd to drain the queue a little
1106 */
1107static void wait_for_auditd(unsigned long sleep_time)
1108{
1109 DECLARE_WAITQUEUE(wait, current);
1110 set_current_state(TASK_INTERRUPTIBLE);
1111 add_wait_queue(&audit_backlog_wait, &wait);
1112
1113 if (audit_backlog_limit &&
1114 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1115 schedule_timeout(sleep_time);
1116
1117 __set_current_state(TASK_RUNNING);
1118 remove_wait_queue(&audit_backlog_wait, &wait);
1119}
1120
1100/* Obtain an audit buffer. This routine does locking to obtain the 1121/* Obtain an audit buffer. This routine does locking to obtain the
1101 * audit buffer, but then no locking is required for calls to 1122 * audit buffer, but then no locking is required for calls to
1102 * audit_log_*format. If the tsk is a task that is currently in a 1123 * audit_log_*format. If the tsk is a task that is currently in a
@@ -1142,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1142 1163
1143 while (audit_backlog_limit 1164 while (audit_backlog_limit
1144 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1165 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1145 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time 1166 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
1146 && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { 1167 unsigned long sleep_time;
1147 1168
1148 /* Wait for auditd to drain the queue a little */ 1169 sleep_time = timeout_start + audit_backlog_wait_time -
1149 DECLARE_WAITQUEUE(wait, current); 1170 jiffies;
1150 set_current_state(TASK_INTERRUPTIBLE); 1171 if ((long)sleep_time > 0)
1151 add_wait_queue(&audit_backlog_wait, &wait); 1172 wait_for_auditd(sleep_time);
1152
1153 if (audit_backlog_limit &&
1154 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1155 schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
1156
1157 __set_current_state(TASK_RUNNING);
1158 remove_wait_queue(&audit_backlog_wait, &wait);
1159 continue; 1173 continue;
1160 } 1174 }
1161 if (audit_rate_check() && printk_ratelimit()) 1175 if (audit_rate_check() && printk_ratelimit())
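An aside, not part of the patch: the recurring audit fix in this series is to check the return value of audit_log_start(), which can be NULL when auditing is disabled or the buffer allocation fails, before formatting into it. The pattern the hunks converge on looks roughly like this (hypothetical message contents):

/*
 * Illustrative sketch only: NULL-checked audit logging, mirroring the
 * unlikely(!ab) checks added throughout the audit hunks above.
 */
#include <linux/audit.h>
#include <linux/gfp.h>

static void example_log_config_change(int new_val, int old_val)
{
	struct audit_buffer *ab;

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
	if (unlikely(!ab))
		return;	/* auditing disabled or allocation failed */
	audit_log_format(ab, "example=%d old=%d", new_val, old_val);
	audit_log_end(ab);
}
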
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e81175ef25f8..642a89c4f3d6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -449,11 +449,26 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
449 return 0; 449 return 0;
450} 450}
451 451
452static void audit_log_remove_rule(struct audit_krule *rule)
453{
454 struct audit_buffer *ab;
455
456 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
457 if (unlikely(!ab))
458 return;
459 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule");
461 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey);
464 audit_log_format(ab, " list=%d res=1", rule->listnr);
465 audit_log_end(ab);
466}
467
452static void kill_rules(struct audit_tree *tree) 468static void kill_rules(struct audit_tree *tree)
453{ 469{
454 struct audit_krule *rule, *next; 470 struct audit_krule *rule, *next;
455 struct audit_entry *entry; 471 struct audit_entry *entry;
456 struct audit_buffer *ab;
457 472
458 list_for_each_entry_safe(rule, next, &tree->rules, rlist) { 473 list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
459 entry = container_of(rule, struct audit_entry, rule); 474 entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
461 list_del_init(&rule->rlist); 476 list_del_init(&rule->rlist);
462 if (rule->tree) { 477 if (rule->tree) {
463 /* not a half-baked one */ 478 /* not a half-baked one */
464 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 479 audit_log_remove_rule(rule);
465 audit_log_format(ab, "op=");
466 audit_log_string(ab, "remove rule");
467 audit_log_format(ab, " dir=");
468 audit_log_untrustedstring(ab, rule->tree->pathname);
469 audit_log_key(ab, rule->filterkey);
470 audit_log_format(ab, " list=%d res=1", rule->listnr);
471 audit_log_end(ab);
472 rule->tree = NULL; 480 rule->tree = NULL;
473 list_del_rcu(&entry->list); 481 list_del_rcu(&entry->list);
474 list_del(&entry->rule.list); 482 list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 4a599f699adc..22831c4d369c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
240 if (audit_enabled) { 240 if (audit_enabled) {
241 struct audit_buffer *ab; 241 struct audit_buffer *ab;
242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); 242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
243 if (unlikely(!ab))
244 return;
243 audit_log_format(ab, "auid=%u ses=%u op=", 245 audit_log_format(ab, "auid=%u ses=%u op=",
244 from_kuid(&init_user_ns, audit_get_loginuid(current)), 246 from_kuid(&init_user_ns, audit_get_loginuid(current)),
245 audit_get_sessionid(current)); 247 audit_get_sessionid(current));
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7f19f23d38a3..f9fc54bbe06f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1144 * audit_receive_filter - apply all rules to the specified message type 1144 * audit_receive_filter - apply all rules to the specified message type
1145 * @type: audit message type 1145 * @type: audit message type
1146 * @pid: target pid for netlink audit messages 1146 * @pid: target pid for netlink audit messages
1147 * @uid: target uid for netlink audit messages
1148 * @seq: netlink audit message sequence (serial) number 1147 * @seq: netlink audit message sequence (serial) number
1149 * @data: payload data 1148 * @data: payload data
1150 * @datasz: size of payload data 1149 * @datasz: size of payload data
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e37e6a12c5e3..a371f857a0a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1464,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic)
1464 audit_log_end(ab); 1464 audit_log_end(ab);
1465 ab = audit_log_start(context, GFP_KERNEL, 1465 ab = audit_log_start(context, GFP_KERNEL,
1466 AUDIT_IPC_SET_PERM); 1466 AUDIT_IPC_SET_PERM);
1467 if (unlikely(!ab))
1468 return;
1467 audit_log_format(ab, 1469 audit_log_format(ab,
1468 "qbytes=%lx ouid=%u ogid=%u mode=%#ho", 1470 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1469 context->ipc.qbytes, 1471 context->ipc.qbytes,
1470 context->ipc.perm_uid, 1472 context->ipc.perm_uid,
1471 context->ipc.perm_gid, 1473 context->ipc.perm_gid,
1472 context->ipc.perm_mode); 1474 context->ipc.perm_mode);
1473 if (!ab)
1474 return;
1475 } 1475 }
1476 break; } 1476 break; }
1477 case AUDIT_MQ_OPEN: { 1477 case AUDIT_MQ_OPEN: {
@@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags)
2675 context->type = AUDIT_MMAP; 2675 context->type = AUDIT_MMAP;
2676} 2676}
2677 2677
2678static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) 2678static void audit_log_task(struct audit_buffer *ab)
2679{ 2679{
2680 kuid_t auid, uid; 2680 kuid_t auid, uid;
2681 kgid_t gid; 2681 kgid_t gid;
@@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2693 audit_log_task_context(ab); 2693 audit_log_task_context(ab);
2694 audit_log_format(ab, " pid=%d comm=", current->pid); 2694 audit_log_format(ab, " pid=%d comm=", current->pid);
2695 audit_log_untrustedstring(ab, current->comm); 2695 audit_log_untrustedstring(ab, current->comm);
2696}
2697
2698static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2699{
2700 audit_log_task(ab);
2696 audit_log_format(ab, " reason="); 2701 audit_log_format(ab, " reason=");
2697 audit_log_string(ab, reason); 2702 audit_log_string(ab, reason);
2698 audit_log_format(ab, " sig=%ld", signr); 2703 audit_log_format(ab, " sig=%ld", signr);
@@ -2715,6 +2720,8 @@ void audit_core_dumps(long signr)
2715 return; 2720 return;
2716 2721
2717 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2722 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2723 if (unlikely(!ab))
2724 return;
2718 audit_log_abend(ab, "memory violation", signr); 2725 audit_log_abend(ab, "memory violation", signr);
2719 audit_log_end(ab); 2726 audit_log_end(ab);
2720} 2727}
@@ -2723,8 +2730,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
2723{ 2730{
2724 struct audit_buffer *ab; 2731 struct audit_buffer *ab;
2725 2732
2726 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2733 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
2727 audit_log_abend(ab, "seccomp", signr); 2734 if (unlikely(!ab))
2735 return;
2736 audit_log_task(ab);
2737 audit_log_format(ab, " sig=%ld", signr);
2728 audit_log_format(ab, " syscall=%ld", syscall); 2738 audit_log_format(ab, " syscall=%ld", syscall);
2729 audit_log_format(ab, " compat=%d", is_compat_task()); 2739 audit_log_format(ab, " compat=%d", is_compat_task());
2730 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); 2740 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
52#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/delayacct.h> 53#include <linux/delayacct.h>
54#include <linux/cgroupstats.h> 54#include <linux/cgroupstats.h>
55#include <linux/hash.h> 55#include <linux/hashtable.h>
56#include <linux/namei.h> 56#include <linux/namei.h>
57#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
58#include <linux/idr.h> 58#include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
376 * account cgroups in empty hierarchies. 376 * account cgroups in empty hierarchies.
377 */ 377 */
378#define CSS_SET_HASH_BITS 7 378#define CSS_SET_HASH_BITS 7
379#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 379static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
380static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
381 380
382static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 381static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383{ 382{
384 int i; 383 int i;
385 int index; 384 unsigned long key = 0UL;
386 unsigned long tmp = 0UL;
387 385
388 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
389 tmp += (unsigned long)css[i]; 387 key += (unsigned long)css[i];
390 tmp = (tmp >> 16) ^ tmp; 388 key = (key >> 16) ^ key;
391 389
392 index = hash_long(tmp, CSS_SET_HASH_BITS); 390 return key;
393
394 return &css_set_table[index];
395} 391}
396 392
397/* We don't maintain the lists running through each css_set to its 393/* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
418 } 414 }
419 415
420 /* This css_set is dead. unlink it and release cgroup refcounts */ 416 /* This css_set is dead. unlink it and release cgroup refcounts */
421 hlist_del(&cg->hlist); 417 hash_del(&cg->hlist);
422 css_set_count--; 418 css_set_count--;
423 419
424 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 420 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
426 struct cgroup *cgrp = link->cgrp; 422 struct cgroup *cgrp = link->cgrp;
427 list_del(&link->cg_link_list); 423 list_del(&link->cg_link_list);
428 list_del(&link->cgrp_link_list); 424 list_del(&link->cgrp_link_list);
425
426 /*
427 * We may not be holding cgroup_mutex, and if cgrp->count is
428 * dropped to 0 the cgroup can be destroyed at any time, hence
429 * rcu_read_lock is used to keep it alive.
430 */
431 rcu_read_lock();
429 if (atomic_dec_and_test(&cgrp->count) && 432 if (atomic_dec_and_test(&cgrp->count) &&
430 notify_on_release(cgrp)) { 433 notify_on_release(cgrp)) {
431 if (taskexit) 434 if (taskexit)
432 set_bit(CGRP_RELEASABLE, &cgrp->flags); 435 set_bit(CGRP_RELEASABLE, &cgrp->flags);
433 check_for_release(cgrp); 436 check_for_release(cgrp);
434 } 437 }
438 rcu_read_unlock();
435 439
436 kfree(link); 440 kfree(link);
437 } 441 }
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set(
550{ 554{
551 int i; 555 int i;
552 struct cgroupfs_root *root = cgrp->root; 556 struct cgroupfs_root *root = cgrp->root;
553 struct hlist_head *hhead;
554 struct hlist_node *node;
555 struct css_set *cg; 557 struct css_set *cg;
558 unsigned long key;
556 559
557 /* 560 /*
558 * Build the set of subsystem state objects that we want to see in the 561 * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set(
572 } 575 }
573 } 576 }
574 577
575 hhead = css_set_hash(template); 578 key = css_set_hash(template);
576 hlist_for_each_entry(cg, node, hhead, hlist) { 579 hash_for_each_possible(css_set_table, cg, hlist, key) {
577 if (!compare_css_sets(cg, oldcg, cgrp, template)) 580 if (!compare_css_sets(cg, oldcg, cgrp, template))
578 continue; 581 continue;
579 582
@@ -657,8 +660,8 @@ static struct css_set *find_css_set(
657 660
658 struct list_head tmp_cg_links; 661 struct list_head tmp_cg_links;
659 662
660 struct hlist_head *hhead;
661 struct cg_cgroup_link *link; 663 struct cg_cgroup_link *link;
664 unsigned long key;
662 665
663 /* First see if we already have a cgroup group that matches 666 /* First see if we already have a cgroup group that matches
664 * the desired set */ 667 * the desired set */
@@ -704,8 +707,8 @@ static struct css_set *find_css_set(
704 css_set_count++; 707 css_set_count++;
705 708
706 /* Add this cgroup group to the hash table */ 709 /* Add this cgroup group to the hash table */
707 hhead = css_set_hash(res->subsys); 710 key = css_set_hash(res->subsys);
708 hlist_add_head(&res->hlist, hhead); 711 hash_add(css_set_table, &res->hlist, key);
709 712
710 write_unlock(&css_set_lock); 713 write_unlock(&css_set_lock);
711 714
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
856 return inode; 859 return inode;
857} 860}
858 861
859static void cgroup_diput(struct dentry *dentry, struct inode *inode) 862static void cgroup_free_fn(struct work_struct *work)
860{ 863{
861 /* is dentry a directory ? if so, kfree() associated cgroup */ 864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
862 if (S_ISDIR(inode->i_mode)) { 865 struct cgroup_subsys *ss;
863 struct cgroup *cgrp = dentry->d_fsdata;
864 struct cgroup_subsys *ss;
865 BUG_ON(!(cgroup_is_removed(cgrp)));
866 /* It's possible for external users to be holding css
867 * reference counts on a cgroup; css_put() needs to
868 * be able to access the cgroup after decrementing
869 * the reference count in order to know if it needs to
870 * queue the cgroup to be handled by the release
871 * agent */
872 synchronize_rcu();
873 866
874 mutex_lock(&cgroup_mutex); 867 mutex_lock(&cgroup_mutex);
875 /* 868 /*
876 * Release the subsystem state objects. 869 * Release the subsystem state objects.
877 */ 870 */
878 for_each_subsys(cgrp->root, ss) 871 for_each_subsys(cgrp->root, ss)
879 ss->css_free(cgrp); 872 ss->css_free(cgrp);
880 873
881 cgrp->root->number_of_cgroups--; 874 cgrp->root->number_of_cgroups--;
882 mutex_unlock(&cgroup_mutex); 875 mutex_unlock(&cgroup_mutex);
883 876
884 /* 877 /*
885 * Drop the active superblock reference that we took when we 878 * Drop the active superblock reference that we took when we
886 * created the cgroup 879 * created the cgroup
887 */ 880 */
888 deactivate_super(cgrp->root->sb); 881 deactivate_super(cgrp->root->sb);
889 882
890 /* 883 /*
891 * if we're getting rid of the cgroup, refcount should ensure 884 * if we're getting rid of the cgroup, refcount should ensure
892 * that there are no pidlists left. 885 * that there are no pidlists left.
893 */ 886 */
894 BUG_ON(!list_empty(&cgrp->pidlists)); 887 BUG_ON(!list_empty(&cgrp->pidlists));
888
889 simple_xattrs_free(&cgrp->xattrs);
890
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
892 kfree(cgrp);
893}
895 894
896 simple_xattrs_free(&cgrp->xattrs); 895static void cgroup_free_rcu(struct rcu_head *head)
896{
897 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
898
899 schedule_work(&cgrp->free_work);
900}
901
902static void cgroup_diput(struct dentry *dentry, struct inode *inode)
903{
904 /* is dentry a directory ? if so, kfree() associated cgroup */
905 if (S_ISDIR(inode->i_mode)) {
906 struct cgroup *cgrp = dentry->d_fsdata;
897 907
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 908 BUG_ON(!(cgroup_is_removed(cgrp)));
899 kfree_rcu(cgrp, rcu_head); 909 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
900 } else { 910 } else {
901 struct cfent *cfe = __d_cfe(dentry); 911 struct cfent *cfe = __d_cfe(dentry);
902 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 912 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d)
925 dput(parent); 935 dput(parent);
926} 936}
927 937
928static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 938static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
929{ 939{
930 struct cfent *cfe; 940 struct cfent *cfe;
931 941
932 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 942 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
933 lockdep_assert_held(&cgroup_mutex); 943 lockdep_assert_held(&cgroup_mutex);
934 944
945 /*
946 * If we're doing cleanup due to failure of cgroup_create(),
947 * the corresponding @cfe may not exist.
948 */
935 list_for_each_entry(cfe, &cgrp->files, node) { 949 list_for_each_entry(cfe, &cgrp->files, node) {
936 struct dentry *d = cfe->dentry; 950 struct dentry *d = cfe->dentry;
937 951
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
944 list_del_init(&cfe->node); 958 list_del_init(&cfe->node);
945 dput(d); 959 dput(d);
946 960
947 return 0; 961 break;
948 } 962 }
949 return -ENOENT;
950} 963}
951 964
952/** 965/**
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1083 } 1096 }
1084 } 1097 }
1085 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1098 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1086 synchronize_rcu();
1087 1099
1088 return 0; 1100 return 0;
1089} 1101}
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1393 INIT_LIST_HEAD(&cgrp->allcg_node); 1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1394 INIT_LIST_HEAD(&cgrp->release_list); 1406 INIT_LIST_HEAD(&cgrp->release_list);
1395 INIT_LIST_HEAD(&cgrp->pidlists); 1407 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1396 mutex_init(&cgrp->pidlist_mutex); 1409 mutex_init(&cgrp->pidlist_mutex);
1397 INIT_LIST_HEAD(&cgrp->event_list); 1410 INIT_LIST_HEAD(&cgrp->event_list);
1398 spin_lock_init(&cgrp->event_list_lock); 1411 spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 struct cgroupfs_root *existing_root; 1610 struct cgroupfs_root *existing_root;
1598 const struct cred *cred; 1611 const struct cred *cred;
1599 int i; 1612 int i;
1613 struct css_set *cg;
1600 1614
1601 BUG_ON(sb->s_root != NULL); 1615 BUG_ON(sb->s_root != NULL);
1602 1616
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1650 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1651 * the css_set objects */ 1665 * the css_set objects */
1652 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
1653 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1667 hash_for_each(css_set_table, i, cg, hlist)
1654 struct hlist_head *hhead = &css_set_table[i]; 1668 link_css_set(&tmp_cg_links, cg, root_cgrp);
1655 struct hlist_node *node;
1656 struct css_set *cg;
1657
1658 hlist_for_each_entry(cg, node, hhead, hlist)
1659 link_css_set(&tmp_cg_links, cg, root_cgrp);
1660 }
1661 write_unlock(&css_set_lock); 1669 write_unlock(&css_set_lock);
1662 1670
1663 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
@@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1773 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1774 "cgroup_path() called without proper locking"); 1782 "cgroup_path() called without proper locking");
1775 1783
1776 if (!dentry || cgrp == dummytop) { 1784 if (cgrp == dummytop) {
1777 /* 1785 /*
1778 * Inactive subsystems have no dentry for their root 1786 * Inactive subsystems have no dentry for their root
1779 * cgroup 1787 * cgroup
@@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1982 ss->attach(cgrp, &tset); 1990 ss->attach(cgrp, &tset);
1983 } 1991 }
1984 1992
1985 synchronize_rcu();
1986out: 1993out:
1987 if (retval) { 1994 if (retval) {
1988 for_each_subsys(root, ss) { 1995 for_each_subsys(root, ss) {
@@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2151 /* 2158 /*
2152 * step 5: success! and cleanup 2159 * step 5: success! and cleanup
2153 */ 2160 */
2154 synchronize_rcu();
2155 retval = 0; 2161 retval = 0;
2156out_put_css_set_refs: 2162out_put_css_set_refs:
2157 if (retval) { 2163 if (retval) {
@@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
2637 */ 2643 */
2638static inline struct cftype *__file_cft(struct file *file) 2644static inline struct cftype *__file_cft(struct file *file)
2639{ 2645{
2640 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) 2646 if (file_inode(file)->i_fop != &cgroup_file_operations)
2641 return ERR_PTR(-EINVAL); 2647 return ERR_PTR(-EINVAL);
2642 return __d_cft(file->f_dentry); 2648 return __d_cft(file->f_dentry);
2643} 2649}
@@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue; 2776 continue;
2771 2777
2772 if (is_add) 2778 if (is_add) {
2773 err = cgroup_add_file(cgrp, subsys, cft); 2779 err = cgroup_add_file(cgrp, subsys, cft);
2774 else 2780 if (err)
2775 err = cgroup_rm_file(cgrp, cft); 2781 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2776 if (err) { 2782 cft->name, err);
2777 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2778 is_add ? "add" : "remove", cft->name, err);
2779 ret = err; 2783 ret = err;
2784 } else {
2785 cgroup_rm_file(cgrp, cft);
2780 } 2786 }
2781 } 2787 }
2782 return ret; 2788 return ret;
@@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
3017} 3023}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3024EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019 3025
3026/**
3027 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
3028 * @pos: cgroup of interest
3029 *
3030 * Return the rightmost descendant of @pos. If there's no descendant,
3031 * @pos is returned. This can be used during pre-order traversal to skip
3032 * subtree of @pos.
3033 */
3034struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3035{
3036 struct cgroup *last, *tmp;
3037
3038 WARN_ON_ONCE(!rcu_read_lock_held());
3039
3040 do {
3041 last = pos;
3042 /* ->prev isn't RCU safe, walk ->next till the end */
3043 pos = NULL;
3044 list_for_each_entry_rcu(tmp, &last->children, sibling)
3045 pos = tmp;
3046 } while (pos);
3047
3048 return last;
3049}
3050EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
3051
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3052static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{ 3053{
3022 struct cgroup *last; 3054 struct cgroup *last;
@@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
3752 remove); 3784 remove);
3753 struct cgroup *cgrp = event->cgrp; 3785 struct cgroup *cgrp = event->cgrp;
3754 3786
3787 remove_wait_queue(event->wqh, &event->wait);
3788
3755 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3789 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3756 3790
3791 /* Notify userspace the event is going away. */
3792 eventfd_signal(event->eventfd, 1);
3793
3757 eventfd_ctx_put(event->eventfd); 3794 eventfd_ctx_put(event->eventfd);
3758 kfree(event); 3795 kfree(event);
3759 dput(cgrp->dentry); 3796 dput(cgrp->dentry);
@@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3773 unsigned long flags = (unsigned long)key; 3810 unsigned long flags = (unsigned long)key;
3774 3811
3775 if (flags & POLLHUP) { 3812 if (flags & POLLHUP) {
3776 __remove_wait_queue(event->wqh, &event->wait);
3777 spin_lock(&cgrp->event_list_lock);
3778 list_del_init(&event->list);
3779 spin_unlock(&cgrp->event_list_lock);
3780 /* 3813 /*
3781 * We are in atomic context, but cgroup_event_remove() may 3814 * If the event has been detached at cgroup removal, we
3782 * sleep, so we have to call it in workqueue. 3815 * can simply return knowing the other side will cleanup
3816 * for us.
3817 *
3818 * We can't race against event freeing since the other
3819 * side will require wqh->lock via remove_wait_queue(),
3820 * which we hold.
3783 */ 3821 */
3784 schedule_work(&event->remove); 3822 spin_lock(&cgrp->event_list_lock);
3823 if (!list_empty(&event->list)) {
3824 list_del_init(&event->list);
3825 /*
3826 * We are in atomic context, but cgroup_event_remove()
3827 * may sleep, so we have to call it in workqueue.
3828 */
3829 schedule_work(&event->remove);
3830 }
3831 spin_unlock(&cgrp->event_list_lock);
3785 } 3832 }
3786 3833
3787 return 0; 3834 return 0;
@@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3807 const char *buffer) 3854 const char *buffer)
3808{ 3855{
3809 struct cgroup_event *event = NULL; 3856 struct cgroup_event *event = NULL;
3857 struct cgroup *cgrp_cfile;
3810 unsigned int efd, cfd; 3858 unsigned int efd, cfd;
3811 struct file *efile = NULL; 3859 struct file *efile = NULL;
3812 struct file *cfile = NULL; 3860 struct file *cfile = NULL;
@@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3852 3900
3853 /* the process need read permission on control file */ 3901 /* the process need read permission on control file */
3854 /* AV: shouldn't we check that it's been opened for read instead? */ 3902 /* AV: shouldn't we check that it's been opened for read instead? */
3855 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); 3903 ret = inode_permission(file_inode(cfile), MAY_READ);
3856 if (ret < 0) 3904 if (ret < 0)
3857 goto fail; 3905 goto fail;
3858 3906
@@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3862 goto fail; 3910 goto fail;
3863 } 3911 }
3864 3912
3913 /*
3914 * The file to be monitored must be in the same cgroup as
3915 * cgroup.event_control is.
3916 */
3917 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
3918 if (cgrp_cfile != cgrp) {
3919 ret = -EINVAL;
3920 goto fail;
3921 }
3922
3865 if (!event->cft->register_event || !event->cft->unregister_event) { 3923 if (!event->cft->register_event || !event->cft->unregister_event) {
3866 ret = -EINVAL; 3924 ret = -EINVAL;
3867 goto fail; 3925 goto fail;
@@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4135 4193
4136 init_cgroup_housekeeping(cgrp); 4194 init_cgroup_housekeeping(cgrp);
4137 4195
4196 dentry->d_fsdata = cgrp;
4197 cgrp->dentry = dentry;
4198
4138 cgrp->parent = parent; 4199 cgrp->parent = parent;
4139 cgrp->root = parent->root; 4200 cgrp->root = parent->root;
4140 cgrp->top_cgroup = parent->top_cgroup; 4201 cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4172 lockdep_assert_held(&dentry->d_inode->i_mutex); 4233 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173 4234
4174 /* allocation complete, commit to creation */ 4235 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4236 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++; 4238 root->number_of_cgroups++;
@@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 /* 4399 /*
4341 * Unregister events and notify userspace. 4400 * Unregister events and notify userspace.
4342 * Notify userspace about cgroup removing only after rmdir of cgroup 4401 * Notify userspace about cgroup removing only after rmdir of cgroup
4343 * directory to avoid race between userspace and kernelspace. Use 4402 * directory to avoid race between userspace and kernelspace.
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4347 */ 4403 */
4348 spin_lock(&cgrp->event_list_lock); 4404 spin_lock(&cgrp->event_list_lock);
4349 list_splice_init(&cgrp->event_list, &tmp_list); 4405 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4350 spin_unlock(&cgrp->event_list_lock);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list); 4406 list_del_init(&event->list);
4353 remove_wait_queue(event->wqh, &event->wait);
4354 eventfd_signal(event->eventfd, 1);
4355 schedule_work(&event->remove); 4407 schedule_work(&event->remove);
4356 } 4408 }
4409 spin_unlock(&cgrp->event_list_lock);
4357 4410
4358 return 0; 4411 return 0;
4359} 4412}
@@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4438{ 4491{
4439 struct cgroup_subsys_state *css; 4492 struct cgroup_subsys_state *css;
4440 int i, ret; 4493 int i, ret;
4494 struct hlist_node *tmp;
4495 struct css_set *cg;
4496 unsigned long key;
4441 4497
4442 /* check name and function validity */ 4498 /* check name and function validity */
4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4499 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4503 * this is all done under the css_set_lock. 4559 * this is all done under the css_set_lock.
4504 */ 4560 */
4505 write_lock(&css_set_lock); 4561 write_lock(&css_set_lock);
4506 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4562 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
4507 struct css_set *cg; 4563 /* skip entries that we already rehashed */
4508 struct hlist_node *node, *tmp; 4564 if (cg->subsys[ss->subsys_id])
4509 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4565 continue;
4510 4566 /* remove existing entry */
4511 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4567 hash_del(&cg->hlist);
4512 /* skip entries that we already rehashed */ 4568 /* set new value */
4513 if (cg->subsys[ss->subsys_id]) 4569 cg->subsys[ss->subsys_id] = css;
4514 continue; 4570 /* recompute hash and restore entry */
4515 /* remove existing entry */ 4571 key = css_set_hash(cg->subsys);
4516 hlist_del(&cg->hlist); 4572 hash_add(css_set_table, &cg->hlist, key);
4517 /* set new value */
4518 cg->subsys[ss->subsys_id] = css;
4519 /* recompute hash and restore entry */
4520 new_bucket = css_set_hash(cg->subsys);
4521 hlist_add_head(&cg->hlist, new_bucket);
4522 }
4523 } 4573 }
4524 write_unlock(&css_set_lock); 4574 write_unlock(&css_set_lock);
4525 4575
@@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4551void cgroup_unload_subsys(struct cgroup_subsys *ss) 4601void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552{ 4602{
4553 struct cg_cgroup_link *link; 4603 struct cg_cgroup_link *link;
4554 struct hlist_head *hhead;
4555 4604
4556 BUG_ON(ss->module == NULL); 4605 BUG_ON(ss->module == NULL);
4557 4606
@@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 offline_css(ss, dummytop); 4616 offline_css(ss, dummytop);
4568 ss->active = 0; 4617 ss->active = 0;
4569 4618
4570 if (ss->use_id) { 4619 if (ss->use_id)
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr); 4620 idr_destroy(&ss->idr);
4573 }
4574 4621
4575 /* deassign the subsys_id */ 4622 /* deassign the subsys_id */
4576 subsys[ss->subsys_id] = NULL; 4623 subsys[ss->subsys_id] = NULL;
@@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4585 write_lock(&css_set_lock); 4632 write_lock(&css_set_lock);
4586 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4633 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4587 struct css_set *cg = link->cg; 4634 struct css_set *cg = link->cg;
4635 unsigned long key;
4588 4636
4589 hlist_del(&cg->hlist); 4637 hash_del(&cg->hlist);
4590 cg->subsys[ss->subsys_id] = NULL; 4638 cg->subsys[ss->subsys_id] = NULL;
4591 hhead = css_set_hash(cg->subsys); 4639 key = css_set_hash(cg->subsys);
4592 hlist_add_head(&cg->hlist, hhead); 4640 hash_add(css_set_table, &cg->hlist, key);
4593 } 4641 }
4594 write_unlock(&css_set_lock); 4642 write_unlock(&css_set_lock);
4595 4643
@@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void)
4631 list_add(&init_css_set_link.cg_link_list, 4679 list_add(&init_css_set_link.cg_link_list,
4632 &init_css_set.cg_links); 4680 &init_css_set.cg_links);
4633 4681
4634 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4635 INIT_HLIST_HEAD(&css_set_table[i]);
4636
4637 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4682 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4638 struct cgroup_subsys *ss = subsys[i]; 4683 struct cgroup_subsys *ss = subsys[i];
4639 4684
@@ -4667,7 +4712,7 @@ int __init cgroup_init(void)
4667{ 4712{
4668 int err; 4713 int err;
4669 int i; 4714 int i;
4670 struct hlist_head *hhead; 4715 unsigned long key;
4671 4716
4672 err = bdi_init(&cgroup_backing_dev_info); 4717 err = bdi_init(&cgroup_backing_dev_info);
4673 if (err) 4718 if (err)
@@ -4686,8 +4731,8 @@ int __init cgroup_init(void)
4686 } 4731 }
4687 4732
4688 /* Add init_css_set to the hash table */ 4733 /* Add init_css_set to the hash table */
4689 hhead = css_set_hash(init_css_set.subsys); 4734 key = css_set_hash(init_css_set.subsys);
4690 hlist_add_head(&init_css_set.hlist, hhead); 4735 hash_add(css_set_table, &init_css_set.hlist, key);
4691 BUG_ON(!init_root_id(&rootnode)); 4736 BUG_ON(!init_root_id(&rootnode));
4692 4737
4693 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4738 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4982 } 5027 }
4983 task_unlock(tsk); 5028 task_unlock(tsk);
4984 5029
4985 if (cg) 5030 put_css_set_taskexit(cg);
4986 put_css_set_taskexit(cg);
4987} 5031}
4988 5032
4989/** 5033/**
@@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
5274static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5318static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5275{ 5319{
5276 struct css_id *newid; 5320 struct css_id *newid;
5277 int myid, error, size; 5321 int ret, size;
5278 5322
5279 BUG_ON(!ss->use_id); 5323 BUG_ON(!ss->use_id);
5280 5324
@@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5282 newid = kzalloc(size, GFP_KERNEL); 5326 newid = kzalloc(size, GFP_KERNEL);
5283 if (!newid) 5327 if (!newid)
5284 return ERR_PTR(-ENOMEM); 5328 return ERR_PTR(-ENOMEM);
5285 /* get id */ 5329
5286 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { 5330 idr_preload(GFP_KERNEL);
5287 error = -ENOMEM;
5288 goto err_out;
5289 }
5290 spin_lock(&ss->id_lock); 5331 spin_lock(&ss->id_lock);
5291 /* Don't use 0. allocates an ID of 1-65535 */ 5332 /* Don't use 0. allocates an ID of 1-65535 */
5292 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 5333 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5293 spin_unlock(&ss->id_lock); 5334 spin_unlock(&ss->id_lock);
5335 idr_preload_end();
5294 5336
5295 /* Returns error when there are no free spaces for new ID.*/ 5337 /* Returns error when there are no free spaces for new ID.*/
5296 if (error) { 5338 if (ret < 0)
5297 error = -ENOSPC;
5298 goto err_out; 5339 goto err_out;
5299 }
5300 if (myid > CSS_ID_MAX)
5301 goto remove_idr;
5302 5340
5303 newid->id = myid; 5341 newid->id = ret;
5304 newid->depth = depth; 5342 newid->depth = depth;
5305 return newid; 5343 return newid;
5306remove_idr:
5307 error = -ENOSPC;
5308 spin_lock(&ss->id_lock);
5309 idr_remove(&ss->idr, myid);
5310 spin_unlock(&ss->id_lock);
5311err_out: 5344err_out:
5312 kfree(newid); 5345 kfree(newid);
5313 return ERR_PTR(error); 5346 return ERR_PTR(ret);
5314 5347
5315} 5348}
5316 5349
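
get_new_cssid() above is converted to the idr_preload()/idr_alloc() pattern: preallocate with GFP_KERNEL outside the spinlock, then allocate atomically with GFP_NOWAIT under it, and read the new ID straight from the return value. Stripped down to its skeleton (hypothetical idr and lock passed in by the caller):

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

/* Allocate an ID in [1, 65535] for @ptr; the upper bound is exclusive. */
static int example_alloc_id(struct idr *idr, spinlock_t *lock, void *ptr)
{
	int id;

	idr_preload(GFP_KERNEL);	/* may sleep: done before taking the lock */
	spin_lock(lock);
	id = idr_alloc(idr, ptr, 1, 65536, GFP_NOWAIT);
	spin_unlock(lock);
	idr_preload_end();

	return id;			/* ID on success, -ENOMEM/-ENOSPC on failure */
}
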
@@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5441 struct inode *inode; 5474 struct inode *inode;
5442 struct cgroup_subsys_state *css; 5475 struct cgroup_subsys_state *css;
5443 5476
5444 inode = f->f_dentry->d_inode; 5477 inode = file_inode(f);
5445 /* check in cgroup filesystem dir */ 5478 /* check in cgroup filesystem dir */
5446 if (inode->i_op != &cgroup_dir_inode_operations) 5479 if (inode->i_op != &cgroup_dir_inode_operations)
5447 return ERR_PTR(-EBADF); 5480 return ERR_PTR(-EBADF);
diff --git a/kernel/compat.c b/kernel/compat.c
index f6150e92dfc9..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); 290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
291} 291}
292 292
293asmlinkage long compat_sys_getitimer(int which, 293COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
294 struct compat_itimerval __user *it) 294 struct compat_itimerval __user *, it)
295{ 295{
296 struct itimerval kit; 296 struct itimerval kit;
297 int error; 297 int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
302 return error; 302 return error;
303} 303}
304 304
305asmlinkage long compat_sys_setitimer(int which, 305COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
306 struct compat_itimerval __user *in, 306 struct compat_itimerval __user *, in,
307 struct compat_itimerval __user *out) 307 struct compat_itimerval __user *, out)
308{ 308{
309 struct itimerval kin, kout; 309 struct itimerval kin, kout;
310 int error; 310 int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
381 memcpy(blocked->sig, &set, sizeof(set)); 381 memcpy(blocked->sig, &set, sizeof(set));
382} 382}
383 383
384asmlinkage long compat_sys_sigprocmask(int how, 384COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
385 compat_old_sigset_t __user *nset, 385 compat_old_sigset_t __user *, nset,
386 compat_old_sigset_t __user *oset) 386 compat_old_sigset_t __user *, oset)
387{ 387{
388 old_sigset_t old_set, new_set; 388 old_sigset_t old_set, new_set;
389 sigset_t new_blocked; 389 sigset_t new_blocked;
@@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
535 return 0; 535 return 0;
536} 536}
537 537
538asmlinkage long 538COMPAT_SYSCALL_DEFINE4(wait4,
539compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, 539 compat_pid_t, pid,
540 struct compat_rusage __user *ru) 540 compat_uint_t __user *, stat_addr,
541 int, options,
542 struct compat_rusage __user *, ru)
541{ 543{
542 if (!ru) { 544 if (!ru) {
543 return sys_wait4(pid, stat_addr, options, NULL); 545 return sys_wait4(pid, stat_addr, options, NULL);
@@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
564 } 566 }
565} 567}
566 568
567asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, 569COMPAT_SYSCALL_DEFINE5(waitid,
568 struct compat_siginfo __user *uinfo, int options, 570 int, which, compat_pid_t, pid,
569 struct compat_rusage __user *uru) 571 struct compat_siginfo __user *, uinfo, int, options,
572 struct compat_rusage __user *, uru)
570{ 573{
571 siginfo_t info; 574 siginfo_t info;
572 struct rusage ru; 575 struct rusage ru;
@@ -584,9 +587,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
584 return ret; 587 return ret;
585 588
586 if (uru) { 589 if (uru) {
587 ret = put_compat_rusage(&ru, uru); 590 /* sys_waitid() overwrites everything in ru */
591 if (COMPAT_USE_64BIT_TIME)
592 ret = copy_to_user(uru, &ru, sizeof(ru));
593 else
594 ret = put_compat_rusage(&ru, uru);
588 if (ret) 595 if (ret)
589 return ret; 596 return -EFAULT;
590 } 597 }
591 598
592 BUG_ON(info.si_code & __SI_MASK); 599 BUG_ON(info.si_code & __SI_MASK);
@@ -964,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
964} 971}
965 972
966void 973void
967sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 974sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
968{ 975{
969 switch (_NSIG_WORDS) { 976 switch (_NSIG_WORDS) {
970 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 977 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -975,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
975} 982}
976EXPORT_SYMBOL_GPL(sigset_from_compat); 983EXPORT_SYMBOL_GPL(sigset_from_compat);
977 984
978asmlinkage long 985void
979compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 986sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
980 struct compat_siginfo __user *uinfo, 987{
981 struct compat_timespec __user *uts, compat_size_t sigsetsize) 988 switch (_NSIG_WORDS) {
989 case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
990 case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
991 case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
992 case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
993 }
994}
995
996COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
997 struct compat_siginfo __user *, uinfo,
998 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
982{ 999{
983 compat_sigset_t s32; 1000 compat_sigset_t s32;
984 sigset_t s; 1001 sigset_t s;
@@ -994,7 +1011,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
994 sigset_from_compat(&s, &s32); 1011 sigset_from_compat(&s, &s32);
995 1012
996 if (uts) { 1013 if (uts) {
997 if (get_compat_timespec(&t, uts)) 1014 if (compat_get_timespec(&t, uts))
998 return -EFAULT; 1015 return -EFAULT;
999 } 1016 }
1000 1017
@@ -1006,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
1006 } 1023 }
1007 1024
1008 return ret; 1025 return ret;
1009
1010}
1011
1012asmlinkage long
1013compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
1014 struct compat_siginfo __user *uinfo)
1015{
1016 siginfo_t info;
1017
1018 if (copy_siginfo_from_user32(&info, uinfo))
1019 return -EFAULT;
1020 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
1021} 1026}
1022 1027
1023#ifdef __ARCH_WANT_COMPAT_SYS_TIME 1028#ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1060,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
1060 1065
1061#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ 1066#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
1062 1067
1063#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
1064asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
1065{
1066 sigset_t newset;
1067 compat_sigset_t newset32;
1068
1069 /* XXX: Don't preclude handling different sized sigset_t's. */
1070 if (sigsetsize != sizeof(sigset_t))
1071 return -EINVAL;
1072
1073 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1074 return -EFAULT;
1075 sigset_from_compat(&newset, &newset32);
1076 return sigsuspend(&newset);
1077}
1078#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1079
1080asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1068asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
1081{ 1069{
1082 struct timex txc; 1070 struct timex txc;
@@ -1215,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1203 return 0;
1216} 1204}
1217 1205
1218#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL 1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1219asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, 1207 compat_pid_t, pid,
1220 struct compat_timespec __user *interval) 1208 struct compat_timespec __user *, interval)
1221{ 1209{
1222 struct timespec t; 1210 struct timespec t;
1223 int ret; 1211 int ret;
@@ -1230,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1230 return -EFAULT; 1218 return -EFAULT;
1231 return ret; 1219 return ret;
1232} 1220}
1233#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1234 1221
1235/* 1222/*
1236 * Allocate user-space memory for the duration of a single system call, 1223 * Allocate user-space memory for the duration of a single system call,
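
The open-coded asmlinkage prototypes in this file are switched to the COMPAT_SYSCALL_DEFINEn() macros, which mirror SYSCALL_DEFINEn(): the argument list alternates type and name, and the macro generates the prototype plus any per-architecture argument conversion. A hypothetical two-argument example, not a real syscall (the ones actually converted here are getitimer, setitimer, sigprocmask, wait4, waitid, rt_sigtimedwait and sched_rr_get_interval):

#include <linux/compat.h>
#include <linux/time.h>
#include <linux/errno.h>

COMPAT_SYSCALL_DEFINE2(example, int, which,
		       struct compat_timespec __user *, uts)
{
	struct timespec t;

	/* convert the 32-bit userspace layout to the native struct timespec */
	if (compat_get_timespec(&t, uts))
		return -EFAULT;
	/* ... operate on @which and @t ... */
	return 0;
}
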
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e0e07fd55508..65349f07b878 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,29 +1,41 @@
1/*
2 * Context tracking: Probe on high level context boundaries such as kernel
3 * and userspace. This includes syscalls and exceptions entry/exit.
4 *
5 * This is used by RCU to remove its dependency on the timer tick while a CPU
6 * runs in userspace.
7 *
8 * Started by Frederic Weisbecker:
9 *
10 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
11 *
12 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
13 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
14 *
15 */
16
1#include <linux/context_tracking.h> 17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
2#include <linux/rcupdate.h> 19#include <linux/rcupdate.h>
3#include <linux/sched.h> 20#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/export.h>
6 23
7struct context_tracking { 24DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE 25#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true, 26 .active = true,
24#endif 27#endif
25}; 28};
26 29
30/**
31 * user_enter - Inform the context tracking that the CPU is going to
32 * enter userspace mode.
33 *
34 * This function must be called right before we switch from the kernel
35 * to userspace, when it's guaranteed the remaining kernel instructions
36 * to execute won't use any RCU read side critical section because this
37 * function sets RCU in extended quiescent state.
38 */
27void user_enter(void) 39void user_enter(void)
28{ 40{
29 unsigned long flags; 41 unsigned long flags;
@@ -39,40 +51,90 @@ void user_enter(void)
39 if (in_interrupt()) 51 if (in_interrupt())
40 return; 52 return;
41 53
54 /* Kernel threads aren't supposed to go to userspace */
42 WARN_ON_ONCE(!current->mm); 55 WARN_ON_ONCE(!current->mm);
43 56
44 local_irq_save(flags); 57 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) && 58 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) { 59 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER); 60 /*
61 * At this stage, only low level arch entry code remains and
62 * then we'll run in userspace. We can assume there won't be
63 * any RCU read-side critical section until the next call to
64 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
65 * on the tick.
66 */
67 vtime_user_enter(current);
48 rcu_user_enter(); 68 rcu_user_enter();
69 __this_cpu_write(context_tracking.state, IN_USER);
49 } 70 }
50 local_irq_restore(flags); 71 local_irq_restore(flags);
51} 72}
52 73
74
75/**
76 * user_exit - Inform the context tracking that the CPU is
77 * exiting userspace mode and entering the kernel.
78 *
79 * This function must be called after we entered the kernel from userspace
80 * before any use of RCU read side critical section. This potentially include
81 * any high level kernel code like syscalls, exceptions, signal handling, etc...
82 *
83 * This call supports re-entrancy. This way it can be called from any exception
84 * handler without needing to know if we came from userspace or not.
85 */
53void user_exit(void) 86void user_exit(void)
54{ 87{
55 unsigned long flags; 88 unsigned long flags;
56 89
57 /*
58 * Some contexts may involve an exception occuring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt()) 90 if (in_interrupt())
66 return; 91 return;
67 92
68 local_irq_save(flags); 93 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) { 94 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL); 95 /*
96 * We are going to run code that may use RCU. Inform
97 * RCU core about that (ie: we may need the tick again).
98 */
71 rcu_user_exit(); 99 rcu_user_exit();
100 vtime_user_exit(current);
101 __this_cpu_write(context_tracking.state, IN_KERNEL);
72 } 102 }
73 local_irq_restore(flags); 103 local_irq_restore(flags);
74} 104}
75 105
106void guest_enter(void)
107{
108 if (vtime_accounting_enabled())
109 vtime_guest_enter(current);
110 else
111 __guest_enter();
112}
113EXPORT_SYMBOL_GPL(guest_enter);
114
115void guest_exit(void)
116{
117 if (vtime_accounting_enabled())
118 vtime_guest_exit(current);
119 else
120 __guest_exit();
121}
122EXPORT_SYMBOL_GPL(guest_exit);
123
124
125/**
126 * context_tracking_task_switch - context switch the syscall callbacks
127 * @prev: the task that is being switched out
128 * @next: the task that is being switched in
129 *
130 * The context tracking uses the syscall slow path to implement its user-kernel
131 * boundaries probes on syscalls. This way it doesn't impact the syscall fast
132 * path on CPUs that don't do context tracking.
133 *
134 * But we need to clear the flag on the previous task because it may later
135 * migrate to some CPU that doesn't do the context tracking. As such the TIF
136 * flag may not be desired there.
137 */
76void context_tracking_task_switch(struct task_struct *prev, 138void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next) 139 struct task_struct *next)
78{ 140{
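
As the new comments explain, user_exit() and user_enter() are meant to bracket the stretch of kernel work on the syscall/exception slow path: user_exit() right after coming in from userspace (RCU is needed again), user_enter() right before returning (RCU can enter its extended quiescent state). A hypothetical pair of slow-path hooks shaped like the arch code this file plugs into, with a placeholder for the real work:

#include <linux/context_tracking.h>
#include <linux/ptrace.h>

/* Placeholder for whatever the architecture actually does on the slow path. */
extern void example_trace_work(struct pt_regs *regs);

/* Called on syscall entry from userspace (slow path only). */
void example_syscall_trace_enter(struct pt_regs *regs)
{
	user_exit();			/* back in the kernel: RCU may be used again */
	example_trace_work(regs);
}

/* Called just before returning to userspace (slow path only). */
void example_syscall_trace_leave(struct pt_regs *regs)
{
	example_trace_work(regs);
	user_enter();			/* userspace ahead: stop depending on the tick */
}
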
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242c..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
224static inline void check_for_tasks(int cpu) 224static inline void check_for_tasks(int cpu)
225{ 225{
226 struct task_struct *p; 226 struct task_struct *p;
227 cputime_t utime, stime;
227 228
228 write_lock_irq(&tasklist_lock); 229 write_lock_irq(&tasklist_lock);
229 for_each_process(p) { 230 for_each_process(p) {
231 task_cputime(p, &utime, &stime);
230 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 232 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
231 (p->utime || p->stime)) 233 (utime || stime))
232 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 234 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
233 "(state = %ld, flags = %x)\n", 235 "(state = %ld, flags = %x)\n",
234 p->comm, task_pid_nr(p), cpu, 236 p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
254 return err; 256 return err;
255 257
256 cpu_notify(CPU_DYING | param->mod, param->hcpu); 258 cpu_notify(CPU_DYING | param->mod, param->hcpu);
259 /* Park the stopper thread */
260 kthread_park(current);
257 return 0; 261 return 0;
258} 262}
259 263
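
check_for_tasks() now reads cputime through the task_cputime() accessor instead of touching p->utime/p->stime directly, so it keeps working when full dynticks accounting stores those values differently. The accessor pattern in isolation (hypothetical helper name):

#include <linux/sched.h>

/* Returns true if @p has accumulated any user or system time. */
static bool example_task_ran(struct task_struct *p)
{
	cputime_t utime, stime;

	task_cputime(p, &utime, &stime);
	return utime || stime;
}
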
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
72 * Tracks how many cpusets are currently defined in system. 64 * Tracks how many cpusets are currently defined in system.
73 * When there is only one cpuset (the root cpuset) we can 65 * When there is only one cpuset (the root cpuset) we can
74 * short circuit some hooks. 66 * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
95 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
96 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
97 89
98 struct cpuset *parent; /* my parent */
99
100 struct fmeter fmeter; /* memory_pressure filter */ 90 struct fmeter fmeter; /* memory_pressure filter */
101 91
92 /*
93 * Tasks are being attached to this cpuset. Used to prevent
94 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
95 */
96 int attach_in_progress;
97
102 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
103 int pn; 99 int pn;
104 100
105 /* for custom sched domain */ 101 /* for custom sched domain */
106 int relax_domain_level; 102 int relax_domain_level;
107 103
108 /* used for walking a cpuset hierarchy */ 104 struct work_struct hotplug_work;
109 struct list_head stack_list;
110}; 105};
111 106
112/* Retrieve the cpuset for a cgroup */ 107/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 118 struct cpuset, css);
124} 119}
125 120
121static inline struct cpuset *parent_cs(const struct cpuset *cs)
122{
123 struct cgroup *pcgrp = cs->css.cgroup->parent;
124
125 if (pcgrp)
126 return cgroup_cs(pcgrp);
127 return NULL;
128}
129
126#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task) 131static inline bool task_has_mempolicy(struct task_struct *task)
128{ 132{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
138 142
139/* bits in struct cpuset flags field */ 143/* bits in struct cpuset flags field */
140typedef enum { 144typedef enum {
145 CS_ONLINE,
141 CS_CPU_EXCLUSIVE, 146 CS_CPU_EXCLUSIVE,
142 CS_MEM_EXCLUSIVE, 147 CS_MEM_EXCLUSIVE,
143 CS_MEM_HARDWALL, 148 CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 152 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 153} cpuset_flagbits_t;
149 154
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 155/* convenient tests for these bits */
156static inline bool is_cpuset_online(const struct cpuset *cs)
157{
158 return test_bit(CS_ONLINE, &cs->flags);
159}
160
157static inline int is_cpu_exclusive(const struct cpuset *cs) 161static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 162{
159 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 163 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
190} 194}
191 195
192static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
193 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
198 (1 << CS_MEM_EXCLUSIVE)),
194}; 199};
195 200
201/**
202 * cpuset_for_each_child - traverse online children of a cpuset
203 * @child_cs: loop cursor pointing to the current child
204 * @pos_cgrp: used for iteration
205 * @parent_cs: target cpuset to walk children of
206 *
207 * Walk @child_cs through the online children of @parent_cs. Must be used
208 * with RCU read locked.
209 */
210#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
211 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
212 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
213
214/**
215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
216 * @des_cs: loop cursor pointing to the current descendant
217 * @pos_cgrp: used for iteration
218 * @root_cs: target cpuset to walk ancestor of
219 *
220 * Walk @des_cs through the online descendants of @root_cs. Must be used
221 * with RCU read locked. The caller may modify @pos_cgrp by calling
222 * cgroup_rightmost_descendant() to skip subtree.
223 */
224#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
225 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
226 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
227
196/* 228/*
197 * There are two global mutexes guarding cpuset structures. The first 229 * There are two global mutexes guarding cpuset structures - cpuset_mutex
198 * is the main control groups cgroup_mutex, accessed via 230 * and callback_mutex. The latter may nest inside the former. We also
199 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific 231 * require taking task_lock() when dereferencing a task's cpuset pointer.
200 * callback_mutex, below. They can nest. It is ok to first take 232 * See "The task_lock() exception", at the end of this comment.
201 * cgroup_mutex, then nest callback_mutex. We also require taking 233 *
202 * task_lock() when dereferencing a task's cpuset pointer. See "The 234 * A task must hold both mutexes to modify cpusets. If a task holds
203 * task_lock() exception", at the end of this comment. 235 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
204 * 236 * is the only task able to also acquire callback_mutex and be able to
205 * A task must hold both mutexes to modify cpusets. If a task 237 * modify cpusets. It can perform various checks on the cpuset structure
206 * holds cgroup_mutex, then it blocks others wanting that mutex, 238 * first, knowing nothing will change. It can also allocate memory while
207 * ensuring that it is the only task able to also acquire callback_mutex 239 * just holding cpuset_mutex. While it is performing these checks, various
208 * and be able to modify cpusets. It can perform various checks on 240 * callback routines can briefly acquire callback_mutex to query cpusets.
209 * the cpuset structure first, knowing nothing will change. It can 241 * Once it is ready to make the changes, it takes callback_mutex, blocking
210 * also allocate memory while just holding cgroup_mutex. While it is 242 * everyone else.
211 * performing these checks, various callback routines can briefly
212 * acquire callback_mutex to query cpusets. Once it is ready to make
213 * the changes, it takes callback_mutex, blocking everyone else.
214 * 243 *
215 * Calls to the kernel memory allocator can not be made while holding 244 * Calls to the kernel memory allocator can not be made while holding
216 * callback_mutex, as that would risk double tripping on callback_mutex 245 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
232 * guidelines for accessing subsystem state in kernel/cgroup.c 261 * guidelines for accessing subsystem state in kernel/cgroup.c
233 */ 262 */
234 263
264static DEFINE_MUTEX(cpuset_mutex);
235static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
236 266
237/* 267/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
246static DEFINE_SPINLOCK(cpuset_buffer_lock); 276static DEFINE_SPINLOCK(cpuset_buffer_lock);
247 277
248/* 278/*
279 * CPU / memory hotplug is handled asynchronously.
280 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq;
282
283static void cpuset_hotplug_workfn(struct work_struct *work);
284static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
285static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
286
287static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
288
289/*
249 * This is ugly, but preserves the userspace API for existing cpuset 290 * This is ugly, but preserves the userspace API for existing cpuset
250 * users. If someone tries to mount the "cpuset" filesystem, we 291 * users. If someone tries to mount the "cpuset" filesystem, we
251 * silently switch it to mount "cgroup" instead 292 * silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
289 struct cpumask *pmask) 330 struct cpumask *pmask)
290{ 331{
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 332 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
292 cs = cs->parent; 333 cs = parent_cs(cs);
293 if (cs) 334 if (cs)
294 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 335 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
295 else 336 else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 355{
315 while (cs && !nodes_intersects(cs->mems_allowed, 356 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_MEMORY])) 357 node_states[N_MEMORY]))
317 cs = cs->parent; 358 cs = parent_cs(cs);
318 if (cs) 359 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 360 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_MEMORY]); 361 node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
326/* 367/*
327 * update task's spread flag if cpuset's page/slab spread flag is set 368 * update task's spread flag if cpuset's page/slab spread flag is set
328 * 369 *
329 * Called with callback_mutex/cgroup_mutex held 370 * Called with callback_mutex/cpuset_mutex held
330 */ 371 */
331static void cpuset_update_task_spread_flag(struct cpuset *cs, 372static void cpuset_update_task_spread_flag(struct cpuset *cs,
332 struct task_struct *tsk) 373 struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
346 * 387 *
347 * One cpuset is a subset of another if all its allowed CPUs and 388 * One cpuset is a subset of another if all its allowed CPUs and
348 * Memory Nodes are a subset of the other, and its exclusive flags 389 * Memory Nodes are a subset of the other, and its exclusive flags
349 * are only set if the other's are set. Call holding cgroup_mutex. 390 * are only set if the other's are set. Call holding cpuset_mutex.
350 */ 391 */
351 392
352static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 393static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
395 * If we replaced the flag and mask values of the current cpuset 436 * If we replaced the flag and mask values of the current cpuset
396 * (cur) with those values in the trial cpuset (trial), would 437 * (cur) with those values in the trial cpuset (trial), would
397 * our various subset and exclusive rules still be valid? Presumes 438 * our various subset and exclusive rules still be valid? Presumes
398 * cgroup_mutex held. 439 * cpuset_mutex held.
399 * 440 *
400 * 'cur' is the address of an actual, in-use cpuset. Operations 441 * 'cur' is the address of an actual, in-use cpuset. Operations
401 * such as list traversal that depend on the actual address of the 442 * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
412{ 453{
413 struct cgroup *cont; 454 struct cgroup *cont;
414 struct cpuset *c, *par; 455 struct cpuset *c, *par;
456 int ret;
457
458 rcu_read_lock();
415 459
416 /* Each of our child cpusets must be a subset of us */ 460 /* Each of our child cpusets must be a subset of us */
417 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 461 ret = -EBUSY;
418 if (!is_cpuset_subset(cgroup_cs(cont), trial)) 462 cpuset_for_each_child(c, cont, cur)
419 return -EBUSY; 463 if (!is_cpuset_subset(c, trial))
420 } 464 goto out;
421 465
422 /* Remaining checks don't apply to root cpuset */ 466 /* Remaining checks don't apply to root cpuset */
467 ret = 0;
423 if (cur == &top_cpuset) 468 if (cur == &top_cpuset)
424 return 0; 469 goto out;
425 470
426 par = cur->parent; 471 par = parent_cs(cur);
427 472
428 /* We must be a subset of our parent cpuset */ 473 /* We must be a subset of our parent cpuset */
474 ret = -EACCES;
429 if (!is_cpuset_subset(trial, par)) 475 if (!is_cpuset_subset(trial, par))
430 return -EACCES; 476 goto out;
431 477
432 /* 478 /*
433 * If either I or some sibling (!= me) is exclusive, we can't 479 * If either I or some sibling (!= me) is exclusive, we can't
434 * overlap 480 * overlap
435 */ 481 */
436 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 482 ret = -EINVAL;
437 c = cgroup_cs(cont); 483 cpuset_for_each_child(c, cont, par) {
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 484 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
439 c != cur && 485 c != cur &&
440 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 486 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
441 return -EINVAL; 487 goto out;
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 488 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
443 c != cur && 489 c != cur &&
444 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 490 nodes_intersects(trial->mems_allowed, c->mems_allowed))
445 return -EINVAL; 491 goto out;
446 } 492 }
447 493
448 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 494 /*
449 if (cgroup_task_count(cur->css.cgroup)) { 495 * Cpusets with tasks - existing or newly being attached - can't
450 if (cpumask_empty(trial->cpus_allowed) || 496 * have empty cpus_allowed or mems_allowed.
451 nodes_empty(trial->mems_allowed)) { 497 */
452 return -ENOSPC; 498 ret = -ENOSPC;
453 } 499 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
454 } 500 (cpumask_empty(trial->cpus_allowed) ||
501 nodes_empty(trial->mems_allowed)))
502 goto out;
455 503
456 return 0; 504 ret = 0;
505out:
506 rcu_read_unlock();
507 return ret;
457} 508}
458 509
459#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
474 return; 525 return;
475} 526}
476 527
477static void 528static void update_domain_attr_tree(struct sched_domain_attr *dattr,
478update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) 529 struct cpuset *root_cs)
479{ 530{
480 LIST_HEAD(q); 531 struct cpuset *cp;
481 532 struct cgroup *pos_cgrp;
482 list_add(&c->stack_list, &q);
483 while (!list_empty(&q)) {
484 struct cpuset *cp;
485 struct cgroup *cont;
486 struct cpuset *child;
487
488 cp = list_first_entry(&q, struct cpuset, stack_list);
489 list_del(q.next);
490 533
491 if (cpumask_empty(cp->cpus_allowed)) 534 rcu_read_lock();
535 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
536 /* skip the whole subtree if @cp doesn't have any CPU */
537 if (cpumask_empty(cp->cpus_allowed)) {
538 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
492 continue; 539 continue;
540 }
493 541
494 if (is_sched_load_balance(cp)) 542 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp); 543 update_domain_attr(dattr, cp);
496
497 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
498 child = cgroup_cs(cont);
499 list_add_tail(&child->stack_list, &q);
500 }
501 } 544 }
545 rcu_read_unlock();
502} 546}
503 547
504/* 548/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
520 * domains when operating in the severe memory shortage situations 564 * domains when operating in the severe memory shortage situations
521 * that could cause allocation failures below. 565 * that could cause allocation failures below.
522 * 566 *
523 * Must be called with cgroup_lock held. 567 * Must be called with cpuset_mutex held.
524 * 568 *
525 * The three key local variables below are: 569 * The three key local variables below are:
526 * q - a linked-list queue of cpuset pointers, used to implement a 570 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
558static int generate_sched_domains(cpumask_var_t **domains, 602static int generate_sched_domains(cpumask_var_t **domains,
559 struct sched_domain_attr **attributes) 603 struct sched_domain_attr **attributes)
560{ 604{
561 LIST_HEAD(q); /* queue of cpusets to be scanned */
562 struct cpuset *cp; /* scans q */ 605 struct cpuset *cp; /* scans q */
563 struct cpuset **csa; /* array of all cpuset ptrs */ 606 struct cpuset **csa; /* array of all cpuset ptrs */
564 int csn; /* how many cpuset ptrs in csa so far */ 607 int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
567 struct sched_domain_attr *dattr; /* attributes for custom domains */ 610 struct sched_domain_attr *dattr; /* attributes for custom domains */
568 int ndoms = 0; /* number of sched domains in result */ 611 int ndoms = 0; /* number of sched domains in result */
569 int nslot; /* next empty doms[] struct cpumask slot */ 612 int nslot; /* next empty doms[] struct cpumask slot */
613 struct cgroup *pos_cgrp;
570 614
571 doms = NULL; 615 doms = NULL;
572 dattr = NULL; 616 dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
594 goto done; 638 goto done;
595 csn = 0; 639 csn = 0;
596 640
597 list_add(&top_cpuset.stack_list, &q); 641 rcu_read_lock();
598 while (!list_empty(&q)) { 642 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
599 struct cgroup *cont;
600 struct cpuset *child; /* scans child cpusets of cp */
601
602 cp = list_first_entry(&q, struct cpuset, stack_list);
603 list_del(q.next);
604
605 if (cpumask_empty(cp->cpus_allowed))
606 continue;
607
608 /* 643 /*
609 * All child cpusets contain a subset of the parent's cpus, so 644 * Continue traversing beyond @cp iff @cp has some CPUs and
610 * just skip them, and then we call update_domain_attr_tree() 645 * isn't load balancing. The former is obvious. The
611 * to calc relax_domain_level of the corresponding sched 646 * latter: All child cpusets contain a subset of the
612 * domain. 647 * parent's cpus, so just skip them, and then we call
648 * update_domain_attr_tree() to calc relax_domain_level of
649 * the corresponding sched domain.
613 */ 650 */
614 if (is_sched_load_balance(cp)) { 651 if (!cpumask_empty(cp->cpus_allowed) &&
615 csa[csn++] = cp; 652 !is_sched_load_balance(cp))
616 continue; 653 continue;
617 }
618 654
619 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 655 if (is_sched_load_balance(cp))
620 child = cgroup_cs(cont); 656 csa[csn++] = cp;
621 list_add_tail(&child->stack_list, &q); 657
622 } 658 /* skip @cp's subtree */
623 } 659 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
660 }
661 rcu_read_unlock();
624 662
625 for (i = 0; i < csn; i++) 663 for (i = 0; i < csn; i++)
626 csa[i]->pn = i; 664 csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
725/* 763/*
726 * Rebuild scheduler domains. 764 * Rebuild scheduler domains.
727 * 765 *
728 * Call with neither cgroup_mutex held nor within get_online_cpus(). 766 * If the flag 'sched_load_balance' of any cpuset with non-empty
729 * Takes both cgroup_mutex and get_online_cpus(). 767 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
768 * which has that flag enabled, or if any cpuset with a non-empty
769 * 'cpus' is removed, then call this routine to rebuild the
770 * scheduler's dynamic sched domains.
730 * 771 *
731 * Cannot be directly called from cpuset code handling changes 772 * Call with cpuset_mutex held. Takes get_online_cpus().
732 * to the cpuset pseudo-filesystem, because it cannot be called
733 * from code that already holds cgroup_mutex.
734 */ 773 */
735static void do_rebuild_sched_domains(struct work_struct *unused) 774static void rebuild_sched_domains_locked(void)
736{ 775{
737 struct sched_domain_attr *attr; 776 struct sched_domain_attr *attr;
738 cpumask_var_t *doms; 777 cpumask_var_t *doms;
739 int ndoms; 778 int ndoms;
740 779
780 lockdep_assert_held(&cpuset_mutex);
741 get_online_cpus(); 781 get_online_cpus();
742 782
743 /* Generate domain masks and attrs */ 783 /* Generate domain masks and attrs */
744 cgroup_lock();
745 ndoms = generate_sched_domains(&doms, &attr); 784 ndoms = generate_sched_domains(&doms, &attr);
746 cgroup_unlock();
747 785
748 /* Have scheduler rebuild the domains */ 786 /* Have scheduler rebuild the domains */
749 partition_sched_domains(ndoms, doms, attr); 787 partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
751 put_online_cpus(); 789 put_online_cpus();
752} 790}
753#else /* !CONFIG_SMP */ 791#else /* !CONFIG_SMP */
754static void do_rebuild_sched_domains(struct work_struct *unused) 792static void rebuild_sched_domains_locked(void)
755{ 793{
756} 794}
757 795
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
763} 801}
764#endif /* CONFIG_SMP */ 802#endif /* CONFIG_SMP */
765 803
766static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
767
768/*
769 * Rebuild scheduler domains, asynchronously via workqueue.
770 *
771 * If the flag 'sched_load_balance' of any cpuset with non-empty
772 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
773 * which has that flag enabled, or if any cpuset with a non-empty
774 * 'cpus' is removed, then call this routine to rebuild the
775 * scheduler's dynamic sched domains.
776 *
777 * The rebuild_sched_domains() and partition_sched_domains()
778 * routines must nest cgroup_lock() inside get_online_cpus(),
779 * but such cpuset changes as these must nest that locking the
780 * other way, holding cgroup_lock() for much of the code.
781 *
782 * So in order to avoid an ABBA deadlock, the cpuset code handling
783 * these user changes delegates the actual sched domain rebuilding
784 * to a separate workqueue thread, which ends up processing the
785 * above do_rebuild_sched_domains() function.
786 */
787static void async_rebuild_sched_domains(void)
788{
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
790}
791
792/*
793 * Accomplishes the same scheduler domain rebuild as the above
794 * async_rebuild_sched_domains(), however it directly calls the
795 * rebuild routine synchronously rather than calling it via an
796 * asynchronous work thread.
797 *
798 * This can only be called from code that is not holding
799 * cgroup_mutex (not nested in a cgroup_lock() call.)
800 */
801void rebuild_sched_domains(void) 804void rebuild_sched_domains(void)
802{ 805{
803 do_rebuild_sched_domains(NULL); 806 mutex_lock(&cpuset_mutex);
807 rebuild_sched_domains_locked();
808 mutex_unlock(&cpuset_mutex);
804} 809}
805 810
806/** 811/**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
808 * @tsk: task to test 813 * @tsk: task to test
809 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 814 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
810 * 815 *
811 * Call with cgroup_mutex held. May take callback_mutex during call. 816 * Call with cpuset_mutex held. May take callback_mutex during call.
812 * Called for each task in a cgroup by cgroup_scan_tasks(). 817 * Called for each task in a cgroup by cgroup_scan_tasks().
813 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 818 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
814 * words, if its mask is not equal to its cpuset's mask). 819 * words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829 * cpus_allowed mask needs to be changed. 834 * cpus_allowed mask needs to be changed.
830 * 835 *
831 * We don't need to re-check for the cgroup/cpuset membership, since we're 836 * We don't need to re-check for the cgroup/cpuset membership, since we're
832 * holding cgroup_lock() at this point. 837 * holding cpuset_mutex at this point.
833 */ 838 */
834static void cpuset_change_cpumask(struct task_struct *tsk, 839static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan) 840 struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
842 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 847 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
843 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 848 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
844 * 849 *
845 * Called with cgroup_mutex held 850 * Called with cpuset_mutex held
846 * 851 *
847 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 852 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
848 * calling callback functions for each. 853 * calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 heap_free(&heap); 925 heap_free(&heap);
921 926
922 if (is_load_balanced) 927 if (is_load_balanced)
923 async_rebuild_sched_domains(); 928 rebuild_sched_domains_locked();
924 return 0; 929 return 0;
925} 930}
926 931
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
932 * Temporarilly set tasks mems_allowed to target nodes of migration, 937 * Temporarilly set tasks mems_allowed to target nodes of migration,
933 * so that the migration code can allocate pages on these nodes. 938 * so that the migration code can allocate pages on these nodes.
934 * 939 *
935 * Call holding cgroup_mutex, so current's cpuset won't change 940 * Call holding cpuset_mutex, so current's cpuset won't change
936 * during this call, as manage_mutex holds off any cpuset_attach() 941 * during this call, as manage_mutex holds off any cpuset_attach()
937 * calls. Therefore we don't need to take task_lock around the 942 * calls. Therefore we don't need to take task_lock around the
938 * call to guarantee_online_mems(), as we know no one is changing 943 * call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007/* 1012/*
1008 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1013 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1009 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1014 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1010 * memory_migrate flag is set. Called with cgroup_mutex held. 1015 * memory_migrate flag is set. Called with cpuset_mutex held.
1011 */ 1016 */
1012static void cpuset_change_nodemask(struct task_struct *p, 1017static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan) 1018 struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1021 struct cpuset *cs;
1017 int migrate; 1022 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1023 const nodemask_t *oldmem = scan->data;
1019 static nodemask_t newmems; /* protected by cgroup_mutex */ 1024 static nodemask_t newmems; /* protected by cpuset_mutex */
1020 1025
1021 cs = cgroup_cs(scan->cg); 1026 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems); 1027 guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
1043 * @oldmem: old mems_allowed of cpuset cs 1048 * @oldmem: old mems_allowed of cpuset cs
1044 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1049 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1045 * 1050 *
1046 * Called with cgroup_mutex held 1051 * Called with cpuset_mutex held
1047 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1052 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1048 * if @heap != NULL. 1053 * if @heap != NULL.
1049 */ 1054 */
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1065 * take while holding tasklist_lock. Forks can happen - the 1070 * take while holding tasklist_lock. Forks can happen - the
1066 * mpol_dup() cpuset_being_rebound check will catch such forks, 1071 * mpol_dup() cpuset_being_rebound check will catch such forks,
1067 * and rebind their vma mempolicies too. Because we still hold 1072 * and rebind their vma mempolicies too. Because we still hold
1068 * the global cgroup_mutex, we know that no other rebind effort 1073 * the global cpuset_mutex, we know that no other rebind effort
1069 * will be contending for the global variable cpuset_being_rebound. 1074 * will be contending for the global variable cpuset_being_rebound.
1070 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1075 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1071 * is idempotent. Also migrate pages in each mm to new nodes. 1076 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1084 * mempolicies and if the cpuset is marked 'memory_migrate', 1089 * mempolicies and if the cpuset is marked 'memory_migrate',
1085 * migrate the tasks pages to the new memory. 1090 * migrate the tasks pages to the new memory.
1086 * 1091 *
1087 * Call with cgroup_mutex held. May take callback_mutex during call. 1092 * Call with cpuset_mutex held. May take callback_mutex during call.
1088 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1093 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1089 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1094 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1090 * their mempolicies to the cpusets new mems_allowed. 1095 * their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1168 cs->relax_domain_level = val; 1173 cs->relax_domain_level = val;
1169 if (!cpumask_empty(cs->cpus_allowed) && 1174 if (!cpumask_empty(cs->cpus_allowed) &&
1170 is_sched_load_balance(cs)) 1175 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains(); 1176 rebuild_sched_domains_locked();
1172 } 1177 }
1173 1178
1174 return 0; 1179 return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182 * Called by cgroup_scan_tasks() for each task in a cgroup. 1187 * Called by cgroup_scan_tasks() for each task in a cgroup.
1183 * 1188 *
1184 * We don't need to re-check for the cgroup/cpuset membership, since we're 1189 * We don't need to re-check for the cgroup/cpuset membership, since we're
1185 * holding cgroup_lock() at this point. 1190 * holding cpuset_mutex at this point.
1186 */ 1191 */
1187static void cpuset_change_flag(struct task_struct *tsk, 1192static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan) 1193 struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
1195 * @cs: the cpuset in which each task's spread flags needs to be changed 1200 * @cs: the cpuset in which each task's spread flags needs to be changed
1196 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1201 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1197 * 1202 *
1198 * Called with cgroup_mutex held 1203 * Called with cpuset_mutex held
1199 * 1204 *
1200 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1205 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1201 * calling callback functions for each. 1206 * calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1220 * cs: the cpuset to update 1225 * cs: the cpuset to update
1221 * turning_on: whether the flag is being set or cleared 1226 * turning_on: whether the flag is being set or cleared
1222 * 1227 *
1223 * Call with cgroup_mutex held. 1228 * Call with cpuset_mutex held.
1224 */ 1229 */
1225 1230
1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1231static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1260 mutex_unlock(&callback_mutex); 1265 mutex_unlock(&callback_mutex);
1261 1266
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1267 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains(); 1268 rebuild_sched_domains_locked();
1264 1269
1265 if (spread_flag_changed) 1270 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap); 1271 update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1373 return val;
1369} 1374}
1370 1375
1371/* 1376/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must
1374 * persist until attach.
1375 */
1376static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to;
1379
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1377static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1382{ 1378{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1379 struct cpuset *cs = cgroup_cs(cgrp);
1384 struct task_struct *task; 1380 struct task_struct *task;
1385 int ret; 1381 int ret;
1386 1382
1383 mutex_lock(&cpuset_mutex);
1384
1385 ret = -ENOSPC;
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC; 1387 goto out_unlock;
1389 1388
1390 cgroup_taskset_for_each(task, cgrp, tset) { 1389 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /* 1390 /*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1397 * set_cpus_allowed_ptr() on all attached tasks before 1396 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed. 1397 * cpus_allowed may be changed.
1399 */ 1398 */
1399 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL; 1401 goto out_unlock;
1402 if ((ret = security_task_setscheduler(task))) 1402 ret = security_task_setscheduler(task);
1403 return ret; 1403 if (ret)
1404 goto out_unlock;
1404 } 1405 }
1405 1406
1406 /* prepare for attach */ 1407 /*
1407 if (cs == &top_cpuset) 1408 * Mark attach is in progress. This makes validate_change() fail
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1409 * changes which zero cpus/mems_allowed.
1409 else 1410 */
1410 guarantee_online_cpus(cs, cpus_attach); 1411 cs->attach_in_progress++;
1411 1412 ret = 0;
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1413out_unlock:
1414 mutex_unlock(&cpuset_mutex);
1415 return ret;
1416}
1413 1417
1414 return 0; 1418static void cpuset_cancel_attach(struct cgroup *cgrp,
1419 struct cgroup_taskset *tset)
1420{
1421 mutex_lock(&cpuset_mutex);
1422 cgroup_cs(cgrp)->attach_in_progress--;
1423 mutex_unlock(&cpuset_mutex);
1415} 1424}
1416 1425
1426/*
1427 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
1428 * but we can't allocate it dynamically there. Define it global and
1429 * allocate from cpuset_init().
1430 */
1431static cpumask_var_t cpus_attach;
1432
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1433static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1418{ 1434{
1435 /* static bufs protected by cpuset_mutex */
1436 static nodemask_t cpuset_attach_nodemask_from;
1437 static nodemask_t cpuset_attach_nodemask_to;
1419 struct mm_struct *mm; 1438 struct mm_struct *mm;
1420 struct task_struct *task; 1439 struct task_struct *task;
1421 struct task_struct *leader = cgroup_taskset_first(tset); 1440 struct task_struct *leader = cgroup_taskset_first(tset);
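
cpuset_can_attach() now bumps cs->attach_in_progress under cpuset_mutex, and cpuset_cancel_attach()/cpuset_attach() drop it again. The counter exists so that configuration changes can refuse to empty a cpuset while an attach is still between ->can_attach() and ->attach(). A rough sketch of that kind of check, with a hypothetical helper name (the real test lives in validate_change(), which may use a different errno):

/*
 * Illustrative only, not part of the patch.  Assumes cpuset_mutex is
 * held, as in validate_change().
 */
static int attach_in_progress_check(struct cpuset *cs, struct cpuset *trial)
{
	if (cs->attach_in_progress &&
	    (cpumask_empty(trial->cpus_allowed) ||
	     nodes_empty(trial->mems_allowed)))
		return -EBUSY;	/* would strand half-attached tasks */
	return 0;
}
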
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1423 struct cpuset *cs = cgroup_cs(cgrp); 1442 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1443 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1444
1445 mutex_lock(&cpuset_mutex);
1446
1447 /* prepare for attach */
1448 if (cs == &top_cpuset)
1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1450 else
1451 guarantee_online_cpus(cs, cpus_attach);
1452
1453 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1454
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1455 cgroup_taskset_for_each(task, cgrp, tset) {
1427 /* 1456 /*
1428 * can_attach beforehand should guarantee that this doesn't 1457 * can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1448 &cpuset_attach_nodemask_to); 1477 &cpuset_attach_nodemask_to);
1449 mmput(mm); 1478 mmput(mm);
1450 } 1479 }
1480
1481 cs->attach_in_progress--;
1482
1483 /*
1484 * We may have raced with CPU/memory hotunplug. Trigger hotplug
1485 * propagation if @cs doesn't have any CPU or memory. It will move
1486 * the newly added tasks to the nearest parent which can execute.
1487 */
1488 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1489 schedule_cpuset_propagate_hotplug(cs);
1490
1491 mutex_unlock(&cpuset_mutex);
1451} 1492}
1452 1493
1453/* The various types of files and directories in a cpuset file system */ 1494/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
1469 1510
1470static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1511static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1471{ 1512{
1472 int retval = 0;
1473 struct cpuset *cs = cgroup_cs(cgrp); 1513 struct cpuset *cs = cgroup_cs(cgrp);
1474 cpuset_filetype_t type = cft->private; 1514 cpuset_filetype_t type = cft->private;
1515 int retval = -ENODEV;
1475 1516
1476 if (!cgroup_lock_live_group(cgrp)) 1517 mutex_lock(&cpuset_mutex);
1477 return -ENODEV; 1518 if (!is_cpuset_online(cs))
1519 goto out_unlock;
1478 1520
1479 switch (type) { 1521 switch (type) {
1480 case FILE_CPU_EXCLUSIVE: 1522 case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1508 retval = -EINVAL; 1550 retval = -EINVAL;
1509 break; 1551 break;
1510 } 1552 }
1511 cgroup_unlock(); 1553out_unlock:
1554 mutex_unlock(&cpuset_mutex);
1512 return retval; 1555 return retval;
1513} 1556}
1514 1557
1515static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1558static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1516{ 1559{
1517 int retval = 0;
1518 struct cpuset *cs = cgroup_cs(cgrp); 1560 struct cpuset *cs = cgroup_cs(cgrp);
1519 cpuset_filetype_t type = cft->private; 1561 cpuset_filetype_t type = cft->private;
1562 int retval = -ENODEV;
1520 1563
1521 if (!cgroup_lock_live_group(cgrp)) 1564 mutex_lock(&cpuset_mutex);
1522 return -ENODEV; 1565 if (!is_cpuset_online(cs))
1566 goto out_unlock;
1523 1567
1524 switch (type) { 1568 switch (type) {
1525 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1569 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1529 retval = -EINVAL; 1573 retval = -EINVAL;
1530 break; 1574 break;
1531 } 1575 }
1532 cgroup_unlock(); 1576out_unlock:
1577 mutex_unlock(&cpuset_mutex);
1533 return retval; 1578 return retval;
1534} 1579}
1535 1580
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1539static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1584static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1540 const char *buf) 1585 const char *buf)
1541{ 1586{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp); 1587 struct cpuset *cs = cgroup_cs(cgrp);
1544 struct cpuset *trialcs; 1588 struct cpuset *trialcs;
1589 int retval = -ENODEV;
1590
1591 /*
1592 * CPU or memory hotunplug may leave @cs w/o any execution
1593 * resources, in which case the hotplug code asynchronously updates
1594 * configuration and transfers all tasks to the nearest ancestor
1595 * which can execute.
1596 *
1597 * As writes to "cpus" or "mems" may restore @cs's execution
1598 * resources, wait for the previously scheduled operations before
 1599 * proceeding, so that we don't keep removing tasks that were added
1600 * after execution capability is restored.
1601 *
1602 * Flushing cpuset_hotplug_work is enough to synchronize against
 1603 * hotplug handling; however, cpuset_attach() may schedule
1604 * propagation work directly. Flush the workqueue too.
1605 */
1606 flush_work(&cpuset_hotplug_work);
1607 flush_workqueue(cpuset_propagate_hotplug_wq);
1545 1608
1546 if (!cgroup_lock_live_group(cgrp)) 1609 mutex_lock(&cpuset_mutex);
1547 return -ENODEV; 1610 if (!is_cpuset_online(cs))
1611 goto out_unlock;
1548 1612
1549 trialcs = alloc_trial_cpuset(cs); 1613 trialcs = alloc_trial_cpuset(cs);
1550 if (!trialcs) { 1614 if (!trialcs) {
1551 retval = -ENOMEM; 1615 retval = -ENOMEM;
1552 goto out; 1616 goto out_unlock;
1553 } 1617 }
1554 1618
1555 switch (cft->private) { 1619 switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 } 1629 }
1566 1630
1567 free_trial_cpuset(trialcs); 1631 free_trial_cpuset(trialcs);
1568out: 1632out_unlock:
1569 cgroup_unlock(); 1633 mutex_unlock(&cpuset_mutex);
1570 return retval; 1634 return retval;
1571} 1635}
1572 1636
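
The ordering in cpuset_write_resmask() above -- flush the asynchronous hotplug work first, then take cpuset_mutex and re-check that the cpuset is still online -- is the usual way to serialize a synchronous writer against work items that themselves take the same mutex; flushing after taking the mutex would deadlock. A minimal sketch of the pattern with hypothetical names (the real code flushes cpuset_hotplug_work and cpuset_propagate_hotplug_wq):

/* Sketch only; foo_*, cfg_mutex and the helpers are made-up names. */
static int foo_write_config(struct foo *foo, const char *buf)
{
	int ret = -ENODEV;

	flush_work(&foo_hotplug_work);		/* system-wide handler */
	flush_workqueue(foo_propagate_wq);	/* per-object propagation */

	mutex_lock(&cfg_mutex);
	if (!foo_is_online(foo))
		goto out_unlock;
	ret = foo_apply(foo, buf);
out_unlock:
	mutex_unlock(&cfg_mutex);
	return ret;
}
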
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
1790 1854
1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1855static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1792{ 1856{
1793 struct cgroup *parent_cg = cont->parent; 1857 struct cpuset *cs;
1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1796 1858
1797 if (!parent_cg) 1859 if (!cont->parent)
1798 return &top_cpuset.css; 1860 return &top_cpuset.css;
1799 parent = cgroup_cs(parent_cg);
1800 1861
1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1862 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1802 if (!cs) 1863 if (!cs)
1803 return ERR_PTR(-ENOMEM); 1864 return ERR_PTR(-ENOMEM);
1804 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1865 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1806 return ERR_PTR(-ENOMEM); 1867 return ERR_PTR(-ENOMEM);
1807 } 1868 }
1808 1869
1809 cs->flags = 0;
1810 if (is_spread_page(parent))
1811 set_bit(CS_SPREAD_PAGE, &cs->flags);
1812 if (is_spread_slab(parent))
1813 set_bit(CS_SPREAD_SLAB, &cs->flags);
1814 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1815 cpumask_clear(cs->cpus_allowed); 1871 cpumask_clear(cs->cpus_allowed);
1816 nodes_clear(cs->mems_allowed); 1872 nodes_clear(cs->mems_allowed);
1817 fmeter_init(&cs->fmeter); 1873 fmeter_init(&cs->fmeter);
1874 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1818 cs->relax_domain_level = -1; 1875 cs->relax_domain_level = -1;
1819 1876
1820 cs->parent = parent; 1877 return &cs->css;
1878}
1879
1880static int cpuset_css_online(struct cgroup *cgrp)
1881{
1882 struct cpuset *cs = cgroup_cs(cgrp);
1883 struct cpuset *parent = parent_cs(cs);
1884 struct cpuset *tmp_cs;
1885 struct cgroup *pos_cg;
1886
1887 if (!parent)
1888 return 0;
1889
1890 mutex_lock(&cpuset_mutex);
1891
1892 set_bit(CS_ONLINE, &cs->flags);
1893 if (is_spread_page(parent))
1894 set_bit(CS_SPREAD_PAGE, &cs->flags);
1895 if (is_spread_slab(parent))
1896 set_bit(CS_SPREAD_SLAB, &cs->flags);
1897
1821 number_of_cpusets++; 1898 number_of_cpusets++;
1822 1899
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) 1900 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
1824 goto skip_clone; 1901 goto out_unlock;
1825 1902
1826 /* 1903 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 1904 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 1913 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup. 1914 * (and likewise for mems) to the new cgroup.
1838 */ 1915 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { 1916 rcu_read_lock();
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg); 1917 cpuset_for_each_child(tmp_cs, pos_cg, parent) {
1841 1918 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) 1919 rcu_read_unlock();
1843 goto skip_clone; 1920 goto out_unlock;
1921 }
1844 } 1922 }
1923 rcu_read_unlock();
1845 1924
1846 mutex_lock(&callback_mutex); 1925 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed; 1926 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1927 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex); 1928 mutex_unlock(&callback_mutex);
1850skip_clone: 1929out_unlock:
1851 return &cs->css; 1930 mutex_unlock(&cpuset_mutex);
1931 return 0;
1932}
1933
1934static void cpuset_css_offline(struct cgroup *cgrp)
1935{
1936 struct cpuset *cs = cgroup_cs(cgrp);
1937
1938 mutex_lock(&cpuset_mutex);
1939
1940 if (is_sched_load_balance(cs))
1941 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1942
1943 number_of_cpusets--;
1944 clear_bit(CS_ONLINE, &cs->flags);
1945
1946 mutex_unlock(&cpuset_mutex);
1852} 1947}
1853 1948
1854/* 1949/*
1855 * If the cpuset being removed has its flag 'sched_load_balance' 1950 * If the cpuset being removed has its flag 'sched_load_balance'
1856 * enabled, then simulate turning sched_load_balance off, which 1951 * enabled, then simulate turning sched_load_balance off, which
1857 * will call async_rebuild_sched_domains(). 1952 * will call rebuild_sched_domains_locked().
1858 */ 1953 */
1859 1954
1860static void cpuset_css_free(struct cgroup *cont) 1955static void cpuset_css_free(struct cgroup *cont)
1861{ 1956{
1862 struct cpuset *cs = cgroup_cs(cont); 1957 struct cpuset *cs = cgroup_cs(cont);
1863 1958
1864 if (is_sched_load_balance(cs))
1865 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1866
1867 number_of_cpusets--;
1868 free_cpumask_var(cs->cpus_allowed); 1959 free_cpumask_var(cs->cpus_allowed);
1869 kfree(cs); 1960 kfree(cs);
1870} 1961}
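
The css_alloc()/css_online()/css_offline()/css_free() split above keeps allocation and freeing lock-free while everything that needs the parent's state or cpuset_mutex moves into the online/offline pair. As a hedged sketch of how a controller divides work across the four callbacks (the "demo" names and demo_mutex are invented; the callback signatures follow the ones used in this patch):

struct demo {
	struct cgroup_subsys_state css;
	/* controller state inherited in ->css_online() */
};

static struct cgroup_subsys_state *demo_css_alloc(struct cgroup *cgrp)
{
	struct demo *d = kzalloc(sizeof(*d), GFP_KERNEL);

	return d ? &d->css : ERR_PTR(-ENOMEM);	/* no locks, no parent */
}

static int demo_css_online(struct cgroup *cgrp)
{
	mutex_lock(&demo_mutex);
	/* inherit settings from the parent, mark the group online */
	mutex_unlock(&demo_mutex);
	return 0;
}

static void demo_css_offline(struct cgroup *cgrp)
{
	mutex_lock(&demo_mutex);
	/* undo what ->css_online() did */
	mutex_unlock(&demo_mutex);
}

static void demo_css_free(struct cgroup *cgrp)
{
	kfree(container_of(cgrp->subsys[demo_subsys_id], struct demo, css));
}
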
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
1872struct cgroup_subsys cpuset_subsys = { 1963struct cgroup_subsys cpuset_subsys = {
1873 .name = "cpuset", 1964 .name = "cpuset",
1874 .css_alloc = cpuset_css_alloc, 1965 .css_alloc = cpuset_css_alloc,
1966 .css_online = cpuset_css_online,
1967 .css_offline = cpuset_css_offline,
1875 .css_free = cpuset_css_free, 1968 .css_free = cpuset_css_free,
1876 .can_attach = cpuset_can_attach, 1969 .can_attach = cpuset_can_attach,
1970 .cancel_attach = cpuset_cancel_attach,
1877 .attach = cpuset_attach, 1971 .attach = cpuset_attach,
1878 .subsys_id = cpuset_subsys_id, 1972 .subsys_id = cpuset_subsys_id,
1879 .base_cftypes = files, 1973 .base_cftypes = files,
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1924{ 2018{
1925 struct cgroup *new_cgroup = scan->data; 2019 struct cgroup *new_cgroup = scan->data;
1926 2020
2021 cgroup_lock();
1927 cgroup_attach_task(new_cgroup, tsk); 2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
1928} 2024}
1929 2025
1930/** 2026/**
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932 * @from: cpuset in which the tasks currently reside 2028 * @from: cpuset in which the tasks currently reside
1933 * @to: cpuset to which the tasks will be moved 2029 * @to: cpuset to which the tasks will be moved
1934 * 2030 *
1935 * Called with cgroup_mutex held 2031 * Called with cpuset_mutex held
1936 * callback_mutex must not be held, as cpuset_attach() will take it. 2032 * callback_mutex must not be held, as cpuset_attach() will take it.
1937 * 2033 *
1938 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1959 * removing that CPU or node from all cpusets. If this removes the 2055 * removing that CPU or node from all cpusets. If this removes the
1960 * last CPU or node from a cpuset, then move the tasks in the empty 2056 * last CPU or node from a cpuset, then move the tasks in the empty
1961 * cpuset to its next-highest non-empty parent. 2057 * cpuset to its next-highest non-empty parent.
1962 *
1963 * Called with cgroup_mutex held
1964 * callback_mutex must not be held, as cpuset_attach() will take it.
1965 */ 2058 */
1966static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2059static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1967{ 2060{
1968 struct cpuset *parent; 2061 struct cpuset *parent;
1969 2062
1970 /* 2063 /*
1971 * The cgroup's css_sets list is in use if there are tasks
1972 * in the cpuset; the list is empty if there are none;
1973 * the cs->css.refcnt seems always 0.
1974 */
1975 if (list_empty(&cs->css.cgroup->css_sets))
1976 return;
1977
1978 /*
1979 * Find its next-highest non-empty parent, (top cpuset 2064 * Find its next-highest non-empty parent, (top cpuset
1980 * has online cpus, so can't be empty). 2065 * has online cpus, so can't be empty).
1981 */ 2066 */
1982 parent = cs->parent; 2067 parent = parent_cs(cs);
1983 while (cpumask_empty(parent->cpus_allowed) || 2068 while (cpumask_empty(parent->cpus_allowed) ||
1984 nodes_empty(parent->mems_allowed)) 2069 nodes_empty(parent->mems_allowed))
1985 parent = parent->parent; 2070 parent = parent_cs(parent);
1986 2071
1987 move_member_tasks_to_cpuset(cs, parent); 2072 move_member_tasks_to_cpuset(cs, parent);
1988} 2073}
1989 2074
1990/* 2075/**
1991 * Helper function to traverse cpusets. 2076 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
1992 * It can be used to walk the cpuset tree from top to bottom, completing 2077 * @cs: cpuset in interest
1993 * one layer before dropping down to the next (thus always processing a 2078 *
1994 * node before any of its children). 2079 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2080 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2081 * all its tasks are moved to the nearest ancestor with both resources.
1995 */ 2082 */
1996static struct cpuset *cpuset_next(struct list_head *queue) 2083static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
1997{ 2084{
1998 struct cpuset *cp; 2085 static cpumask_t off_cpus;
1999 struct cpuset *child; /* scans child cpusets of cp */ 2086 static nodemask_t off_mems, tmp_mems;
2000 struct cgroup *cont; 2087 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2088 bool is_empty;
2001 2089
2002 if (list_empty(queue)) 2090 mutex_lock(&cpuset_mutex);
2003 return NULL; 2091
2092 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2093 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2004 2094
2005 cp = list_first_entry(queue, struct cpuset, stack_list); 2095 /* remove offline cpus from @cs */
2006 list_del(queue->next); 2096 if (!cpumask_empty(&off_cpus)) {
2007 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2097 mutex_lock(&callback_mutex);
2008 child = cgroup_cs(cont); 2098 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2009 list_add_tail(&child->stack_list, queue); 2099 mutex_unlock(&callback_mutex);
2100 update_tasks_cpumask(cs, NULL);
2101 }
2102
2103 /* remove offline mems from @cs */
2104 if (!nodes_empty(off_mems)) {
2105 tmp_mems = cs->mems_allowed;
2106 mutex_lock(&callback_mutex);
2107 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2108 mutex_unlock(&callback_mutex);
2109 update_tasks_nodemask(cs, &tmp_mems, NULL);
2010 } 2110 }
2011 2111
2012 return cp; 2112 is_empty = cpumask_empty(cs->cpus_allowed) ||
2113 nodes_empty(cs->mems_allowed);
2114
2115 mutex_unlock(&cpuset_mutex);
2116
2117 /*
2118 * If @cs became empty, move tasks to the nearest ancestor with
 2119 * execution resources. This is a full cgroup operation which will
2120 * also call back into cpuset. Should be done outside any lock.
2121 */
2122 if (is_empty)
2123 remove_tasks_in_empty_cpuset(cs);
2124
2125 /* the following may free @cs, should be the last operation */
2126 css_put(&cs->css);
2013} 2127}
2014 2128
2129/**
2130 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2131 * @cs: cpuset of interest
2132 *
2133 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2134 * memory masks according to top_cpuset.
2135 */
2136static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2137{
2138 /*
2139 * Pin @cs. The refcnt will be released when the work item
2140 * finishes executing.
2141 */
2142 if (!css_tryget(&cs->css))
2143 return;
2015 2144
2016/* 2145 /*
2017 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory 2146 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2018 * online/offline) and update the cpusets accordingly. 2147 * cpuset_propagate_hotplug_wq is ordered and propagation will
2019 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such 2148 * happen in the order this function is called.
2020 * cpuset must be moved to a parent cpuset. 2149 */
2150 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2151 css_put(&cs->css);
2152}
2153
2154/**
2155 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2021 * 2156 *
2022 * Called with cgroup_mutex held. We take callback_mutex to modify 2157 * This function is called after either CPU or memory configuration has
2023 * cpus_allowed and mems_allowed. 2158 * changed and updates cpuset accordingly. The top_cpuset is always
2159 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 2160 * order to make cpusets transparent (of no effect) on systems that are
2161 * actively using CPU hotplug but making no active use of cpusets.
2024 * 2162 *
2025 * This walk processes the tree from top to bottom, completing one layer 2163 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2026 * before dropping down to the next. It always processes a node before 2164 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
2027 * any of its children. 2165 * descendants.
2028 * 2166 *
2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY 2167 * Note that CPU offlining during suspend is ignored. We don't modify
2030 * if all present pages from a node are offlined. 2168 * cpusets across suspend/resume cycles at all.
2031 */ 2169 */
2032static void 2170static void cpuset_hotplug_workfn(struct work_struct *work)
2033scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2034{ 2171{
2035 LIST_HEAD(queue); 2172 static cpumask_t new_cpus, tmp_cpus;
2036 struct cpuset *cp; /* scans cpusets being updated */ 2173 static nodemask_t new_mems, tmp_mems;
2037 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2174 bool cpus_updated, mems_updated;
2175 bool cpus_offlined, mems_offlined;
2038 2176
2039 list_add_tail((struct list_head *)&root->stack_list, &queue); 2177 mutex_lock(&cpuset_mutex);
2040 2178
2041 switch (event) { 2179 /* fetch the available cpus/mems and find out which changed how */
2042 case CPUSET_CPU_OFFLINE: 2180 cpumask_copy(&new_cpus, cpu_active_mask);
2043 while ((cp = cpuset_next(&queue)) != NULL) { 2181 new_mems = node_states[N_MEMORY];
2044 2182
2045 /* Continue past cpusets with all cpus online */ 2183 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2046 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2184 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2047 continue; 2185 &new_cpus);
2048 2186
2049 /* Remove offline cpus from this cpuset. */ 2187 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2050 mutex_lock(&callback_mutex); 2188 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2051 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2189 mems_offlined = !nodes_empty(tmp_mems);
2052 cpu_active_mask);
2053 mutex_unlock(&callback_mutex);
2054 2190
2055 /* Move tasks from the empty cpuset to a parent */ 2191 /* synchronize cpus_allowed to cpu_active_mask */
2056 if (cpumask_empty(cp->cpus_allowed)) 2192 if (cpus_updated) {
2057 remove_tasks_in_empty_cpuset(cp); 2193 mutex_lock(&callback_mutex);
2058 else 2194 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2059 update_tasks_cpumask(cp, NULL); 2195 mutex_unlock(&callback_mutex);
2060 } 2196 /* we don't mess with cpumasks of tasks in top_cpuset */
2061 break; 2197 }
2062 2198
2063 case CPUSET_MEM_OFFLINE: 2199 /* synchronize mems_allowed to N_MEMORY */
2064 while ((cp = cpuset_next(&queue)) != NULL) { 2200 if (mems_updated) {
2201 tmp_mems = top_cpuset.mems_allowed;
2202 mutex_lock(&callback_mutex);
2203 top_cpuset.mems_allowed = new_mems;
2204 mutex_unlock(&callback_mutex);
2205 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
2206 }
2065 2207
2066 /* Continue past cpusets with all mems online */ 2208 /* if cpus or mems went down, we need to propagate to descendants */
2067 if (nodes_subset(cp->mems_allowed, 2209 if (cpus_offlined || mems_offlined) {
2068 node_states[N_MEMORY])) 2210 struct cpuset *cs;
2069 continue; 2211 struct cgroup *pos_cgrp;
2070 2212
2071 oldmems = cp->mems_allowed; 2213 rcu_read_lock();
2214 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
2215 schedule_cpuset_propagate_hotplug(cs);
2216 rcu_read_unlock();
2217 }
2072 2218
2073 /* Remove offline mems from this cpuset. */ 2219 mutex_unlock(&cpuset_mutex);
2074 mutex_lock(&callback_mutex);
2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2076 node_states[N_MEMORY]);
2077 mutex_unlock(&callback_mutex);
2078 2220
2079 /* Move tasks from the empty cpuset to a parent */ 2221 /* wait for propagations to finish */
2080 if (nodes_empty(cp->mems_allowed)) 2222 flush_workqueue(cpuset_propagate_hotplug_wq);
2081 remove_tasks_in_empty_cpuset(cp); 2223
2082 else 2224 /* rebuild sched domains if cpus_allowed has changed */
2083 update_tasks_nodemask(cp, &oldmems, NULL); 2225 if (cpus_updated) {
2084 } 2226 struct sched_domain_attr *attr;
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2085 } 2235 }
2086} 2236}
2087 2237
2088/*
2089 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2090 * period. This is necessary in order to make cpusets transparent
2091 * (of no affect) on systems that are actively using CPU hotplug
2092 * but making no active use of cpusets.
2093 *
2094 * The only exception to this is suspend/resume, where we don't
2095 * modify cpusets at all.
2096 *
2097 * This routine ensures that top_cpuset.cpus_allowed tracks
2098 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2099 *
2100 * Called within get_online_cpus(). Needs to call cgroup_lock()
2101 * before calling generate_sched_domains().
2102 *
2103 * @cpu_online: Indicates whether this is a CPU online event (true) or
2104 * a CPU offline event (false).
2105 */
2106void cpuset_update_active_cpus(bool cpu_online) 2238void cpuset_update_active_cpus(bool cpu_online)
2107{ 2239{
2108 struct sched_domain_attr *attr; 2240 /*
2109 cpumask_var_t *doms; 2241 * We're inside cpu hotplug critical region which usually nests
2110 int ndoms; 2242 * inside cgroup synchronization. Bounce actual hotplug processing
2111 2243 * to a work item to avoid reverse locking order.
2112 cgroup_lock(); 2244 *
2113 mutex_lock(&callback_mutex); 2245 * We still need to do partition_sched_domains() synchronously;
2114 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2246 * otherwise, the scheduler will get confused and put tasks to the
2115 mutex_unlock(&callback_mutex); 2247 * dead CPU. Fall back to the default single domain.
2116 2248 * cpuset_hotplug_workfn() will rebuild it as necessary.
2117 if (!cpu_online) 2249 */
2118 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); 2250 partition_sched_domains(1, NULL, NULL);
2119 2251 schedule_work(&cpuset_hotplug_work);
2120 ndoms = generate_sched_domains(&doms, &attr);
2121 cgroup_unlock();
2122
2123 /* Have scheduler rebuild the domains */
2124 partition_sched_domains(ndoms, doms, attr);
2125} 2252}
2126 2253
2127#ifdef CONFIG_MEMORY_HOTPLUG 2254#ifdef CONFIG_MEMORY_HOTPLUG
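
schedule_cpuset_propagate_hotplug() above uses the common pin-then-queue idiom: css_tryget() keeps the cpuset alive until the work item runs, queue_work() returning false means an earlier caller's reference already covers the pending execution, and the css_put() at the end of cpuset_propagate_hotplug_workfn() drops the pin (possibly freeing the cpuset). The same shape, reduced to a sketch with made-up names and a plain kref in place of the css reference:

/* Illustrative sketch of the pin-then-queue idiom; names are invented. */
static void schedule_obj_update(struct obj *obj)
{
	if (!kref_get_unless_zero(&obj->ref))
		return;				/* object already dying */

	/* If the work was already pending, the earlier pin suffices. */
	if (!queue_work(obj_update_wq, &obj->update_work))
		kref_put(&obj->ref, obj_release);
}

static void obj_update_workfn(struct work_struct *work)
{
	struct obj *obj = container_of(work, struct obj, update_work);

	do_update(obj);
	kref_put(&obj->ref, obj_release);	/* may free @obj */
}
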
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
2133static int cpuset_track_online_nodes(struct notifier_block *self, 2260static int cpuset_track_online_nodes(struct notifier_block *self,
2134 unsigned long action, void *arg) 2261 unsigned long action, void *arg)
2135{ 2262{
2136 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2263 schedule_work(&cpuset_hotplug_work);
2137
2138 cgroup_lock();
2139 switch (action) {
2140 case MEM_ONLINE:
2141 oldmems = top_cpuset.mems_allowed;
2142 mutex_lock(&callback_mutex);
2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2144 mutex_unlock(&callback_mutex);
2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2146 break;
2147 case MEM_OFFLINE:
2148 /*
2149 * needn't update top_cpuset.mems_allowed explicitly because
2150 * scan_cpusets_upon_hotplug() will update it.
2151 */
2152 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2153 break;
2154 default:
2155 break;
2156 }
2157 cgroup_unlock();
2158
2159 return NOTIFY_OK; 2264 return NOTIFY_OK;
2160} 2265}
2161#endif 2266#endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
2173 2278
2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2175 2280
2176 cpuset_wq = create_singlethread_workqueue("cpuset"); 2281 cpuset_propagate_hotplug_wq =
2177 BUG_ON(!cpuset_wq); 2282 alloc_ordered_workqueue("cpuset_hotplug", 0);
2283 BUG_ON(!cpuset_propagate_hotplug_wq);
2178} 2284}
2179 2285
2180/** 2286/**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2273 */ 2379 */
2274static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2380static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2275{ 2381{
2276 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) 2382 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2277 cs = cs->parent; 2383 cs = parent_cs(cs);
2278 return cs; 2384 return cs;
2279} 2385}
2280 2386
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2412} 2518}
2413 2519
2414/** 2520/**
2415 * cpuset_unlock - release lock on cpuset changes
2416 *
2417 * Undo the lock taken in a previous cpuset_lock() call.
2418 */
2419
2420void cpuset_unlock(void)
2421{
2422 mutex_unlock(&callback_mutex);
2423}
2424
2425/**
2426 * cpuset_mem_spread_node() - On which node to begin search for a file page 2521 * cpuset_mem_spread_node() - On which node to begin search for a file page
2427 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2522 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2428 * 2523 *
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2511 2606
2512 dentry = task_cs(tsk)->css.cgroup->dentry; 2607 dentry = task_cs(tsk)->css.cgroup->dentry;
2513 spin_lock(&cpuset_buffer_lock); 2608 spin_lock(&cpuset_buffer_lock);
2514 snprintf(cpuset_name, CPUSET_NAME_LEN, 2609
2515 dentry ? (const char *)dentry->d_name.name : "/"); 2610 if (!dentry) {
2611 strcpy(cpuset_name, "/");
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618
2516 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2517 tsk->mems_allowed); 2620 tsk->mems_allowed);
2518 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
2560 * - Used for /proc/<pid>/cpuset. 2663 * - Used for /proc/<pid>/cpuset.
2561 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2664 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2562 * doesn't really matter if tsk->cpuset changes after we read it, 2665 * doesn't really matter if tsk->cpuset changes after we read it,
2563 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2564 * anyway. 2667 * anyway.
2565 */ 2668 */
2566static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2669static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2582 if (!tsk) 2685 if (!tsk)
2583 goto out_free; 2686 goto out_free;
2584 2687
2585 retval = -EINVAL; 2688 rcu_read_lock();
2586 cgroup_lock();
2587 css = task_subsys_state(tsk, cpuset_subsys_id); 2689 css = task_subsys_state(tsk, cpuset_subsys_id);
2588 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2690 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2691 rcu_read_unlock();
2589 if (retval < 0) 2692 if (retval < 0)
2590 goto out_unlock; 2693 goto out_put_task;
2591 seq_puts(m, buf); 2694 seq_puts(m, buf);
2592 seq_putc(m, '\n'); 2695 seq_putc(m, '\n');
2593out_unlock: 2696out_put_task:
2594 cgroup_unlock();
2595 put_task_struct(tsk); 2697 put_task_struct(tsk);
2596out_free: 2698out_free:
2597 kfree(buf); 2699 kfree(buf);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc8..c26278fd4851 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
29 */ 29 */
30#include <linux/pid_namespace.h> 30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 31#include <linux/clocksource.h>
32#include <linux/serial_core.h>
32#include <linux/interrupt.h> 33#include <linux/interrupt.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34#include <linux/console.h> 35#include <linux/console.h>
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e064482..38573f35a5ad 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/kgdb.h> 32#include <linux/kgdb.h>
33#include <linux/kdb.h> 33#include <linux/kdb.h>
34#include <linux/serial_core.h>
34#include <linux/reboot.h> 35#include <linux/reboot.h>
35#include <linux/uaccess.h> 36#include <linux/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4d5f8d5612f3..8875254120b6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv)
1970 1970
1971 kdb_printf("Module Size modstruct Used by\n"); 1971 kdb_printf("Module Size modstruct Used by\n");
1972 list_for_each_entry(mod, kdb_modules, list) { 1972 list_for_each_entry(mod, kdb_modules, list) {
1973 if (mod->state == MODULE_STATE_UNFORMED)
1974 continue;
1973 1975
1974 kdb_printf("%-20s%8u 0x%p ", mod->name, 1976 kdb_printf("%-20s%8u 0x%p ", mod->name,
1975 mod->core_size, (void *)mod); 1977 mod->core_size, (void *)mod);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
106 unsigned long long t2, t3; 106 unsigned long long t2, t3;
107 unsigned long flags; 107 unsigned long flags;
108 struct timespec ts; 108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled;
109 110
110 /* Though tsk->delays accessed later, early exit avoids 111 /* Though tsk->delays accessed later, early exit avoids
111 * unnecessary returning of other data 112 * unnecessary returning of other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
114 goto done; 115 goto done;
115 116
116 tmp = (s64)d->cpu_run_real_total; 117 tmp = (s64)d->cpu_run_real_total;
117 cputime_to_timespec(tsk->utime + tsk->stime, &ts); 118 task_cputime(tsk, &utime, &stime);
119 cputime_to_timespec(utime + stime, &ts);
118 tmp += timespec_to_ns(&ts); 120 tmp += timespec_to_ns(&ts);
119 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 121 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
120 122
121 tmp = (s64)d->cpu_scaled_run_real_total; 123 tmp = (s64)d->cpu_scaled_run_real_total;
122 cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); 124 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
125 cputime_to_timespec(utimescaled + stimescaled, &ts);
123 tmp += timespec_to_ns(&ts); 126 tmp += timespec_to_ns(&ts);
124 d->cpu_scaled_run_real_total = 127 d->cpu_scaled_run_real_total =
125 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 128 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 301079d06f24..b0cd86501c30 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
908} 908}
909 909
910/* 910/*
911 * Initialize event state based on the perf_event_attr::disabled.
912 */
913static inline void perf_event__state_init(struct perf_event *event)
914{
915 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
916 PERF_EVENT_STATE_INACTIVE;
917}
918
919/*
911 * Called at perf_event creation and when events are attached/detached from a 920 * Called at perf_event creation and when events are attached/detached from a
912 * group. 921 * group.
913 */ 922 */
@@ -3682,7 +3691,7 @@ unlock:
3682 3691
3683static int perf_fasync(int fd, struct file *filp, int on) 3692static int perf_fasync(int fd, struct file *filp, int on)
3684{ 3693{
3685 struct inode *inode = filp->f_path.dentry->d_inode; 3694 struct inode *inode = file_inode(filp);
3686 struct perf_event *event = filp->private_data; 3695 struct perf_event *event = filp->private_data;
3687 int retval; 3696 int retval;
3688 3697
@@ -5117,7 +5126,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5117{ 5126{
5118 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5127 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5119 struct perf_event *event; 5128 struct perf_event *event;
5120 struct hlist_node *node;
5121 struct hlist_head *head; 5129 struct hlist_head *head;
5122 5130
5123 rcu_read_lock(); 5131 rcu_read_lock();
@@ -5125,7 +5133,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5125 if (!head) 5133 if (!head)
5126 goto end; 5134 goto end;
5127 5135
5128 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5136 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5129 if (perf_swevent_match(event, type, event_id, data, regs)) 5137 if (perf_swevent_match(event, type, event_id, data, regs))
5130 perf_swevent_event(event, nr, data, regs); 5138 perf_swevent_event(event, nr, data, regs);
5131 } 5139 }
@@ -5410,7 +5418,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5410{ 5418{
5411 struct perf_sample_data data; 5419 struct perf_sample_data data;
5412 struct perf_event *event; 5420 struct perf_event *event;
5413 struct hlist_node *node;
5414 5421
5415 struct perf_raw_record raw = { 5422 struct perf_raw_record raw = {
5416 .size = entry_size, 5423 .size = entry_size,
@@ -5420,7 +5427,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5420 perf_sample_data_init(&data, addr, 0); 5427 perf_sample_data_init(&data, addr, 0);
5421 data.raw = &raw; 5428 data.raw = &raw;
5422 5429
5423 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5430 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5424 if (perf_tp_event_match(event, &data, regs)) 5431 if (perf_tp_event_match(event, &data, regs))
5425 perf_swevent_event(event, count, &data, regs); 5432 perf_swevent_event(event, count, &data, regs);
5426 } 5433 }
@@ -5956,13 +5963,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
5956 pmu->name = name; 5963 pmu->name = name;
5957 5964
5958 if (type < 0) { 5965 if (type < 0) {
5959 int err = idr_pre_get(&pmu_idr, GFP_KERNEL); 5966 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
5960 if (!err) 5967 if (type < 0) {
5961 goto free_pdc; 5968 ret = type;
5962
5963 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5964 if (err) {
5965 ret = err;
5966 goto free_pdc; 5969 goto free_pdc;
5967 } 5970 }
5968 } 5971 }
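
The hunk above is part of the tree-wide conversion to idr_alloc(), which collapses the old idr_pre_get()/idr_get_new_above() pair into one call that both allocates and inserts: it returns the new id (here at least PERF_TYPE_MAX, with an end of 0 meaning no upper bound) or a negative errno such as -ENOMEM or -ENOSPC. A minimal, hedged usage sketch outside of perf:

#include <linux/idr.h>
#include <linux/slab.h>

static DEFINE_IDR(example_idr);

/* Illustrative only: allocate an id >= 16 with no upper bound. */
static int example_register(void *obj)
{
	/* Returns the allocated id (>= 16) or -ENOMEM/-ENOSPC. */
	return idr_alloc(&example_idr, obj, 16, 0, GFP_KERNEL);
}
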
@@ -6162,11 +6165,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6162 6165
6163 if (task) { 6166 if (task) {
6164 event->attach_state = PERF_ATTACH_TASK; 6167 event->attach_state = PERF_ATTACH_TASK;
6168
6169 if (attr->type == PERF_TYPE_TRACEPOINT)
6170 event->hw.tp_target = task;
6165#ifdef CONFIG_HAVE_HW_BREAKPOINT 6171#ifdef CONFIG_HAVE_HW_BREAKPOINT
6166 /* 6172 /*
6167 * hw_breakpoint is a bit difficult here.. 6173 * hw_breakpoint is a bit difficult here..
6168 */ 6174 */
6169 if (attr->type == PERF_TYPE_BREAKPOINT) 6175 else if (attr->type == PERF_TYPE_BREAKPOINT)
6170 event->hw.bp_target = task; 6176 event->hw.bp_target = task;
6171#endif 6177#endif
6172 } 6178 }
@@ -6179,8 +6185,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6179 event->overflow_handler = overflow_handler; 6185 event->overflow_handler = overflow_handler;
6180 event->overflow_handler_context = context; 6186 event->overflow_handler_context = context;
6181 6187
6182 if (attr->disabled) 6188 perf_event__state_init(event);
6183 event->state = PERF_EVENT_STATE_OFF;
6184 6189
6185 pmu = NULL; 6190 pmu = NULL;
6186 6191
@@ -6609,9 +6614,17 @@ SYSCALL_DEFINE5(perf_event_open,
6609 6614
6610 mutex_lock(&gctx->mutex); 6615 mutex_lock(&gctx->mutex);
6611 perf_remove_from_context(group_leader); 6616 perf_remove_from_context(group_leader);
6617
6618 /*
6619 * Removing from the context ends up with disabled
6620 * event. What we want here is event in the initial
6621 * startup state, ready to be add into new context.
6622 */
6623 perf_event__state_init(group_leader);
6612 list_for_each_entry(sibling, &group_leader->sibling_list, 6624 list_for_each_entry(sibling, &group_leader->sibling_list,
6613 group_entry) { 6625 group_entry) {
6614 perf_remove_from_context(sibling); 6626 perf_remove_from_context(sibling);
6627 perf_event__state_init(sibling);
6615 put_ctx(gctx); 6628 put_ctx(gctx);
6616 } 6629 }
6617 mutex_unlock(&gctx->mutex); 6630 mutex_unlock(&gctx->mutex);
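
The two hunks above exist because perf_remove_from_context() leaves an event disabled: when a group leader and its siblings are moved into another context during perf_event_open(), they have to be put back into the state they would have had at creation, which is exactly what perf_event__state_init() derives from perf_event_attr::disabled. From user space that attribute keeps its usual meaning; a small, self-contained sketch (not from the patch) of a counter created disabled and enabled explicitly:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_disabled_counter(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;	/* starts in PERF_EVENT_STATE_OFF */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd >= 0)
		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* start counting */
	return fd;
}
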
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507ed..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
676 err_alloc: 676 err_alloc:
677 for_each_possible_cpu(err_cpu) { 677 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++) 678 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
680 if (err_cpu == cpu) 680 if (err_cpu == cpu)
681 break; 681 break;
682 } 682 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acfbb071..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> /* read_mapping_page */ 27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/export.h>
30#include <linux/rmap.h> /* anon_vma_prepare */ 31#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */ 32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */ 33#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
41#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 42#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
42 43
43static struct rb_root uprobes_tree = RB_ROOT; 44static struct rb_root uprobes_tree = RB_ROOT;
44
45static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46
47#define UPROBES_HASH_SZ 13
48
49/* 45/*
50 * We need separate register/unregister and mmap/munmap lock hashes because 46 * allows us to skip the uprobe_mmap if there are no uprobe events active
51 * of mmap_sem nesting. 47 * at this time. Probably a fine grained per inode count is better?
52 *
53 * uprobe_register() needs to install probes on (potentially) all processes
54 * and thus needs to acquire multiple mmap_sems (consequtively, not
55 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
56 * for the particular process doing the mmap.
57 *
58 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
59 * because of lock order against i_mmap_mutex. This means there's a hole in
60 * the register vma iteration where a mmap() can happen.
61 *
62 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
63 * install a probe where one is already installed.
64 */ 48 */
49#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
65 50
66/* serialize (un)register */ 51static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
67static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
68
69#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
70 52
53#define UPROBES_HASH_SZ 13
71/* serialize uprobe->pending_list */ 54/* serialize uprobe->pending_list */
72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 55static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 56#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
74 57
75static struct percpu_rw_semaphore dup_mmap_sem; 58static struct percpu_rw_semaphore dup_mmap_sem;
76 59
77/*
78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
79 * events active at this time. Probably a fine grained per inode count is
80 * better?
81 */
82static atomic_t uprobe_events = ATOMIC_INIT(0);
83
84/* Have a copy of original instruction */ 60/* Have a copy of original instruction */
85#define UPROBE_COPY_INSN 0 61#define UPROBE_COPY_INSN 0
86/* Dont run handlers when first register/ last unregister in progress*/
87#define UPROBE_RUN_HANDLER 1
88/* Can skip singlestep */ 62/* Can skip singlestep */
89#define UPROBE_SKIP_SSTEP 2 63#define UPROBE_SKIP_SSTEP 1
90 64
91struct uprobe { 65struct uprobe {
92 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
93 atomic_t ref; 67 atomic_t ref;
68 struct rw_semaphore register_rwsem;
94 struct rw_semaphore consumer_rwsem; 69 struct rw_semaphore consumer_rwsem;
95 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
96 struct list_head pending_list; 70 struct list_head pending_list;
97 struct uprobe_consumer *consumers; 71 struct uprobe_consumer *consumers;
98 struct inode *inode; /* Also hold a ref to inode */ 72 struct inode *inode; /* Also hold a ref to inode */
@@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
430 u = __insert_uprobe(uprobe); 404 u = __insert_uprobe(uprobe);
431 spin_unlock(&uprobes_treelock); 405 spin_unlock(&uprobes_treelock);
432 406
433 /* For now assume that the instruction need not be single-stepped */
434 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
435
436 return u; 407 return u;
437} 408}
438 409
@@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
452 423
453 uprobe->inode = igrab(inode); 424 uprobe->inode = igrab(inode);
454 uprobe->offset = offset; 425 uprobe->offset = offset;
426 init_rwsem(&uprobe->register_rwsem);
455 init_rwsem(&uprobe->consumer_rwsem); 427 init_rwsem(&uprobe->consumer_rwsem);
456 mutex_init(&uprobe->copy_mutex); 428 /* For now assume that the instruction need not be single-stepped */
429 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
457 430
458 /* add to uprobes_tree, sorted on inode:offset */ 431 /* add to uprobes_tree, sorted on inode:offset */
459 cur_uprobe = insert_uprobe(uprobe); 432 cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
463 kfree(uprobe); 436 kfree(uprobe);
464 uprobe = cur_uprobe; 437 uprobe = cur_uprobe;
465 iput(inode); 438 iput(inode);
466 } else {
467 atomic_inc(&uprobe_events);
468 } 439 }
469 440
470 return uprobe; 441 return uprobe;
471} 442}
472 443
473static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) 444static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
474{
475 struct uprobe_consumer *uc;
476
477 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
478 return;
479
480 down_read(&uprobe->consumer_rwsem);
481 for (uc = uprobe->consumers; uc; uc = uc->next) {
482 if (!uc->filter || uc->filter(uc, current))
483 uc->handler(uc, regs);
484 }
485 up_read(&uprobe->consumer_rwsem);
486}
487
488/* Returns the previous consumer */
489static struct uprobe_consumer *
490consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
491{ 445{
492 down_write(&uprobe->consumer_rwsem); 446 down_write(&uprobe->consumer_rwsem);
493 uc->next = uprobe->consumers; 447 uc->next = uprobe->consumers;
494 uprobe->consumers = uc; 448 uprobe->consumers = uc;
495 up_write(&uprobe->consumer_rwsem); 449 up_write(&uprobe->consumer_rwsem);
496
497 return uc->next;
498} 450}
499 451
500/* 452/*
@@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
588 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 540 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
589 return ret; 541 return ret;
590 542
591 mutex_lock(&uprobe->copy_mutex); 543 /* TODO: move this into _register, until then we abuse this sem. */
544 down_write(&uprobe->consumer_rwsem);
592 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 545 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
593 goto out; 546 goto out;
594 547
@@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
612 set_bit(UPROBE_COPY_INSN, &uprobe->flags); 565 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
613 566
614 out: 567 out:
615 mutex_unlock(&uprobe->copy_mutex); 568 up_write(&uprobe->consumer_rwsem);
569
570 return ret;
571}
572
573static inline bool consumer_filter(struct uprobe_consumer *uc,
574 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
575{
576 return !uc->filter || uc->filter(uc, ctx, mm);
577}
578
579static bool filter_chain(struct uprobe *uprobe,
580 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
581{
582 struct uprobe_consumer *uc;
583 bool ret = false;
584
585 down_read(&uprobe->consumer_rwsem);
586 for (uc = uprobe->consumers; uc; uc = uc->next) {
587 ret = consumer_filter(uc, ctx, mm);
588 if (ret)
589 break;
590 }
591 up_read(&uprobe->consumer_rwsem);
616 592
617 return ret; 593 return ret;
618} 594}
@@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
624 bool first_uprobe; 600 bool first_uprobe;
625 int ret; 601 int ret;
626 602
627 /*
628 * If probe is being deleted, unregister thread could be done with
629 * the vma-rmap-walk through. Adding a probe now can be fatal since
630 * nobody will be able to cleanup. Also we could be from fork or
631 * mremap path, where the probe might have already been inserted.
632 * Hence behave as if probe already existed.
633 */
634 if (!uprobe->consumers)
635 return 0;
636
637 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); 603 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
638 if (ret) 604 if (ret)
639 return ret; 605 return ret;
@@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
658static int 624static int
659remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 625remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
660{ 626{
661 /* can happen if uprobe_register() fails */
662 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
663 return 0;
664
665 set_bit(MMF_RECALC_UPROBES, &mm->flags); 627 set_bit(MMF_RECALC_UPROBES, &mm->flags);
666 return set_orig_insn(&uprobe->arch, mm, vaddr); 628 return set_orig_insn(&uprobe->arch, mm, vaddr);
667} 629}
668 630
631static inline bool uprobe_is_active(struct uprobe *uprobe)
632{
633 return !RB_EMPTY_NODE(&uprobe->rb_node);
634}
669/* 635/*
670 * There could be threads that have already hit the breakpoint. They 636 * There could be threads that have already hit the breakpoint. They
671 * will recheck the current insn and restart if find_uprobe() fails. 637 * will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
673 */ 639 */
674static void delete_uprobe(struct uprobe *uprobe) 640static void delete_uprobe(struct uprobe *uprobe)
675{ 641{
642 if (WARN_ON(!uprobe_is_active(uprobe)))
643 return;
644
676 spin_lock(&uprobes_treelock); 645 spin_lock(&uprobes_treelock);
677 rb_erase(&uprobe->rb_node, &uprobes_tree); 646 rb_erase(&uprobe->rb_node, &uprobes_tree);
678 spin_unlock(&uprobes_treelock); 647 spin_unlock(&uprobes_treelock);
648 RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
679 iput(uprobe->inode); 649 iput(uprobe->inode);
680 put_uprobe(uprobe); 650 put_uprobe(uprobe);
681 atomic_dec(&uprobe_events);
682} 651}
683 652
684struct map_info { 653struct map_info {
@@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
764 return curr; 733 return curr;
765} 734}
766 735
767static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 736static int
737register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
768{ 738{
739 bool is_register = !!new;
769 struct map_info *info; 740 struct map_info *info;
770 int err = 0; 741 int err = 0;
771 742
@@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
794 vaddr_to_offset(vma, info->vaddr) != uprobe->offset) 765 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
795 goto unlock; 766 goto unlock;
796 767
797 if (is_register) 768 if (is_register) {
798 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 769 /* consult only the "caller", new consumer. */
799 else 770 if (consumer_filter(new,
800 err |= remove_breakpoint(uprobe, mm, info->vaddr); 771 UPROBE_FILTER_REGISTER, mm))
772 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
773 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
774 if (!filter_chain(uprobe,
775 UPROBE_FILTER_UNREGISTER, mm))
776 err |= remove_breakpoint(uprobe, mm, info->vaddr);
777 }
801 778
802 unlock: 779 unlock:
803 up_write(&mm->mmap_sem); 780 up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
810 return err; 787 return err;
811} 788}
812 789
813static int __uprobe_register(struct uprobe *uprobe) 790static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
814{ 791{
815 return register_for_each_vma(uprobe, true); 792 consumer_add(uprobe, uc);
793 return register_for_each_vma(uprobe, uc);
816} 794}
817 795
818static void __uprobe_unregister(struct uprobe *uprobe) 796static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
819{ 797{
820 if (!register_for_each_vma(uprobe, false)) 798 int err;
821 delete_uprobe(uprobe); 799
800 if (!consumer_del(uprobe, uc)) /* WARN? */
801 return;
822 802
803 err = register_for_each_vma(uprobe, NULL);
823 /* TODO : cant unregister? schedule a worker thread */ 804 /* TODO : cant unregister? schedule a worker thread */
805 if (!uprobe->consumers && !err)
806 delete_uprobe(uprobe);
824} 807}
825 808
826/* 809/*
@@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
845 struct uprobe *uprobe; 828 struct uprobe *uprobe;
846 int ret; 829 int ret;
847 830
848 if (!inode || !uc || uc->next) 831 /* Racy, just to catch the obvious mistakes */
849 return -EINVAL;
850
851 if (offset > i_size_read(inode)) 832 if (offset > i_size_read(inode))
852 return -EINVAL; 833 return -EINVAL;
853 834
854 ret = 0; 835 retry:
855 mutex_lock(uprobes_hash(inode));
856 uprobe = alloc_uprobe(inode, offset); 836 uprobe = alloc_uprobe(inode, offset);
857 837 if (!uprobe)
858 if (!uprobe) { 838 return -ENOMEM;
859 ret = -ENOMEM; 839 /*
860 } else if (!consumer_add(uprobe, uc)) { 840 * We can race with uprobe_unregister()->delete_uprobe().
861 ret = __uprobe_register(uprobe); 841 * Check uprobe_is_active() and retry if it is false.
862 if (ret) { 842 */
863 uprobe->consumers = NULL; 843 down_write(&uprobe->register_rwsem);
864 __uprobe_unregister(uprobe); 844 ret = -EAGAIN;
865 } else { 845 if (likely(uprobe_is_active(uprobe))) {
866 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 846 ret = __uprobe_register(uprobe, uc);
867 } 847 if (ret)
848 __uprobe_unregister(uprobe, uc);
868 } 849 }
850 up_write(&uprobe->register_rwsem);
851 put_uprobe(uprobe);
869 852
870 mutex_unlock(uprobes_hash(inode)); 853 if (unlikely(ret == -EAGAIN))
871 if (uprobe) 854 goto retry;
872 put_uprobe(uprobe); 855 return ret;
856}
857EXPORT_SYMBOL_GPL(uprobe_register);
858
859/*
 860 * uprobe_apply - add or remove the breakpoints of an already registered probe.
861 * @inode: the file in which the probe has to be removed.
862 * @offset: offset from the start of the file.
863 * @uc: consumer which wants to add more or remove some breakpoints
864 * @add: add or remove the breakpoints
865 */
866int uprobe_apply(struct inode *inode, loff_t offset,
867 struct uprobe_consumer *uc, bool add)
868{
869 struct uprobe *uprobe;
870 struct uprobe_consumer *con;
871 int ret = -ENOENT;
872
873 uprobe = find_uprobe(inode, offset);
874 if (!uprobe)
875 return ret;
876
877 down_write(&uprobe->register_rwsem);
878 for (con = uprobe->consumers; con && con != uc ; con = con->next)
879 ;
880 if (con)
881 ret = register_for_each_vma(uprobe, add ? uc : NULL);
882 up_write(&uprobe->register_rwsem);
883 put_uprobe(uprobe);
873 884
874 return ret; 885 return ret;
875} 886}
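
uprobe_register() above now tolerates the race with uprobe_unregister()->delete_uprobe(): alloc_uprobe() may hand back an uprobe that has just been erased from the tree, so the active check is redone under register_rwsem and the whole lookup is retried on -EAGAIN. The lookup-or-create-check-retry shape, as a sketch with invented names:

/* Illustrative only; struct thing and its helpers are made up. */
static int register_thing(struct key *key, struct consumer *c)
{
	struct thing *t;
	int ret;

retry:
	t = find_or_create_thing(key);		/* returns a reference */
	if (!t)
		return -ENOMEM;

	down_write(&t->rwsem);
	if (thing_is_active(t))			/* still in the tree? */
		ret = attach_consumer(t, c);
	else
		ret = -EAGAIN;			/* lost a race with deletion */
	up_write(&t->rwsem);
	put_thing(t);

	if (ret == -EAGAIN)
		goto retry;
	return ret;
}
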
@@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
884{ 895{
885 struct uprobe *uprobe; 896 struct uprobe *uprobe;
886 897
887 if (!inode || !uc)
888 return;
889
890 uprobe = find_uprobe(inode, offset); 898 uprobe = find_uprobe(inode, offset);
891 if (!uprobe) 899 if (!uprobe)
892 return; 900 return;
893 901
894 mutex_lock(uprobes_hash(inode)); 902 down_write(&uprobe->register_rwsem);
903 __uprobe_unregister(uprobe, uc);
904 up_write(&uprobe->register_rwsem);
905 put_uprobe(uprobe);
906}
907EXPORT_SYMBOL_GPL(uprobe_unregister);
895 908
896 if (consumer_del(uprobe, uc)) { 909static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
897 if (!uprobe->consumers) { 910{
898 __uprobe_unregister(uprobe); 911 struct vm_area_struct *vma;
899 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 912 int err = 0;
900 } 913
914 down_read(&mm->mmap_sem);
915 for (vma = mm->mmap; vma; vma = vma->vm_next) {
916 unsigned long vaddr;
917 loff_t offset;
918
919 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode)
921 continue;
922
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
924 if (uprobe->offset < offset ||
925 uprobe->offset >= offset + vma->vm_end - vma->vm_start)
926 continue;
927
928 vaddr = offset_to_vaddr(vma, uprobe->offset);
929 err |= remove_breakpoint(uprobe, mm, vaddr);
901 } 930 }
931 up_read(&mm->mmap_sem);
902 932
903 mutex_unlock(uprobes_hash(inode)); 933 return err;
904 if (uprobe)
905 put_uprobe(uprobe);
906} 934}
907 935
908static struct rb_node * 936static struct rb_node *
@@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
979 struct uprobe *uprobe, *u; 1007 struct uprobe *uprobe, *u;
980 struct inode *inode; 1008 struct inode *inode;
981 1009
982 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) 1010 if (no_uprobe_events() || !valid_vma(vma, true))
983 return 0; 1011 return 0;
984 1012
985 inode = vma->vm_file->f_mapping->host; 1013 inode = vma->vm_file->f_mapping->host;
@@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
988 1016
989 mutex_lock(uprobes_mmap_hash(inode)); 1017 mutex_lock(uprobes_mmap_hash(inode));
990 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); 1018 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
991 1019 /*
 1020 * We can race with uprobe_unregister(); this uprobe can already be
 1021 * removed. But in that case filter_chain() must return false, since all
 1022 * consumers have gone away.
1023 */
992 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1024 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
993 if (!fatal_signal_pending(current)) { 1025 if (!fatal_signal_pending(current) &&
1026 filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
994 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 1027 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
995 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1028 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
996 } 1029 }
@@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1025 */ 1058 */
1026void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1059void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1027{ 1060{
1028 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1061 if (no_uprobe_events() || !valid_vma(vma, false))
1029 return; 1062 return;
1030 1063
1031 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ 1064 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1042/* Slot allocation for XOL */ 1075/* Slot allocation for XOL */
1043static int xol_add_vma(struct xol_area *area) 1076static int xol_add_vma(struct xol_area *area)
1044{ 1077{
1045 struct mm_struct *mm; 1078 struct mm_struct *mm = current->mm;
1046 int ret; 1079 int ret = -EALREADY;
1047
1048 area->page = alloc_page(GFP_HIGHUSER);
1049 if (!area->page)
1050 return -ENOMEM;
1051
1052 ret = -EALREADY;
1053 mm = current->mm;
1054 1080
1055 down_write(&mm->mmap_sem); 1081 down_write(&mm->mmap_sem);
1056 if (mm->uprobes_state.xol_area) 1082 if (mm->uprobes_state.xol_area)
1057 goto fail; 1083 goto fail;
1058 1084
1059 ret = -ENOMEM; 1085 ret = -ENOMEM;
1060
1061 /* Try to map as high as possible, this is only a hint. */ 1086 /* Try to map as high as possible, this is only a hint. */
1062 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1087 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1063 if (area->vaddr & ~PAGE_MASK) { 1088 if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
1073 smp_wmb(); /* pairs with get_xol_area() */ 1098 smp_wmb(); /* pairs with get_xol_area() */
1074 mm->uprobes_state.xol_area = area; 1099 mm->uprobes_state.xol_area = area;
1075 ret = 0; 1100 ret = 0;
1076 1101 fail:
1077fail:
1078 up_write(&mm->mmap_sem); 1102 up_write(&mm->mmap_sem);
1079 if (ret)
1080 __free_page(area->page);
1081 1103
1082 return ret; 1104 return ret;
1083} 1105}
1084 1106
1085static struct xol_area *get_xol_area(struct mm_struct *mm)
1086{
1087 struct xol_area *area;
1088
1089 area = mm->uprobes_state.xol_area;
1090 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1091
1092 return area;
1093}
1094
1095/* 1107/*
1096 * xol_alloc_area - Allocate process's xol_area. 1108 * get_xol_area - Allocate process's xol_area if necessary.
1097 * This area will be used for storing instructions for execution out of 1109 * This area will be used for storing instructions for execution out of line.
1098 * line.
1099 * 1110 *
1100 * Returns the allocated area or NULL. 1111 * Returns the allocated area or NULL.
1101 */ 1112 */
1102static struct xol_area *xol_alloc_area(void) 1113static struct xol_area *get_xol_area(void)
1103{ 1114{
1115 struct mm_struct *mm = current->mm;
1104 struct xol_area *area; 1116 struct xol_area *area;
1105 1117
1118 area = mm->uprobes_state.xol_area;
1119 if (area)
1120 goto ret;
1121
1106 area = kzalloc(sizeof(*area), GFP_KERNEL); 1122 area = kzalloc(sizeof(*area), GFP_KERNEL);
1107 if (unlikely(!area)) 1123 if (unlikely(!area))
1108 return NULL; 1124 goto out;
1109 1125
1110 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); 1126 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1111
1112 if (!area->bitmap) 1127 if (!area->bitmap)
1113 goto fail; 1128 goto free_area;
1129
1130 area->page = alloc_page(GFP_HIGHUSER);
1131 if (!area->page)
1132 goto free_bitmap;
1114 1133
1115 init_waitqueue_head(&area->wq); 1134 init_waitqueue_head(&area->wq);
1116 if (!xol_add_vma(area)) 1135 if (!xol_add_vma(area))
1117 return area; 1136 return area;
1118 1137
1119fail: 1138 __free_page(area->page);
1139 free_bitmap:
1120 kfree(area->bitmap); 1140 kfree(area->bitmap);
1141 free_area:
1121 kfree(area); 1142 kfree(area);
1122 1143 out:
1123 return get_xol_area(current->mm); 1144 area = mm->uprobes_state.xol_area;
1145 ret:
1146 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1147 return area;
1124} 1148}
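
The new get_xol_area() above is a lazy publish-once allocation: build the area, try to install it under mmap_sem, and if another thread got there first, free the local copy and use whatever was published; readers pair a dependency barrier with the wmb in xol_add_vma(). A rough user-space analogue using C11 release/acquire in place of the kernel barriers (all names invented, not kernel APIs):

/* Sketch of the publish-once pattern in get_xol_area(), using C11
 * release/acquire in place of the kernel's smp_wmb() and
 * smp_read_barrier_depends(). All names are invented. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct area {
	unsigned long *bitmap;
};

/* plays the role of mm->uprobes_state.xol_area */
static _Atomic(struct area *) shared_area;

static struct area *get_area(void)
{
	struct area *area, *old;

	area = atomic_load_explicit(&shared_area, memory_order_acquire);
	if (area)
		return area;

	area = calloc(1, sizeof(*area));
	if (!area)
		goto out;
	area->bitmap = calloc(4, sizeof(unsigned long));
	if (!area->bitmap)
		goto free_area;

	/* Publish unless somebody beat us to it; the kernel does this
	 * under mmap_sem in xol_add_vma() instead of with a CAS. */
	old = NULL;
	if (atomic_compare_exchange_strong_explicit(&shared_area, &old, area,
						    memory_order_release,
						    memory_order_relaxed))
		return area;

	/* Lost the race: throw our copy away and use the published one. */
	free(area->bitmap);
free_area:
	free(area);
out:
	return atomic_load_explicit(&shared_area, memory_order_acquire);
}

int main(void)
{
	struct area *a = get_area();
	struct area *b = get_area();

	printf("same area: %d\n", a && a == b);
	return 0;
}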
1125 1149
1126/* 1150/*
@@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1186} 1210}
1187 1211
1188/* 1212/*
1189 * xol_get_insn_slot - If was not allocated a slot, then 1213 * xol_get_insn_slot - allocate a slot for xol.
1190 * allocate a slot.
1191 * Returns the allocated slot address or 0. 1214 * Returns the allocated slot address or 0.
1192 */ 1215 */
1193static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) 1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1194{ 1217{
1195 struct xol_area *area; 1218 struct xol_area *area;
1196 unsigned long offset; 1219 unsigned long offset;
1220 unsigned long xol_vaddr;
1197 void *vaddr; 1221 void *vaddr;
1198 1222
1199 area = get_xol_area(current->mm); 1223 area = get_xol_area();
1200 if (!area) { 1224 if (!area)
1201 area = xol_alloc_area(); 1225 return 0;
1202 if (!area)
1203 return 0;
1204 }
1205 current->utask->xol_vaddr = xol_take_insn_slot(area);
1206 1226
1207 /* 1227 xol_vaddr = xol_take_insn_slot(area);
1208 * Initialize the slot if xol_vaddr points to valid 1228 if (unlikely(!xol_vaddr))
1209 * instruction slot.
1210 */
1211 if (unlikely(!current->utask->xol_vaddr))
1212 return 0; 1229 return 0;
1213 1230
1214 current->utask->vaddr = slot_addr; 1231 /* Initialize the slot */
1215 offset = current->utask->xol_vaddr & ~PAGE_MASK; 1232 offset = xol_vaddr & ~PAGE_MASK;
1216 vaddr = kmap_atomic(area->page); 1233 vaddr = kmap_atomic(area->page);
1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1218 kunmap_atomic(vaddr); 1235 kunmap_atomic(vaddr);
@@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
1222 */ 1239 */
1223 flush_dcache_page(area->page); 1240 flush_dcache_page(area->page);
1224 1241
1225 return current->utask->xol_vaddr; 1242 return xol_vaddr;
1226} 1243}
1227 1244
1228/* 1245/*
@@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1240 return; 1257 return;
1241 1258
1242 slot_addr = tsk->utask->xol_vaddr; 1259 slot_addr = tsk->utask->xol_vaddr;
1243 1260 if (unlikely(!slot_addr))
1244 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1245 return; 1261 return;
1246 1262
1247 area = tsk->mm->uprobes_state.xol_area; 1263 area = tsk->mm->uprobes_state.xol_area;
@@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
1303} 1319}
1304 1320
1305/* 1321/*
 1306 * Allocate a uprobe_task object for the task. 1322 * Allocate a uprobe_task object for the task if necessary.
1307 * Called when the thread hits a breakpoint for the first time. 1323 * Called when the thread hits a breakpoint.
1308 * 1324 *
1309 * Returns: 1325 * Returns:
1310 * - pointer to new uprobe_task on success 1326 * - pointer to new uprobe_task on success
1311 * - NULL otherwise 1327 * - NULL otherwise
1312 */ 1328 */
1313static struct uprobe_task *add_utask(void) 1329static struct uprobe_task *get_utask(void)
1314{ 1330{
1315 struct uprobe_task *utask; 1331 if (!current->utask)
1316 1332 current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1317 utask = kzalloc(sizeof *utask, GFP_KERNEL); 1333 return current->utask;
1318 if (unlikely(!utask))
1319 return NULL;
1320
1321 current->utask = utask;
1322 return utask;
1323} 1334}
1324 1335
1325/* Prepare to single-step probed instruction out of line. */ 1336/* Prepare to single-step probed instruction out of line. */
1326static int 1337static int
1327pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) 1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1328{ 1339{
1329 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) 1340 struct uprobe_task *utask;
1330 return 0; 1341 unsigned long xol_vaddr;
1342 int err;
1343
1344 utask = get_utask();
1345 if (!utask)
1346 return -ENOMEM;
1347
1348 xol_vaddr = xol_get_insn_slot(uprobe);
1349 if (!xol_vaddr)
1350 return -ENOMEM;
1351
1352 utask->xol_vaddr = xol_vaddr;
1353 utask->vaddr = bp_vaddr;
1354
1355 err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1356 if (unlikely(err)) {
1357 xol_free_insn_slot(current);
1358 return err;
1359 }
1331 1360
1332 return -EFAULT; 1361 utask->active_uprobe = uprobe;
1362 utask->state = UTASK_SSTEP;
1363 return 0;
1333} 1364}
1334 1365
1335/* 1366/*
@@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1391 * This is not strictly accurate, we can race with 1422 * This is not strictly accurate, we can race with
1392 * uprobe_unregister() and see the already removed 1423 * uprobe_unregister() and see the already removed
1393 * uprobe if delete_uprobe() was not yet called. 1424 * uprobe if delete_uprobe() was not yet called.
1425 * Or this uprobe can be filtered out.
1394 */ 1426 */
1395 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) 1427 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1396 return; 1428 return;
@@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1452 return uprobe; 1484 return uprobe;
1453} 1485}
1454 1486
1487static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{
1489 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE;
1491
1492 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs);
1495
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc;
1499 }
1500
1501 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm);
1504 }
1505 up_read(&uprobe->register_rwsem);
1506}
1507
1455/* 1508/*
1456 * Run handler and ask thread to singlestep. 1509 * Run handler and ask thread to singlestep.
1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1458 */ 1511 */
1459static void handle_swbp(struct pt_regs *regs) 1512static void handle_swbp(struct pt_regs *regs)
1460{ 1513{
1461 struct uprobe_task *utask;
1462 struct uprobe *uprobe; 1514 struct uprobe *uprobe;
1463 unsigned long bp_vaddr; 1515 unsigned long bp_vaddr;
1464 int uninitialized_var(is_swbp); 1516 int uninitialized_var(is_swbp);
@@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
1483 } 1535 }
1484 return; 1536 return;
1485 } 1537 }
1538
1539 /* change it in advance for ->handler() and restart */
1540 instruction_pointer_set(regs, bp_vaddr);
1541
1486 /* 1542 /*
1487 * TODO: move copy_insn/etc into _register and remove this hack. 1543 * TODO: move copy_insn/etc into _register and remove this hack.
1488 * After we hit the bp, _unregister + _register can install the 1544 * After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
1490 */ 1546 */
1491 smp_rmb(); /* pairs with wmb() in install_breakpoint() */ 1547 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1492 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1548 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1493 goto restart; 1549 goto out;
1494
1495 utask = current->utask;
1496 if (!utask) {
1497 utask = add_utask();
1498 /* Cannot allocate; re-execute the instruction. */
1499 if (!utask)
1500 goto restart;
1501 }
1502 1550
1503 handler_chain(uprobe, regs); 1551 handler_chain(uprobe, regs);
1504 if (can_skip_sstep(uprobe, regs)) 1552 if (can_skip_sstep(uprobe, regs))
1505 goto out; 1553 goto out;
1506 1554
1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1555 if (!pre_ssout(uprobe, regs, bp_vaddr))
1508 utask->active_uprobe = uprobe;
1509 utask->state = UTASK_SSTEP;
1510 return; 1556 return;
1511 }
1512 1557
1513restart: 1558 /* can_skip_sstep() succeeded, or restart if can't singlestep */
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519out: 1559out:
1520 put_uprobe(uprobe); 1560 put_uprobe(uprobe);
1521} 1561}
@@ -1609,10 +1649,8 @@ static int __init init_uprobes(void)
1609{ 1649{
1610 int i; 1650 int i;
1611 1651
1612 for (i = 0; i < UPROBES_HASH_SZ; i++) { 1652 for (i = 0; i < UPROBES_HASH_SZ; i++)
1613 mutex_init(&uprobes_mutex[i]);
1614 mutex_init(&uprobes_mmap_mutex[i]); 1653 mutex_init(&uprobes_mmap_mutex[i]);
1615 }
1616 1654
1617 if (percpu_init_rwsem(&dup_mmap_sem)) 1655 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM; 1656 return -ENOMEM;
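
Before the diff moves on to kernel/exit.c, one detail of the new handler_chain() above is worth spelling out: the breakpoint is unapplied only when every consumer returns UPROBE_HANDLER_REMOVE, which the code gets by AND-ing the return values into an accumulator that starts at UPROBE_HANDLER_REMOVE. A tiny self-contained C illustration of that voting scheme (consumer layout and names invented here):

/* The "remove only if every handler agrees" accumulation used by
 * handler_chain(): start from the REMOVE flag and AND in each return
 * value; a single "keep" vote (0) clears it. Names are invented. */
#include <stdio.h>

#define HANDLER_REMOVE	1	/* stand-in for UPROBE_HANDLER_REMOVE */

struct consumer {
	int (*handler)(void);
	struct consumer *next;
};

static int vote_keep(void)   { return 0; }
static int vote_remove(void) { return HANDLER_REMOVE; }

static int run_chain(struct consumer *chain)
{
	struct consumer *c;
	int remove_vote = HANDLER_REMOVE;

	for (c = chain; c; c = c->next)
		remove_vote &= c->handler();

	return remove_vote;	/* non-zero: unapply the breakpoint */
}

int main(void)
{
	struct consumer c2 = { vote_remove, NULL };
	struct consumer c1 = { vote_keep, &c2 };

	printf("all handlers vote remove: %d\n", run_chain(&c2));	/* 1 */
	printf("one handler votes keep:   %d\n", run_chain(&c1));	/* 0 */
	return 0;
}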
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df21937216..51e485ca9935 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h> 22#include <linux/fdtable.h>
23#include <linux/freezer.h>
23#include <linux/binfmts.h> 24#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h> 26#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
31#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 33#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 34#include <linux/delayacct.h>
34#include <linux/freezer.h>
35#include <linux/cgroup.h> 35#include <linux/cgroup.h>
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/signal.h> 37#include <linux/signal.h>
@@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
85 bool group_dead = thread_group_leader(tsk); 85 bool group_dead = thread_group_leader(tsk);
86 struct sighand_struct *sighand; 86 struct sighand_struct *sighand;
87 struct tty_struct *uninitialized_var(tty); 87 struct tty_struct *uninitialized_var(tty);
88 cputime_t utime, stime;
88 89
89 sighand = rcu_dereference_check(tsk->sighand, 90 sighand = rcu_dereference_check(tsk->sighand,
90 lockdep_tasklist_lock_is_held()); 91 lockdep_tasklist_lock_is_held());
@@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
123 * We won't ever get here for the group leader, since it 124 * We won't ever get here for the group leader, since it
124 * will have been the last reference on the signal_struct. 125 * will have been the last reference on the signal_struct.
125 */ 126 */
126 sig->utime += tsk->utime; 127 task_cputime(tsk, &utime, &stime);
127 sig->stime += tsk->stime; 128 sig->utime += utime;
128 sig->gtime += tsk->gtime; 129 sig->stime += stime;
130 sig->gtime += task_gtime(tsk);
129 sig->min_flt += tsk->min_flt; 131 sig->min_flt += tsk->min_flt;
130 sig->maj_flt += tsk->maj_flt; 132 sig->maj_flt += tsk->maj_flt;
131 sig->nvcsw += tsk->nvcsw; 133 sig->nvcsw += tsk->nvcsw;
@@ -483,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
483 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 485 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
484 if (!self.task) /* see coredump_finish() */ 486 if (!self.task) /* see coredump_finish() */
485 break; 487 break;
486 schedule(); 488 freezable_schedule();
487 } 489 }
488 __set_task_state(tsk, TASK_RUNNING); 490 __set_task_state(tsk, TASK_RUNNING);
489 down_read(&mm->mmap_sem); 491 down_read(&mm->mmap_sem);
@@ -833,7 +835,7 @@ void do_exit(long code)
833 /* 835 /*
834 * Make sure we are holding no locks: 836 * Make sure we are holding no locks:
835 */ 837 */
836 debug_check_no_locks_held(tsk); 838 debug_check_no_locks_held();
837 /* 839 /*
838 * We can do this unlocked here. The futex code uses this flag 840 * We can do this unlocked here. The futex code uses this flag
839 * just to verify whether the pi state cleanup has been done 841 * just to verify whether the pi state cleanup has been done
@@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1092 sig = p->signal; 1094 sig = p->signal;
1093 psig->cutime += tgutime + sig->cutime; 1095 psig->cutime += tgutime + sig->cutime;
1094 psig->cstime += tgstime + sig->cstime; 1096 psig->cstime += tgstime + sig->cstime;
1095 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1097 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1096 psig->cmin_flt += 1098 psig->cmin_flt +=
1097 p->min_flt + sig->min_flt + sig->cmin_flt; 1099 p->min_flt + sig->min_flt + sig->cmin_flt;
1098 psig->cmaj_flt += 1100 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 65ca6d27f24e..8d932b1c9056 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
413 tmp->vm_next = tmp->vm_prev = NULL; 413 tmp->vm_next = tmp->vm_prev = NULL;
414 file = tmp->vm_file; 414 file = tmp->vm_file;
415 if (file) { 415 if (file) {
416 struct inode *inode = file->f_path.dentry->d_inode; 416 struct inode *inode = file_inode(file);
417 struct address_space *mapping = file->f_mapping; 417 struct address_space *mapping = file->f_mapping;
418 418
419 get_file(file); 419 get_file(file);
@@ -1233,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1234 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1234 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1235#endif 1235#endif
1236#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1237 seqlock_init(&p->vtime_seqlock);
1238 p->vtime_snap = 0;
1239 p->vtime_snap_whence = VTIME_SLEEPING;
1240#endif
1241
1236#if defined(SPLIT_RSS_COUNTING) 1242#if defined(SPLIT_RSS_COUNTING)
1237 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1243 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1238#endif 1244#endif
@@ -1668,8 +1674,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1668 int, tls_val) 1674 int, tls_val)
1669#endif 1675#endif
1670{ 1676{
1671 return do_fork(clone_flags, newsp, 0, 1677 long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
1672 parent_tidptr, child_tidptr); 1678 asmlinkage_protect(5, ret, clone_flags, newsp,
1679 parent_tidptr, child_tidptr, tls_val);
1680 return ret;
1673} 1681}
1674#endif 1682#endif
1675 1683
@@ -1853,10 +1861,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1853 exit_sem(current); 1861 exit_sem(current);
1854 } 1862 }
1855 1863
1856 if (new_nsproxy) { 1864 if (new_nsproxy)
1857 switch_task_namespaces(current, new_nsproxy); 1865 switch_task_namespaces(current, new_nsproxy);
1858 new_nsproxy = NULL;
1859 }
1860 1866
1861 task_lock(current); 1867 task_lock(current);
1862 1868
@@ -1886,9 +1892,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1886 } 1892 }
1887 } 1893 }
1888 1894
1889 if (new_nsproxy)
1890 put_nsproxy(new_nsproxy);
1891
1892bad_unshare_cleanup_cred: 1895bad_unshare_cleanup_cred:
1893 if (new_cred) 1896 if (new_cred)
1894 put_cred(new_cred); 1897 put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca003..fbc07a29ec53 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h>
63 64
64#include <asm/futex.h> 65#include <asm/futex.h>
65 66
@@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2471 if (!futex_cmpxchg_enabled) 2472 if (!futex_cmpxchg_enabled)
2472 return -ENOSYS; 2473 return -ENOSYS;
2473 2474
2474 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2475
2476 rcu_read_lock(); 2475 rcu_read_lock();
2477 2476
2478 ret = -ESRCH; 2477 ret = -ESRCH;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005fc..f9f44fd4d34d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/syscalls.h>
14 15
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
116 } 117 }
117} 118}
118 119
119asmlinkage long 120COMPAT_SYSCALL_DEFINE2(set_robust_list,
120compat_sys_set_robust_list(struct compat_robust_list_head __user *head, 121 struct compat_robust_list_head __user *, head,
121 compat_size_t len) 122 compat_size_t, len)
122{ 123{
123 if (!futex_cmpxchg_enabled) 124 if (!futex_cmpxchg_enabled)
124 return -ENOSYS; 125 return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
131 return 0; 132 return 0;
132} 133}
133 134
134asmlinkage long 135COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
135compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, 136 compat_uptr_t __user *, head_ptr,
136 compat_size_t __user *len_ptr) 137 compat_size_t __user *, len_ptr)
137{ 138{
138 struct compat_robust_list_head __user *head; 139 struct compat_robust_list_head __user *head;
139 unsigned long ret; 140 unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
142 if (!futex_cmpxchg_enabled) 143 if (!futex_cmpxchg_enabled)
143 return -ENOSYS; 144 return -ENOSYS;
144 145
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock(); 146 rcu_read_lock();
148 147
149 ret = -ESRCH; 148 ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
172 return ret; 171 return ret;
173} 172}
174 173
175asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, 174COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
176 struct compat_timespec __user *utime, u32 __user *uaddr2, 175 struct compat_timespec __user *, utime, u32 __user *, uaddr2,
177 u32 val3) 176 u32, val3)
178{ 177{
179 struct timespec ts; 178 struct timespec ts;
180 ktime_t t, *tp = NULL; 179 ktime_t t, *tp = NULL;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..d4da55d1fb65 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
39 default n 39 default n
40 ---help--- 40 ---help---
 41 This option activates profiling for the entire kernel. 41 This option activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..cc47812d3feb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h>
47#include <linux/timer.h> 49#include <linux/timer.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
@@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
640 * and expiry check is done in the hrtimer_interrupt or in the softirq. 642 * and expiry check is done in the hrtimer_interrupt or in the softirq.
641 */ 643 */
642static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 644static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
643 struct hrtimer_clock_base *base, 645 struct hrtimer_clock_base *base)
644 int wakeup)
645{ 646{
646 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 647 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
647 if (wakeup) {
648 raw_spin_unlock(&base->cpu_base->lock);
649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
650 raw_spin_lock(&base->cpu_base->lock);
651 } else
652 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
653
654 return 1;
655 }
656
657 return 0;
658} 648}
659 649
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 650static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
735static inline void 725static inline void
736hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 726hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
737static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 727static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
738 struct hrtimer_clock_base *base, 728 struct hrtimer_clock_base *base)
739 int wakeup)
740{ 729{
741 return 0; 730 return 0;
742} 731}
@@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
995 * 984 *
996 * XXX send_remote_softirq() ? 985 * XXX send_remote_softirq() ?
997 */ 986 */
998 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 987 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
999 hrtimer_enqueue_reprogram(timer, new_base, wakeup); 988 && hrtimer_enqueue_reprogram(timer, new_base)) {
989 if (wakeup) {
990 /*
991 * We need to drop cpu_base->lock to avoid a
992 * lock ordering issue vs. rq->lock.
993 */
994 raw_spin_unlock(&new_base->cpu_base->lock);
995 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
996 local_irq_restore(flags);
997 return ret;
998 } else {
999 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1000 }
1001 }
1000 1002
1001 unlock_hrtimer_base(timer, &flags); 1003 unlock_hrtimer_base(timer, &flags);
1002 1004
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
90EXPORT_SYMBOL(irq_set_handler_data); 90EXPORT_SYMBOL(irq_set_handler_data);
91 91
92/** 92/**
93 * irq_set_msi_desc - set MSI descriptor data for an irq 93 * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
94 * @irq: Interrupt number 94 * @irq_base: Interrupt number base
95 * @entry: Pointer to MSI descriptor data 95 * @irq_offset: Interrupt number offset
96 * @entry: Pointer to MSI descriptor data
96 * 97 *
97 * Set the MSI descriptor entry for an irq 98 * Set the MSI descriptor entry for an irq at offset
98 */ 99 */
99int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 100int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
101 struct msi_desc *entry)
100{ 102{
101 unsigned long flags; 103 unsigned long flags;
102 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 104 struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
103 105
104 if (!desc) 106 if (!desc)
105 return -EINVAL; 107 return -EINVAL;
106 desc->irq_data.msi_desc = entry; 108 desc->irq_data.msi_desc = entry;
107 if (entry) 109 if (entry && !irq_offset)
108 entry->irq = irq; 110 entry->irq = irq_base;
109 irq_put_desc_unlock(desc, flags); 111 irq_put_desc_unlock(desc, flags);
110 return 0; 112 return 0;
111} 113}
112 114
113/** 115/**
116 * irq_set_msi_desc - set MSI descriptor data for an irq
117 * @irq: Interrupt number
118 * @entry: Pointer to MSI descriptor data
119 *
120 * Set the MSI descriptor entry for an irq
121 */
122int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
123{
124 return irq_set_msi_desc_off(irq, 0, entry);
125}
126
127/**
114 * irq_set_chip_data - set irq chip data for an irq 128 * irq_set_chip_data - set irq chip data for an irq
115 * @irq: Interrupt number 129 * @irq: Interrupt number
116 * @data: Pointer to chip specific data 130 * @data: Pointer to chip specific data
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288fa479..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/sched/rt.h>
19#include <linux/task_work.h> 20#include <linux/task_work.h>
20 21
21#include "internals.h" 22#include "internals.h"
@@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1524out: 1525out:
1525 irq_put_desc_unlock(desc, flags); 1526 irq_put_desc_unlock(desc, flags);
1526} 1527}
1528EXPORT_SYMBOL_GPL(enable_percpu_irq);
1527 1529
1528void disable_percpu_irq(unsigned int irq) 1530void disable_percpu_irq(unsigned int irq)
1529{ 1531{
@@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
1537 irq_percpu_disable(desc, cpu); 1539 irq_percpu_disable(desc, cpu);
1538 irq_put_desc_unlock(desc, flags); 1540 irq_put_desc_unlock(desc, flags);
1539} 1541}
1542EXPORT_SYMBOL_GPL(disable_percpu_irq);
1540 1543
1541/* 1544/*
1542 * Internal function to unregister a percpu irqaction. 1545 * Internal function to unregister a percpu irqaction.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323a..397db02209ed 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file_inode(file))->data;
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
80 80
81 /* 81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the 82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well. 83 * first.
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || 87 (action->flags & __IRQF_TIMER))
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
90 goto out; 88 goto out;
91 89
92 /* Already running on another processor */ 90 /* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
104 do { 102 do {
105 if (handle_irq_event(desc) == IRQ_HANDLED) 103 if (handle_irq_event(desc) == IRQ_HANDLED)
106 ret = IRQ_HANDLED; 104 ret = IRQ_HANDLED;
105 /* Make sure that there is still a valid action */
107 action = desc->action; 106 action = desc->action;
108 } while ((desc->istate & IRQS_PENDING) && action); 107 } while ((desc->istate & IRQS_PENDING) && action);
109 desc->istate &= ~IRQS_POLL_INPROGRESS; 108 desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/irqflags.h> 14#include <linux/irqflags.h>
15#include <linux/sched.h>
16#include <linux/tick.h>
17#include <linux/cpu.h>
18#include <linux/notifier.h>
15#include <asm/processor.h> 19#include <asm/processor.h>
16 20
17/*
18 * An entry can be in one of four states:
19 *
20 * free NULL, 0 -> {claimed} : free to be used
21 * claimed NULL, 3 -> {pending} : claimed to be enqueued
22 * pending next, 3 -> {busy} : queued, pending callback
23 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
24 */
25
26#define IRQ_WORK_PENDING 1UL
27#define IRQ_WORK_BUSY 2UL
28#define IRQ_WORK_FLAGS 3UL
29 21
30static DEFINE_PER_CPU(struct llist_head, irq_work_list); 22static DEFINE_PER_CPU(struct llist_head, irq_work_list);
23static DEFINE_PER_CPU(int, irq_work_raised);
31 24
32/* 25/*
33 * Claim the entry so that no one else will poke at it. 26 * Claim the entry so that no one else will poke at it.
34 */ 27 */
35static bool irq_work_claim(struct irq_work *work) 28static bool irq_work_claim(struct irq_work *work)
36{ 29{
37 unsigned long flags, nflags; 30 unsigned long flags, oflags, nflags;
38 31
32 /*
33 * Start with our best wish as a premise but only trust any
34 * flag value after cmpxchg() result.
35 */
36 flags = work->flags & ~IRQ_WORK_PENDING;
39 for (;;) { 37 for (;;) {
40 flags = work->flags;
41 if (flags & IRQ_WORK_PENDING)
42 return false;
43 nflags = flags | IRQ_WORK_FLAGS; 38 nflags = flags | IRQ_WORK_FLAGS;
44 if (cmpxchg(&work->flags, flags, nflags) == flags) 39 oflags = cmpxchg(&work->flags, flags, nflags);
40 if (oflags == flags)
45 break; 41 break;
42 if (oflags & IRQ_WORK_PENDING)
43 return false;
44 flags = oflags;
46 cpu_relax(); 45 cpu_relax();
47 } 46 }
48 47
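
The reworked irq_work_claim() above is a classic lock-free claim loop: guess that PENDING is clear, cmpxchg the flags, and trust only the value the cmpxchg hands back. Approximately the same loop in portable C11 atomics (a sketch, not the kernel implementation; the flag values mirror the removed IRQ_WORK_* comment block):

/* Lock-free claim loop in the style of irq_work_claim(), using C11
 * atomics instead of the kernel's cmpxchg(). PENDING|BUSY together
 * mean "claimed", as in the old comment block above. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WORK_PENDING	1UL
#define WORK_BUSY	2UL
#define WORK_CLAIMED	(WORK_PENDING | WORK_BUSY)

struct work {
	atomic_ulong flags;
};

static bool work_claim(struct work *w)
{
	/* Start from our best guess (PENDING clear) and only trust the
	 * value compare_exchange reports back on failure. */
	unsigned long flags = atomic_load(&w->flags) & ~WORK_PENDING;

	for (;;) {
		if (atomic_compare_exchange_weak(&w->flags, &flags,
						 flags | WORK_CLAIMED))
			return true;
		/* flags now holds the value actually observed */
		if (flags & WORK_PENDING)
			return false;	/* someone else already queued it */
	}
}

int main(void)
{
	struct work w;

	atomic_init(&w.flags, 0);
	printf("first claim:  %d\n", work_claim(&w));	/* 1 */
	printf("second claim: %d\n", work_claim(&w));	/* 0: already pending */
	return 0;
}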
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
57} 56}
58 57
59/* 58/*
60 * Queue the entry and raise the IPI if needed. 59 * Enqueue the irq_work @entry unless it's already pending
60 * somewhere.
61 *
62 * Can be re-enqueued while the callback is still in progress.
61 */ 63 */
62static void __irq_work_queue(struct irq_work *work) 64void irq_work_queue(struct irq_work *work)
63{ 65{
64 bool empty; 66 /* Only queue if not already pending */
67 if (!irq_work_claim(work))
68 return;
65 69
70 /* Queue the entry and raise the IPI if needed. */
66 preempt_disable(); 71 preempt_disable();
67 72
68 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
69 /* The list was empty, raise self-interrupt to start processing. */ 74
70 if (empty) 75 /*
71 arch_irq_work_raise(); 76 * If the work is not "lazy" or the tick is stopped, raise the irq
77 * work interrupt (if supported by the arch), otherwise, just wait
78 * for the next tick.
79 */
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise();
83 }
72 84
73 preempt_enable(); 85 preempt_enable();
74} 86}
87EXPORT_SYMBOL_GPL(irq_work_queue);
75 88
76/* 89bool irq_work_needs_cpu(void)
77 * Enqueue the irq_work @entry, returns true on success, failure when the
78 * @entry was already enqueued by someone else.
79 *
80 * Can be re-enqueued while the callback is still in progress.
81 */
82bool irq_work_queue(struct irq_work *work)
83{ 90{
84 if (!irq_work_claim(work)) { 91 struct llist_head *this_list;
85 /* 92
86 * Already enqueued, can't do! 93 this_list = &__get_cpu_var(irq_work_list);
87 */ 94 if (llist_empty(this_list))
88 return false; 95 return false;
89 }
90 96
91 __irq_work_queue(work); 97 /* All work should have been flushed before going offline */
98 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
99
92 return true; 100 return true;
93} 101}
94EXPORT_SYMBOL_GPL(irq_work_queue);
95 102
96/* 103static void __irq_work_run(void)
97 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
98 * context with local IRQs disabled.
99 */
100void irq_work_run(void)
101{ 104{
105 unsigned long flags;
102 struct irq_work *work; 106 struct irq_work *work;
103 struct llist_head *this_list; 107 struct llist_head *this_list;
104 struct llist_node *llnode; 108 struct llist_node *llnode;
105 109
110
111 /*
112 * Reset the "raised" state right before we check the list because
113 * an NMI may enqueue after we find the list empty from the runner.
114 */
115 __this_cpu_write(irq_work_raised, 0);
116 barrier();
117
106 this_list = &__get_cpu_var(irq_work_list); 118 this_list = &__get_cpu_var(irq_work_list);
107 if (llist_empty(this_list)) 119 if (llist_empty(this_list))
108 return; 120 return;
109 121
110 BUG_ON(!in_irq());
111 BUG_ON(!irqs_disabled()); 122 BUG_ON(!irqs_disabled());
112 123
113 llnode = llist_del_all(this_list); 124 llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
119 /* 130 /*
120 * Clear the PENDING bit, after this point the @work 131 * Clear the PENDING bit, after this point the @work
121 * can be re-used. 132 * can be re-used.
133 * Make it immediately visible so that other CPUs trying
134 * to claim that work don't rely on us to handle their data
135 * while we are in the middle of the func.
122 */ 136 */
123 work->flags = IRQ_WORK_BUSY; 137 flags = work->flags & ~IRQ_WORK_PENDING;
138 xchg(&work->flags, flags);
139
124 work->func(work); 140 work->func(work);
125 /* 141 /*
126 * Clear the BUSY bit and return to the free state if 142 * Clear the BUSY bit and return to the free state if
127 * no-one else claimed it meanwhile. 143 * no-one else claimed it meanwhile.
128 */ 144 */
129 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); 145 (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
130 } 146 }
131} 147}
148
149/*
150 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
151 * context with local IRQs disabled.
152 */
153void irq_work_run(void)
154{
155 BUG_ON(!in_irq());
156 __irq_work_run();
157}
132EXPORT_SYMBOL_GPL(irq_work_run); 158EXPORT_SYMBOL_GPL(irq_work_run);
133 159
134/* 160/*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
143 cpu_relax(); 169 cpu_relax();
144} 170}
145EXPORT_SYMBOL_GPL(irq_work_sync); 171EXPORT_SYMBOL_GPL(irq_work_sync);
172
173#ifdef CONFIG_HOTPLUG_CPU
174static int irq_work_cpu_notify(struct notifier_block *self,
175 unsigned long action, void *hcpu)
176{
177 long cpu = (long)hcpu;
178
179 switch (action) {
180 case CPU_DYING:
181 /* Called from stop_machine */
182 if (WARN_ON_ONCE(cpu != smp_processor_id()))
183 break;
184 __irq_work_run();
185 break;
186 default:
187 break;
188 }
189 return NOTIFY_OK;
190}
191
192static struct notifier_block cpu_notify;
193
194static __init int irq_work_init_cpu_notifier(void)
195{
196 cpu_notify.notifier_call = irq_work_cpu_notify;
197 cpu_notify.priority = 0;
198 register_cpu_notifier(&cpu_notify);
199 return 0;
200}
201device_initcall(irq_work_init_cpu_notifier);
202
203#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5d..bddd3d7a74b6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
54 .end = 0, 54 .end = 0,
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = {
58 .name = "Crash kernel low",
59 .start = 0,
60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
62};
57 63
58int kexec_should_crash(struct task_struct *p) 64int kexec_should_crash(struct task_struct *p)
59{ 65{
@@ -223,6 +229,8 @@ out:
223 229
224} 230}
225 231
232static void kimage_free_page_list(struct list_head *list);
233
226static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 234static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
227 unsigned long nr_segments, 235 unsigned long nr_segments,
228 struct kexec_segment __user *segments) 236 struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 if (result) 244 if (result)
237 goto out; 245 goto out;
238 246
239 *rimage = image;
240
241 /* 247 /*
242 * Find a location for the control code buffer, and add it 248 * Find a location for the control code buffer, and add it
243 * the vector of segments so that it's pages will also be 249 * the vector of segments so that it's pages will also be
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
248 get_order(KEXEC_CONTROL_PAGE_SIZE)); 254 get_order(KEXEC_CONTROL_PAGE_SIZE));
249 if (!image->control_code_page) { 255 if (!image->control_code_page) {
250 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 256 printk(KERN_ERR "Could not allocate control_code_buffer\n");
251 goto out; 257 goto out_free;
252 } 258 }
253 259
254 image->swap_page = kimage_alloc_control_pages(image, 0); 260 image->swap_page = kimage_alloc_control_pages(image, 0);
255 if (!image->swap_page) { 261 if (!image->swap_page) {
256 printk(KERN_ERR "Could not allocate swap buffer\n"); 262 printk(KERN_ERR "Could not allocate swap buffer\n");
257 goto out; 263 goto out_free;
258 } 264 }
259 265
260 result = 0; 266 *rimage = image;
261 out: 267 return 0;
262 if (result == 0)
263 *rimage = image;
264 else
265 kfree(image);
266 268
269out_free:
270 kimage_free_page_list(&image->control_pages);
271 kfree(image);
272out:
267 return result; 273 return result;
268} 274}
269 275
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
310 mend = mstart + image->segment[i].memsz - 1; 316 mend = mstart + image->segment[i].memsz - 1;
311 /* Ensure we are within the crash kernel limits */ 317 /* Ensure we are within the crash kernel limits */
312 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 318 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
313 goto out; 319 goto out_free;
314 } 320 }
315 321
316 /* 322 /*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
323 get_order(KEXEC_CONTROL_PAGE_SIZE)); 329 get_order(KEXEC_CONTROL_PAGE_SIZE));
324 if (!image->control_code_page) { 330 if (!image->control_code_page) {
325 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 331 printk(KERN_ERR "Could not allocate control_code_buffer\n");
326 goto out; 332 goto out_free;
327 } 333 }
328 334
329 result = 0; 335 *rimage = image;
330out: 336 return 0;
331 if (result == 0)
332 *rimage = image;
333 else
334 kfree(image);
335 337
338out_free:
339 kfree(image);
340out:
336 return result; 341 return result;
337} 342}
338 343
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
497 502
498 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 503 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
499 break; 504 break;
500 if (hole_end > crashk_res.end)
501 break;
502 /* See if I overlap any of the segments */ 505 /* See if I overlap any of the segments */
503 for (i = 0; i < image->nr_segments; i++) { 506 for (i = 0; i < image->nr_segments; i++) {
504 unsigned long mstart, mend; 507 unsigned long mstart, mend;
@@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline,
1369 * That function is the entry point for command line parsing and should be 1372 * That function is the entry point for command line parsing and should be
1370 * called from the arch-specific code. 1373 * called from the arch-specific code.
1371 */ 1374 */
1372int __init parse_crashkernel(char *cmdline, 1375static int __init __parse_crashkernel(char *cmdline,
1373 unsigned long long system_ram, 1376 unsigned long long system_ram,
1374 unsigned long long *crash_size, 1377 unsigned long long *crash_size,
1375 unsigned long long *crash_base) 1378 unsigned long long *crash_base,
1379 const char *name)
1376{ 1380{
1377 char *p = cmdline, *ck_cmdline = NULL; 1381 char *p = cmdline, *ck_cmdline = NULL;
1378 char *first_colon, *first_space; 1382 char *first_colon, *first_space;
@@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline,
1382 *crash_base = 0; 1386 *crash_base = 0;
1383 1387
1384 /* find crashkernel and use the last one if there are more */ 1388 /* find crashkernel and use the last one if there are more */
1385 p = strstr(p, "crashkernel="); 1389 p = strstr(p, name);
1386 while (p) { 1390 while (p) {
1387 ck_cmdline = p; 1391 ck_cmdline = p;
1388 p = strstr(p+1, "crashkernel="); 1392 p = strstr(p+1, name);
1389 } 1393 }
1390 1394
1391 if (!ck_cmdline) 1395 if (!ck_cmdline)
1392 return -EINVAL; 1396 return -EINVAL;
1393 1397
1394 ck_cmdline += 12; /* strlen("crashkernel=") */ 1398 ck_cmdline += strlen(name);
1395 1399
1396 /* 1400 /*
1397 * if the commandline contains a ':', then that's the extended 1401 * if the commandline contains a ':', then that's the extended
@@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline,
1409 return 0; 1413 return 0;
1410} 1414}
1411 1415
1416int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram,
1418 unsigned long long *crash_size,
1419 unsigned long long *crash_base)
1420{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel=");
1423}
1424
1425int __init parse_crashkernel_low(char *cmdline,
1426 unsigned long long system_ram,
1427 unsigned long long *crash_size,
1428 unsigned long long *crash_base)
1429{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low=");
1432}
1412 1433
1413static void update_vmcoreinfo_note(void) 1434static void update_vmcoreinfo_note(void)
1414{ 1435{
@@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1490 VMCOREINFO_OFFSET(page, _count); 1511 VMCOREINFO_OFFSET(page, _count);
1491 VMCOREINFO_OFFSET(page, mapping); 1512 VMCOREINFO_OFFSET(page, mapping);
1492 VMCOREINFO_OFFSET(page, lru); 1513 VMCOREINFO_OFFSET(page, lru);
1514 VMCOREINFO_OFFSET(page, _mapcount);
1515 VMCOREINFO_OFFSET(page, private);
1493 VMCOREINFO_OFFSET(pglist_data, node_zones); 1516 VMCOREINFO_OFFSET(pglist_data, node_zones);
1494 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1517 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1495#ifdef CONFIG_FLAT_NODE_MEM_MAP 1518#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void)
1512 VMCOREINFO_NUMBER(PG_lru); 1535 VMCOREINFO_NUMBER(PG_lru);
1513 VMCOREINFO_NUMBER(PG_private); 1536 VMCOREINFO_NUMBER(PG_private);
1514 VMCOREINFO_NUMBER(PG_swapcache); 1537 VMCOREINFO_NUMBER(PG_swapcache);
1538 VMCOREINFO_NUMBER(PG_slab);
1539#ifdef CONFIG_MEMORY_FAILURE
1540 VMCOREINFO_NUMBER(PG_hwpoison);
1541#endif
1542 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1515 1543
1516 arch_crash_save_vmcoreinfo(); 1544 arch_crash_save_vmcoreinfo();
1517 update_vmcoreinfo_note(); 1545 update_vmcoreinfo_note();
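
The kexec change above factors the "crashkernel=" scan into __parse_crashkernel(..., name) so that "crashkernel_low=" can share the same last-occurrence-wins lookup. A small stand-alone C sketch of just that lookup step (the size/offset grammar that follows the key is left out):

/* Find the last occurrence of "name" on a command line and return a
 * pointer to its value, mirroring the __parse_crashkernel() scan above.
 * The size/offset grammar ("128M@0", ranges, ...) is not parsed here. */
#include <stdio.h>
#include <string.h>

static const char *find_last_option(const char *cmdline, const char *name)
{
	const char *p = strstr(cmdline, name);
	const char *last = NULL;

	/* like __parse_crashkernel(): use the last one if repeated */
	while (p) {
		last = p;
		p = strstr(p + 1, name);
	}
	return last ? last + strlen(name) : NULL;
}

static void show(const char *cmdline, const char *name)
{
	const char *val = find_last_option(cmdline, name);

	if (val)
		printf("%-17s -> %.*s\n", name, (int)strcspn(val, " "), val);
	else
		printf("%-17s -> (not set)\n", name);
}

int main(void)
{
	const char *cmdline =
		"root=/dev/sda1 crashkernel=64M crashkernel=128M crashkernel_low=72M";

	show(cmdline, "crashkernel=");		/* 128M: last one wins */
	show(cmdline, "crashkernel_low=");	/* 72M */
	return 0;
}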
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b81d24..000000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
1/*
2 * A generic kernel FIFO implementation
3 *
4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 */
21
22#include <linux/kernel.h>
23#include <linux/export.h>
24#include <linux/slab.h>
25#include <linux/err.h>
26#include <linux/log2.h>
27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
29
30/*
31 * internal helper to calculate the unused elements in a fifo
32 */
33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
34{
35 return (fifo->mask + 1) - (fifo->in - fifo->out);
36}
37
38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
39 size_t esize, gfp_t gfp_mask)
40{
41 /*
42 * round down to the next power of 2, since our 'let the indices
43 * wrap' technique works only in this case.
44 */
45 if (!is_power_of_2(size))
46 size = rounddown_pow_of_two(size);
47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
56 }
57
58 fifo->data = kmalloc(size * esize, gfp_mask);
59
60 if (!fifo->data) {
61 fifo->mask = 0;
62 return -ENOMEM;
63 }
64 fifo->mask = size - 1;
65
66 return 0;
67}
68EXPORT_SYMBOL(__kfifo_alloc);
69
70void __kfifo_free(struct __kfifo *fifo)
71{
72 kfree(fifo->data);
73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
78}
79EXPORT_SYMBOL(__kfifo_free);
80
81int __kfifo_init(struct __kfifo *fifo, void *buffer,
82 unsigned int size, size_t esize)
83{
84 size /= esize;
85
86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
97 }
98 fifo->mask = size - 1;
99
100 return 0;
101}
102EXPORT_SYMBOL(__kfifo_init);
103
104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
105 unsigned int len, unsigned int off)
106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
109 unsigned int l;
110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
121 /*
122 * make sure that the data in the fifo is up to date before
123 * incrementing the fifo->in index counter
124 */
125 smp_wmb();
126}
127
128unsigned int __kfifo_in(struct __kfifo *fifo,
129 const void *buf, unsigned int len)
130{
131 unsigned int l;
132
133 l = kfifo_unused(fifo);
134 if (len > l)
135 len = l;
136
137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 fifo->in += len;
139 return len;
140}
141EXPORT_SYMBOL(__kfifo_in);
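
The kfifo code being deleted from kernel/ here remains a good reference for the power-of-two ring buffer technique: free-running in/out counters that are masked only on access, with every copy split into "up to the end of the buffer" plus "wrap to the start", exactly as kfifo_copy_in()/__kfifo_in() above do. A minimal byte-FIFO sketch of just that copy logic (no locking, no element sizes, names invented):

/* Minimal byte FIFO using the same two ideas as the deleted __kfifo code:
 * power-of-two size with free-running in/out counters, and every copy
 * split into "to the end of the buffer" + "wrap to the start". */
#include <stdio.h>
#include <string.h>

#define FIFO_SIZE 8				/* must be a power of two */

struct byte_fifo {
	unsigned char data[FIFO_SIZE];
	unsigned int in;			/* free-running write counter */
	unsigned int out;			/* free-running read counter */
};

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

static unsigned int fifo_unused(const struct byte_fifo *f)
{
	return FIFO_SIZE - (f->in - f->out);
}

static unsigned int fifo_in(struct byte_fifo *f, const void *src, unsigned int len)
{
	unsigned int off, l;

	len = min_u(len, fifo_unused(f));
	off = f->in & (FIFO_SIZE - 1);			/* mask only on access */
	l = min_u(len, FIFO_SIZE - off);

	memcpy(f->data + off, src, l);			/* up to the end */
	memcpy(f->data, (const char *)src + l, len - l);/* wrap to the start */

	f->in += len;
	return len;
}

static unsigned int fifo_out(struct byte_fifo *f, void *dst, unsigned int len)
{
	unsigned int off, l;

	len = min_u(len, f->in - f->out);
	off = f->out & (FIFO_SIZE - 1);
	l = min_u(len, FIFO_SIZE - off);

	memcpy(dst, f->data + off, l);
	memcpy((char *)dst + l, f->data, len - l);

	f->out += len;
	return len;
}

int main(void)
{
	struct byte_fifo f = { .in = 0, .out = 0 };
	char buf[16] = { 0 };

	fifo_in(&f, "abcdef", 6);
	fifo_out(&f, buf, 4);			/* consume "abcd" */
	fifo_in(&f, "ghij", 4);			/* wraps around the end */
	printf("%u bytes queued\n", f.in - f.out);	/* 6 */
	fifo_out(&f, buf, 6);
	buf[6] = '\0';
	printf("%s\n", buf);			/* "efghij" */
	return 0;
}

The in - out subtraction stays correct even when the unsigned counters wrap, which is why the indices are never reset.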
142
143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
144 unsigned int len, unsigned int off)
145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
148 unsigned int l;
149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
160 /*
161 * make sure that the data is copied before
162 * incrementing the fifo->out index counter
163 */
164 smp_wmb();
165}
166
167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
171
172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
175
176 kfifo_copy_out(fifo, buf, len, fifo->out);
177 return len;
178}
179EXPORT_SYMBOL(__kfifo_out_peek);
180
181unsigned int __kfifo_out(struct __kfifo *fifo,
182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
187}
188EXPORT_SYMBOL(__kfifo_out);
189
190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
191 const void __user *from, unsigned int len, unsigned int off,
192 unsigned int *copied)
193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
196 unsigned int l;
197 unsigned long ret;
198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
215 /*
216 * make sure that the data in the fifo is up to date before
217 * incrementing the fifo->in index counter
218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
224
225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
232
233 if (esize != 1)
234 len /= esize;
235
236 l = kfifo_unused(fifo);
237 if (len > l)
238 len = l;
239
240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
241 if (unlikely(ret)) {
242 len -= ret;
243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
248}
249EXPORT_SYMBOL(__kfifo_from_user);
250
251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
252 unsigned int len, unsigned int off, unsigned int *copied)
253{
254 unsigned int l;
255 unsigned long ret;
256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
275 /*
276 * make sure that the data is copied before
277 * incrementing the fifo->out index counter
278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
284
285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
292
293 if (esize != 1)
294 len /= esize;
295
296 l = fifo->in - fifo->out;
297 if (len > l)
298 len = l;
299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
300 if (unlikely(ret)) {
301 len -= ret;
302 err = -EFAULT;
303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
309
310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
311 int nents, unsigned int len)
312{
313 int n;
314 unsigned int l;
315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
344 }
345 sg_set_page(sgl, page, len, off);
346 return n + 1;
347}
348
349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
350 int nents, unsigned int len, unsigned int off)
351{
352 unsigned int size = fifo->mask + 1;
353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
356
357 off &= fifo->mask;
358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
369}
370
371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
372 struct scatterlist *sgl, int nents, unsigned int len)
373{
374 unsigned int l;
375
376 l = kfifo_unused(fifo);
377 if (len > l)
378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
381}
382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
383
384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
385 struct scatterlist *sgl, int nents, unsigned int len)
386{
387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
394}
395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
396
397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
398{
399 unsigned int max = (1 << (recsize << 3)) - 1;
400
401 if (len > max)
402 return max;
403 return len;
404}
405EXPORT_SYMBOL(__kfifo_max_r);
406
407#define __KFIFO_PEEK(data, out, mask) \
408 ((data)[(out) & (mask)])
409/*
410 * __kfifo_peek_n internal helper function for determining the length of
411 * the next record in the fifo
412 */
413static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
414{
415 unsigned int l;
416 unsigned int mask = fifo->mask;
417 unsigned char *data = fifo->data;
418
419 l = __KFIFO_PEEK(data, fifo->out, mask);
420
421 if (--recsize)
422 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
423
424 return l;
425}
426
427#define __KFIFO_POKE(data, in, mask, val) \
428 ( \
429 (data)[(in) & (mask)] = (unsigned char)(val) \
430 )
431
432/*
433 * __kfifo_poke_n internal helper function for storing the length of
434 * the record into the fifo
435 */
436static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
437{
438 unsigned int mask = fifo->mask;
439 unsigned char *data = fifo->data;
440
441 __KFIFO_POKE(data, fifo->in, mask, n);
442
443 if (recsize > 1)
444 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
445}
446
447unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
448{
449 return __kfifo_peek_n(fifo, recsize);
450}
451EXPORT_SYMBOL(__kfifo_len_r);
452
453unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
454 unsigned int len, size_t recsize)
455{
456 if (len + recsize > kfifo_unused(fifo))
457 return 0;
458
459 __kfifo_poke_n(fifo, len, recsize);
460
461 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
462 fifo->in += len + recsize;
463 return len;
464}
465EXPORT_SYMBOL(__kfifo_in_r);
466
467static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
468 void *buf, unsigned int len, size_t recsize, unsigned int *n)
469{
470 *n = __kfifo_peek_n(fifo, recsize);
471
472 if (len > *n)
473 len = *n;
474
475 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
476 return len;
477}
478
479unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
480 unsigned int len, size_t recsize)
481{
482 unsigned int n;
483
484 if (fifo->in == fifo->out)
485 return 0;
486
487 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
488}
489EXPORT_SYMBOL(__kfifo_out_peek_r);
490
491unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
492 unsigned int len, size_t recsize)
493{
494 unsigned int n;
495
496 if (fifo->in == fifo->out)
497 return 0;
498
499 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
500 fifo->out += n + recsize;
501 return len;
502}
503EXPORT_SYMBOL(__kfifo_out_r);
504
505void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
506{
507 unsigned int n;
508
509 n = __kfifo_peek_n(fifo, recsize);
510 fifo->out += n + recsize;
511}
512EXPORT_SYMBOL(__kfifo_skip_r);
513
514int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
515 unsigned long len, unsigned int *copied, size_t recsize)
516{
517 unsigned long ret;
518
519 len = __kfifo_max_r(len, recsize);
520
521 if (len + recsize > kfifo_unused(fifo)) {
522 *copied = 0;
523 return 0;
524 }
525
526 __kfifo_poke_n(fifo, len, recsize);
527
528 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
529 if (unlikely(ret)) {
530 *copied = 0;
531 return -EFAULT;
532 }
533 fifo->in += len + recsize;
534 return 0;
535}
536EXPORT_SYMBOL(__kfifo_from_user_r);
537
538int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
539 unsigned long len, unsigned int *copied, size_t recsize)
540{
541 unsigned long ret;
542 unsigned int n;
543
544 if (fifo->in == fifo->out) {
545 *copied = 0;
546 return 0;
547 }
548
549 n = __kfifo_peek_n(fifo, recsize);
550 if (len > n)
551 len = n;
552
553 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
554 if (unlikely(ret)) {
555 *copied = 0;
556 return -EFAULT;
557 }
558 fifo->out += n + recsize;
559 return 0;
560}
561EXPORT_SYMBOL(__kfifo_to_user_r);
562
563unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
564 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
565{
566 if (!nents)
567 BUG();
568
569 len = __kfifo_max_r(len, recsize);
570
571 if (len + recsize > kfifo_unused(fifo))
572 return 0;
573
574 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
575}
576EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
577
578void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
579 unsigned int len, size_t recsize)
580{
581 len = __kfifo_max_r(len, recsize);
582 __kfifo_poke_n(fifo, len, recsize);
583 fifo->in += len + recsize;
584}
585EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
586
587unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
588 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
589{
590 if (!nents)
591 BUG();
592
593 len = __kfifo_max_r(len, recsize);
594
595 if (len + recsize > fifo->in - fifo->out)
596 return 0;
597
598 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
599}
600EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
601
602void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
603{
604 unsigned int len;
605
606 len = __kfifo_peek_n(fifo, recsize);
607 fifo->out += len + recsize;
608}
609EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
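A minimal usage sketch of the record-mode API implemented above, assuming only the documented <linux/kfifo.h> wrappers (illustrative only, not part of kfifo.c): kfifo_in() on a record fifo ends up in __kfifo_in_r(), which pokes the 1- or 2-byte length header via __kfifo_poke_n() before copying the payload, and kfifo_peek_len()/kfifo_out() read it back through __kfifo_peek_n().

#include <linux/kernel.h>
#include <linux/kfifo.h>

/* record fifo with a one-byte length header per record (recsize == 1) */
static STRUCT_KFIFO_REC_1(128) rec_fifo;

static void kfifo_rec_sketch(void)
{
	char in[] = "hello";
	char out[16];
	unsigned int n;

	INIT_KFIFO(rec_fifo);

	/* routes to __kfifo_in_r(): length byte first, then the payload */
	kfifo_in(&rec_fifo, in, sizeof(in));

	/* routes to __kfifo_len_r()/__kfifo_peek_n(): stored record length */
	pr_info("next record: %u bytes\n", kfifo_peek_len(&rec_fifo));

	/* routes to __kfifo_out_r(): copies the record, then skips past it */
	n = kfifo_out(&rec_fifo, out, sizeof(out));
	pr_info("read %u bytes\n", n);
}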
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87e8de6..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <trace/events/module.h> 44#include <trace/events/module.h>
@@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)
130#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 131#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
131 static int kmod_loop_msg; 132 static int kmod_loop_msg;
132 133
134 /*
135 * We don't allow synchronous module loading from async. Module
136 * init may invoke async_synchronize_full() which will end up
137 * waiting for this task which already is waiting for the module
138 * loading to complete, leading to a deadlock.
139 */
140 WARN_ON_ONCE(wait && current_is_async());
141
133 va_start(args, fmt); 142 va_start(args, fmt);
134 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 143 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
135 va_end(args); 144 va_end(args);
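The added WARN_ON_ONCE(wait && current_is_async()) targets the pattern sketched below (hypothetical driver code, not from this patch): a synchronous request_module() issued from an async worker deadlocks once the freshly loaded module's init reaches async_synchronize_full(), because that wait includes the async worker that is itself waiting on the module load.

#include <linux/async.h>
#include <linux/kmod.h>

/* hypothetical async callback in some driver */
static void my_async_probe(void *data, async_cookie_t cookie)
{
	/*
	 * current_is_async() is true here, so the synchronous load below now
	 * trips the WARN_ON_ONCE() above instead of deadlocking silently.
	 */
	request_module("hypothetical-module");	/* expands to __request_module(true, ...) */
}

static void my_probe(void)
{
	async_schedule(my_async_probe, NULL);
}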
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..e35be53f6613 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
334struct kprobe __kprobes *get_kprobe(void *addr) 334struct kprobe __kprobes *get_kprobe(void *addr)
335{ 335{
336 struct hlist_head *head; 336 struct hlist_head *head;
337 struct hlist_node *node;
338 struct kprobe *p; 337 struct kprobe *p;
339 338
340 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 339 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
341 hlist_for_each_entry_rcu(p, node, head, hlist) { 340 hlist_for_each_entry_rcu(p, head, hlist) {
342 if (p->addr == addr) 341 if (p->addr == addr)
343 return p; 342 return p;
344 } 343 }
@@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list);
471 470
472static void kprobe_optimizer(struct work_struct *work); 471static void kprobe_optimizer(struct work_struct *work);
473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 472static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
475#define OPTIMIZE_DELAY 5 473#define OPTIMIZE_DELAY 5
476 474
477/* 475/*
@@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
552/* Start optimizer after OPTIMIZE_DELAY passed */ 550/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void) 551static __kprobes void kick_kprobe_optimizer(void)
554{ 552{
555 if (!delayed_work_pending(&optimizing_work)) 553 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557} 554}
558 555
559/* Kprobe jump optimizer */ 556/* Kprobe jump optimizer */
@@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
592 /* Step 5: Kick optimizer again if needed */ 589 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 590 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer(); 591 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598} 592}
599 593
600/* Wait for completing optimization and unoptimization */ 594/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void) 595static __kprobes void wait_for_kprobe_optimizer(void)
602{ 596{
603 if (delayed_work_pending(&optimizing_work)) 597 mutex_lock(&kprobe_mutex);
604 wait_for_completion(&optimizer_comp); 598
599 while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
600 mutex_unlock(&kprobe_mutex);
601
 602 /* this will also make optimizing_work execute immediately */
603 flush_delayed_work(&optimizing_work);
604 /* @optimizing_work might not have been queued yet, relax */
605 cpu_relax();
606
607 mutex_lock(&kprobe_mutex);
608 }
609
610 mutex_unlock(&kprobe_mutex);
605} 611}
606 612
607/* Optimize kprobe if p is ready to be optimized */ 613/* Optimize kprobe if p is ready to be optimized */
@@ -792,7 +798,6 @@ out:
792static void __kprobes optimize_all_kprobes(void) 798static void __kprobes optimize_all_kprobes(void)
793{ 799{
794 struct hlist_head *head; 800 struct hlist_head *head;
795 struct hlist_node *node;
796 struct kprobe *p; 801 struct kprobe *p;
797 unsigned int i; 802 unsigned int i;
798 803
@@ -803,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void)
803 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
804 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
805 head = &kprobe_table[i]; 810 head = &kprobe_table[i];
806 hlist_for_each_entry_rcu(p, node, head, hlist) 811 hlist_for_each_entry_rcu(p, head, hlist)
807 if (!kprobe_disabled(p)) 812 if (!kprobe_disabled(p))
808 optimize_kprobe(p); 813 optimize_kprobe(p);
809 } 814 }
@@ -814,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void)
814static void __kprobes unoptimize_all_kprobes(void) 819static void __kprobes unoptimize_all_kprobes(void)
815{ 820{
816 struct hlist_head *head; 821 struct hlist_head *head;
817 struct hlist_node *node;
818 struct kprobe *p; 822 struct kprobe *p;
819 unsigned int i; 823 unsigned int i;
820 824
@@ -825,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void)
825 kprobes_allow_optimization = false; 829 kprobes_allow_optimization = false;
826 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
827 head = &kprobe_table[i]; 831 head = &kprobe_table[i];
828 hlist_for_each_entry_rcu(p, node, head, hlist) { 832 hlist_for_each_entry_rcu(p, head, hlist) {
829 if (!kprobe_disabled(p)) 833 if (!kprobe_disabled(p))
830 unoptimize_kprobe(p, false); 834 unoptimize_kprobe(p, false);
831 } 835 }
@@ -919,7 +923,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
919} 923}
920#endif /* CONFIG_OPTPROBES */ 924#endif /* CONFIG_OPTPROBES */
921 925
922#ifdef KPROBES_CAN_USE_FTRACE 926#ifdef CONFIG_KPROBES_ON_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 927static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler, 928 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS, 929 .flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +968,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
964 (unsigned long)p->addr, 1, 0); 968 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); 969 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966} 970}
967#else /* !KPROBES_CAN_USE_FTRACE */ 971#else /* !CONFIG_KPROBES_ON_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p) 972#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0) 973#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0) 974#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1141,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1141{ 1145{
1142 struct kretprobe_instance *ri; 1146 struct kretprobe_instance *ri;
1143 struct hlist_head *head, empty_rp; 1147 struct hlist_head *head, empty_rp;
1144 struct hlist_node *node, *tmp; 1148 struct hlist_node *tmp;
1145 unsigned long hash, flags = 0; 1149 unsigned long hash, flags = 0;
1146 1150
1147 if (unlikely(!kprobes_initialized)) 1151 if (unlikely(!kprobes_initialized))
@@ -1152,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1152 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1156 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1153 head = &kretprobe_inst_table[hash]; 1157 head = &kretprobe_inst_table[hash];
1154 kretprobe_table_lock(hash, &flags); 1158 kretprobe_table_lock(hash, &flags);
1155 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 1159 hlist_for_each_entry_safe(ri, tmp, head, hlist) {
1156 if (ri->task == tk) 1160 if (ri->task == tk)
1157 recycle_rp_inst(ri, &empty_rp); 1161 recycle_rp_inst(ri, &empty_rp);
1158 } 1162 }
1159 kretprobe_table_unlock(hash, &flags); 1163 kretprobe_table_unlock(hash, &flags);
1160 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1164 hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
1161 hlist_del(&ri->hlist); 1165 hlist_del(&ri->hlist);
1162 kfree(ri); 1166 kfree(ri);
1163 } 1167 }
@@ -1166,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1166static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1167{ 1171{
1168 struct kretprobe_instance *ri; 1172 struct kretprobe_instance *ri;
1169 struct hlist_node *pos, *next; 1173 struct hlist_node *next;
1170 1174
1171 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { 1175 hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
1172 hlist_del(&ri->hlist); 1176 hlist_del(&ri->hlist);
1173 kfree(ri); 1177 kfree(ri);
1174 } 1178 }
@@ -1178,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1178{ 1182{
1179 unsigned long flags, hash; 1183 unsigned long flags, hash;
1180 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
1181 struct hlist_node *pos, *next; 1185 struct hlist_node *next;
1182 struct hlist_head *head; 1186 struct hlist_head *head;
1183 1187
1184 /* No race here */ 1188 /* No race here */
1185 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { 1189 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
1186 kretprobe_table_lock(hash, &flags); 1190 kretprobe_table_lock(hash, &flags);
1187 head = &kretprobe_inst_table[hash]; 1191 head = &kretprobe_inst_table[hash];
1188 hlist_for_each_entry_safe(ri, pos, next, head, hlist) { 1192 hlist_for_each_entry_safe(ri, next, head, hlist) {
1189 if (ri->rp == rp) 1193 if (ri->rp == rp)
1190 ri->rp = NULL; 1194 ri->rp = NULL;
1191 } 1195 }
@@ -1414,12 +1418,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1414 */ 1418 */
1415 ftrace_addr = ftrace_location((unsigned long)p->addr); 1419 ftrace_addr = ftrace_location((unsigned long)p->addr);
1416 if (ftrace_addr) { 1420 if (ftrace_addr) {
1417#ifdef KPROBES_CAN_USE_FTRACE 1421#ifdef CONFIG_KPROBES_ON_FTRACE
1418 /* Given address is not on the instruction boundary */ 1422 /* Given address is not on the instruction boundary */
1419 if ((unsigned long)p->addr != ftrace_addr) 1423 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ; 1424 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE; 1425 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */ 1426#else /* !CONFIG_KPROBES_ON_FTRACE */
1423 return -EINVAL; 1427 return -EINVAL;
1424#endif 1428#endif
1425 } 1429 }
@@ -2021,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2021{ 2025{
2022 struct module *mod = data; 2026 struct module *mod = data;
2023 struct hlist_head *head; 2027 struct hlist_head *head;
2024 struct hlist_node *node;
2025 struct kprobe *p; 2028 struct kprobe *p;
2026 unsigned int i; 2029 unsigned int i;
2027 int checkcore = (val == MODULE_STATE_GOING); 2030 int checkcore = (val == MODULE_STATE_GOING);
@@ -2038,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2038 mutex_lock(&kprobe_mutex); 2041 mutex_lock(&kprobe_mutex);
2039 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2042 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2040 head = &kprobe_table[i]; 2043 head = &kprobe_table[i];
2041 hlist_for_each_entry_rcu(p, node, head, hlist) 2044 hlist_for_each_entry_rcu(p, head, hlist)
2042 if (within_module_init((unsigned long)p->addr, mod) || 2045 if (within_module_init((unsigned long)p->addr, mod) ||
2043 (checkcore && 2046 (checkcore &&
2044 within_module_core((unsigned long)p->addr, mod))) { 2047 within_module_core((unsigned long)p->addr, mod))) {
@@ -2185,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
2185static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2186{ 2189{
2187 struct hlist_head *head; 2190 struct hlist_head *head;
2188 struct hlist_node *node;
2189 struct kprobe *p, *kp; 2191 struct kprobe *p, *kp;
2190 const char *sym = NULL; 2192 const char *sym = NULL;
2191 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
@@ -2194,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2194 2196
2195 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2196 preempt_disable(); 2198 preempt_disable();
2197 hlist_for_each_entry_rcu(p, node, head, hlist) { 2199 hlist_for_each_entry_rcu(p, head, hlist) {
2198 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 2200 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
2199 &offset, &modname, namebuf); 2201 &offset, &modname, namebuf);
2200 if (kprobe_aggrprobe(p)) { 2202 if (kprobe_aggrprobe(p)) {
@@ -2229,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = {
2229static void __kprobes arm_all_kprobes(void) 2231static void __kprobes arm_all_kprobes(void)
2230{ 2232{
2231 struct hlist_head *head; 2233 struct hlist_head *head;
2232 struct hlist_node *node;
2233 struct kprobe *p; 2234 struct kprobe *p;
2234 unsigned int i; 2235 unsigned int i;
2235 2236
@@ -2242,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void)
2242 /* Arming kprobes doesn't optimize kprobe itself */ 2243 /* Arming kprobes doesn't optimize kprobe itself */
2243 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2244 head = &kprobe_table[i]; 2245 head = &kprobe_table[i];
2245 hlist_for_each_entry_rcu(p, node, head, hlist) 2246 hlist_for_each_entry_rcu(p, head, hlist)
2246 if (!kprobe_disabled(p)) 2247 if (!kprobe_disabled(p))
2247 arm_kprobe(p); 2248 arm_kprobe(p);
2248 } 2249 }
@@ -2258,7 +2259,6 @@ already_enabled:
2258static void __kprobes disarm_all_kprobes(void) 2259static void __kprobes disarm_all_kprobes(void)
2259{ 2260{
2260 struct hlist_head *head; 2261 struct hlist_head *head;
2261 struct hlist_node *node;
2262 struct kprobe *p; 2262 struct kprobe *p;
2263 unsigned int i; 2263 unsigned int i;
2264 2264
@@ -2275,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void)
2275 2275
2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2277 head = &kprobe_table[i]; 2277 head = &kprobe_table[i];
2278 hlist_for_each_entry_rcu(p, node, head, hlist) { 2278 hlist_for_each_entry_rcu(p, head, hlist) {
2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2280 disarm_kprobe(p, false); 2280 disarm_kprobe(p, false);
2281 } 2281 }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350d..259db207b5d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3190#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3192 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH);
3194 printk("turning off the locking correctness validator.\n"); 3195 printk("turning off the locking correctness validator.\n");
3196
3197 lockdep_print_held_locks(current);
3198 debug_show_all_locks();
3195 dump_stack(); 3199 dump_stack();
3200
3196 return 0; 3201 return 0;
3197 } 3202 }
3198 3203
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3203} 3208}
3204 3209
3205static int 3210static int
3206print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, 3211print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3207 unsigned long ip) 3212 unsigned long ip)
3208{ 3213{
3209 if (!debug_locks_off()) 3214 if (!debug_locks_off())
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3246 return 0; 3251 return 0;
3247 3252
3248 if (curr->lockdep_depth <= 0) 3253 if (curr->lockdep_depth <= 0)
3249 return print_unlock_inbalance_bug(curr, lock, ip); 3254 return print_unlock_imbalance_bug(curr, lock, ip);
3250 3255
3251 return 1; 3256 return 1;
3252} 3257}
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3317 goto found_it; 3322 goto found_it;
3318 prev_hlock = hlock; 3323 prev_hlock = hlock;
3319 } 3324 }
3320 return print_unlock_inbalance_bug(curr, lock, ip); 3325 return print_unlock_imbalance_bug(curr, lock, ip);
3321 3326
3322found_it: 3327found_it:
3323 lockdep_init_map(lock, name, key, 0); 3328 lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,
3384 goto found_it; 3389 goto found_it;
3385 prev_hlock = hlock; 3390 prev_hlock = hlock;
3386 } 3391 }
3387 return print_unlock_inbalance_bug(curr, lock, ip); 3392 return print_unlock_imbalance_bug(curr, lock, ip);
3388 3393
3389found_it: 3394found_it:
3390 if (hlock->instance == lock) 3395 if (hlock->instance == lock)
@@ -4083,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4083} 4088}
4084EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4089EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4085 4090
4086static void print_held_locks_bug(struct task_struct *curr) 4091static void print_held_locks_bug(void)
4087{ 4092{
4088 if (!debug_locks_off()) 4093 if (!debug_locks_off())
4089 return; 4094 return;
@@ -4092,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr)
4092 4097
4093 printk("\n"); 4098 printk("\n");
4094 printk("=====================================\n"); 4099 printk("=====================================\n");
4095 printk("[ BUG: lock held at task exit time! ]\n"); 4100 printk("[ BUG: %s/%d still has locks held! ]\n",
4101 current->comm, task_pid_nr(current));
4096 print_kernel_ident(); 4102 print_kernel_ident();
4097 printk("-------------------------------------\n"); 4103 printk("-------------------------------------\n");
4098 printk("%s/%d is exiting with locks still held!\n", 4104 lockdep_print_held_locks(current);
4099 curr->comm, task_pid_nr(curr));
4100 lockdep_print_held_locks(curr);
4101
4102 printk("\nstack backtrace:\n"); 4105 printk("\nstack backtrace:\n");
4103 dump_stack(); 4106 dump_stack();
4104} 4107}
4105 4108
4106void debug_check_no_locks_held(struct task_struct *task) 4109void debug_check_no_locks_held(void)
4107{ 4110{
4108 if (unlikely(task->lockdep_depth > 0)) 4111 if (unlikely(current->lockdep_depth > 0))
4109 print_held_locks_bug(task); 4112 print_held_locks_bug();
4110} 4113}
4114EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4111 4115
4112void debug_show_all_locks(void) 4116void debug_show_all_locks(void)
4113{ 4117{
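Since debug_check_no_locks_held() is now exported and implicitly checks current, a call site outside lockdep.c reduces to the form below (illustrative, not from this patch):

	/* e.g. on a task-exit style path, in the context of the exiting task */
	debug_check_no_locks_held();	/* was: debug_check_no_locks_held(tsk) */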
diff --git a/kernel/module.c b/kernel/module.c
index 250092c1d57d..0925c9a71975 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -188,6 +188,7 @@ struct load_info {
188 ongoing or failed initialization etc. */ 188 ongoing or failed initialization etc. */
189static inline int strong_try_module_get(struct module *mod) 189static inline int strong_try_module_get(struct module *mod)
190{ 190{
191 BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
191 if (mod && mod->state == MODULE_STATE_COMING) 192 if (mod && mod->state == MODULE_STATE_COMING)
192 return -EBUSY; 193 return -EBUSY;
193 if (try_module_get(mod)) 194 if (try_module_get(mod))
@@ -196,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
196 return -ENOENT; 197 return -ENOENT;
197} 198}
198 199
199static inline void add_taint_module(struct module *mod, unsigned flag) 200static inline void add_taint_module(struct module *mod, unsigned flag,
201 enum lockdep_ok lockdep_ok)
200{ 202{
201 add_taint(flag); 203 add_taint(flag, lockdep_ok);
202 mod->taints |= (1U << flag); 204 mod->taints |= (1U << flag);
203} 205}
204 206
@@ -343,6 +345,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
343#endif 345#endif
344 }; 346 };
345 347
348 if (mod->state == MODULE_STATE_UNFORMED)
349 continue;
350
346 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) 351 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
347 return true; 352 return true;
348 } 353 }
@@ -450,16 +455,24 @@ const struct kernel_symbol *find_symbol(const char *name,
450EXPORT_SYMBOL_GPL(find_symbol); 455EXPORT_SYMBOL_GPL(find_symbol);
451 456
452/* Search for module by name: must hold module_mutex. */ 457/* Search for module by name: must hold module_mutex. */
453struct module *find_module(const char *name) 458static struct module *find_module_all(const char *name,
459 bool even_unformed)
454{ 460{
455 struct module *mod; 461 struct module *mod;
456 462
457 list_for_each_entry(mod, &modules, list) { 463 list_for_each_entry(mod, &modules, list) {
464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
465 continue;
458 if (strcmp(mod->name, name) == 0) 466 if (strcmp(mod->name, name) == 0)
459 return mod; 467 return mod;
460 } 468 }
461 return NULL; 469 return NULL;
462} 470}
471
472struct module *find_module(const char *name)
473{
474 return find_module_all(name, false);
475}
463EXPORT_SYMBOL_GPL(find_module); 476EXPORT_SYMBOL_GPL(find_module);
464 477
465#ifdef CONFIG_SMP 478#ifdef CONFIG_SMP
@@ -525,6 +538,8 @@ bool is_module_percpu_address(unsigned long addr)
525 preempt_disable(); 538 preempt_disable();
526 539
527 list_for_each_entry_rcu(mod, &modules, list) { 540 list_for_each_entry_rcu(mod, &modules, list) {
541 if (mod->state == MODULE_STATE_UNFORMED)
542 continue;
528 if (!mod->percpu_size) 543 if (!mod->percpu_size)
529 continue; 544 continue;
530 for_each_possible_cpu(cpu) { 545 for_each_possible_cpu(cpu) {
@@ -713,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
713{ 728{
714 int ret = (flags & O_TRUNC); 729 int ret = (flags & O_TRUNC);
715 if (ret) 730 if (ret)
716 add_taint(TAINT_FORCED_RMMOD); 731 add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
717 return ret; 732 return ret;
718} 733}
719#else 734#else
@@ -1048,6 +1063,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
1048 case MODULE_STATE_GOING: 1063 case MODULE_STATE_GOING:
1049 state = "going"; 1064 state = "going";
1050 break; 1065 break;
1066 default:
1067 BUG();
1051 } 1068 }
1052 return sprintf(buffer, "%s\n", state); 1069 return sprintf(buffer, "%s\n", state);
1053} 1070}
@@ -1122,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1122 if (!test_taint(TAINT_FORCED_MODULE)) 1139 if (!test_taint(TAINT_FORCED_MODULE))
1123 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1140 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1124 mod->name, reason); 1141 mod->name, reason);
1125 add_taint_module(mod, TAINT_FORCED_MODULE); 1142 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1126 return 0; 1143 return 0;
1127#else 1144#else
1128 return -ENOEXEC; 1145 return -ENOEXEC;
@@ -1786,6 +1803,8 @@ void set_all_modules_text_rw(void)
1786 1803
1787 mutex_lock(&module_mutex); 1804 mutex_lock(&module_mutex);
1788 list_for_each_entry_rcu(mod, &modules, list) { 1805 list_for_each_entry_rcu(mod, &modules, list) {
1806 if (mod->state == MODULE_STATE_UNFORMED)
1807 continue;
1789 if ((mod->module_core) && (mod->core_text_size)) { 1808 if ((mod->module_core) && (mod->core_text_size)) {
1790 set_page_attributes(mod->module_core, 1809 set_page_attributes(mod->module_core,
1791 mod->module_core + mod->core_text_size, 1810 mod->module_core + mod->core_text_size,
@@ -1807,6 +1826,8 @@ void set_all_modules_text_ro(void)
1807 1826
1808 mutex_lock(&module_mutex); 1827 mutex_lock(&module_mutex);
1809 list_for_each_entry_rcu(mod, &modules, list) { 1828 list_for_each_entry_rcu(mod, &modules, list) {
1829 if (mod->state == MODULE_STATE_UNFORMED)
1830 continue;
1810 if ((mod->module_core) && (mod->core_text_size)) { 1831 if ((mod->module_core) && (mod->core_text_size)) {
1811 set_page_attributes(mod->module_core, 1832 set_page_attributes(mod->module_core,
1812 mod->module_core + mod->core_text_size, 1833 mod->module_core + mod->core_text_size,
@@ -2127,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)
2127 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2148 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2128 printk(KERN_WARNING "%s: module license '%s' taints " 2149 printk(KERN_WARNING "%s: module license '%s' taints "
2129 "kernel.\n", mod->name, license); 2150 "kernel.\n", mod->name, license);
2130 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2151 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2152 LOCKDEP_NOW_UNRELIABLE);
2131 } 2153 }
2132} 2154}
2133 2155
@@ -2519,7 +2541,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2519 if (err) 2541 if (err)
2520 goto out; 2542 goto out;
2521 2543
2522 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); 2544 err = vfs_getattr(&file->f_path, &stat);
2523 if (err) 2545 if (err)
2524 goto out; 2546 goto out;
2525 2547
@@ -2527,6 +2549,13 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2527 err = -EFBIG; 2549 err = -EFBIG;
2528 goto out; 2550 goto out;
2529 } 2551 }
2552
2553 /* Don't hand 0 to vmalloc, it whines. */
2554 if (stat.size == 0) {
2555 err = -EINVAL;
2556 goto out;
2557 }
2558
2530 info->hdr = vmalloc(stat.size); 2559 info->hdr = vmalloc(stat.size);
2531 if (!info->hdr) { 2560 if (!info->hdr) {
2532 err = -ENOMEM; 2561 err = -ENOMEM;
@@ -2673,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2673 } 2702 }
2674 2703
2675 if (!get_modinfo(info, "intree")) 2704 if (!get_modinfo(info, "intree"))
2676 add_taint_module(mod, TAINT_OOT_MODULE); 2705 add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
2677 2706
2678 if (get_modinfo(info, "staging")) { 2707 if (get_modinfo(info, "staging")) {
2679 add_taint_module(mod, TAINT_CRAP); 2708 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2680 printk(KERN_WARNING "%s: module is from the staging directory," 2709 printk(KERN_WARNING "%s: module is from the staging directory,"
2681 " the quality is unknown, you have been warned.\n", 2710 " the quality is unknown, you have been warned.\n",
2682 mod->name); 2711 mod->name);
@@ -2842,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)
2842 * using GPL-only symbols it needs. 2871 * using GPL-only symbols it needs.
2843 */ 2872 */
2844 if (strcmp(mod->name, "ndiswrapper") == 0) 2873 if (strcmp(mod->name, "ndiswrapper") == 0)
2845 add_taint(TAINT_PROPRIETARY_MODULE); 2874 add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
2846 2875
2847 /* driverloader was caught wrongly pretending to be under GPL */ 2876 /* driverloader was caught wrongly pretending to be under GPL */
2848 if (strcmp(mod->name, "driverloader") == 0) 2877 if (strcmp(mod->name, "driverloader") == 0)
2849 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2878 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2879 LOCKDEP_NOW_UNRELIABLE);
2850 2880
2851 /* lve claims to be GPL but upstream won't provide source */ 2881 /* lve claims to be GPL but upstream won't provide source */
2852 if (strcmp(mod->name, "lve") == 0) 2882 if (strcmp(mod->name, "lve") == 0)
2853 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2883 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2884 LOCKDEP_NOW_UNRELIABLE);
2854 2885
2855#ifdef CONFIG_MODVERSIONS 2886#ifdef CONFIG_MODVERSIONS
2856 if ((mod->num_syms && !mod->crcs) 2887 if ((mod->num_syms && !mod->crcs)
@@ -2990,8 +3021,9 @@ static bool finished_loading(const char *name)
2990 bool ret; 3021 bool ret;
2991 3022
2992 mutex_lock(&module_mutex); 3023 mutex_lock(&module_mutex);
2993 mod = find_module(name); 3024 mod = find_module_all(name, true);
2994 ret = !mod || mod->state != MODULE_STATE_COMING; 3025 ret = !mod || mod->state == MODULE_STATE_LIVE
3026 || mod->state == MODULE_STATE_GOING;
2995 mutex_unlock(&module_mutex); 3027 mutex_unlock(&module_mutex);
2996 3028
2997 return ret; 3029 return ret;
@@ -3013,6 +3045,12 @@ static int do_init_module(struct module *mod)
3013{ 3045{
3014 int ret = 0; 3046 int ret = 0;
3015 3047
3048 /*
3049 * We want to find out whether @mod uses async during init. Clear
3050 * PF_USED_ASYNC. async_schedule*() will set it.
3051 */
3052 current->flags &= ~PF_USED_ASYNC;
3053
3016 blocking_notifier_call_chain(&module_notify_list, 3054 blocking_notifier_call_chain(&module_notify_list,
3017 MODULE_STATE_COMING, mod); 3055 MODULE_STATE_COMING, mod);
3018 3056
@@ -3058,8 +3096,25 @@ static int do_init_module(struct module *mod)
3058 blocking_notifier_call_chain(&module_notify_list, 3096 blocking_notifier_call_chain(&module_notify_list,
3059 MODULE_STATE_LIVE, mod); 3097 MODULE_STATE_LIVE, mod);
3060 3098
3061 /* We need to finish all async code before the module init sequence is done */ 3099 /*
3062 async_synchronize_full(); 3100 * We need to finish all async code before the module init sequence
 3101 * is done. This has the potential to deadlock. For example, a newly
 3102 * detected block device can trigger a request_module() of the
 3103 * default iosched from an async probing task. Once the userland helper
3104 * reaches here, async_synchronize_full() will wait on the async
3105 * task waiting on request_module() and deadlock.
3106 *
 3107 * This deadlock is avoided by performing async_synchronize_full()
3108 * iff module init queued any async jobs. This isn't a full
3109 * solution as it will deadlock the same if module loading from
3110 * async jobs nests more than once; however, due to the various
3111 * constraints, this hack seems to be the best option for now.
3112 * Please refer to the following thread for details.
3113 *
3114 * http://thread.gmane.org/gmane.linux.kernel/1420814
3115 */
3116 if (current->flags & PF_USED_ASYNC)
3117 async_synchronize_full();
3063 3118
3064 mutex_lock(&module_mutex); 3119 mutex_lock(&module_mutex);
3065 /* Drop initial reference. */ 3120 /* Drop initial reference. */
@@ -3090,12 +3145,72 @@ static int may_init_module(void)
3090 return 0; 3145 return 0;
3091} 3146}
3092 3147
3148/*
3149 * We try to place it in the list now to make sure it's unique before
 3150 * we dedicate too many resources; in particular, this avoids temporary
 3151 * percpu memory exhaustion.
3152 */
3153static int add_unformed_module(struct module *mod)
3154{
3155 int err;
3156 struct module *old;
3157
3158 mod->state = MODULE_STATE_UNFORMED;
3159
3160again:
3161 mutex_lock(&module_mutex);
3162 if ((old = find_module_all(mod->name, true)) != NULL) {
3163 if (old->state == MODULE_STATE_COMING
3164 || old->state == MODULE_STATE_UNFORMED) {
3165 /* Wait in case it fails to load. */
3166 mutex_unlock(&module_mutex);
3167 err = wait_event_interruptible(module_wq,
3168 finished_loading(mod->name));
3169 if (err)
3170 goto out_unlocked;
3171 goto again;
3172 }
3173 err = -EEXIST;
3174 goto out;
3175 }
3176 list_add_rcu(&mod->list, &modules);
3177 err = 0;
3178
3179out:
3180 mutex_unlock(&module_mutex);
3181out_unlocked:
3182 return err;
3183}
3184
3185static int complete_formation(struct module *mod, struct load_info *info)
3186{
3187 int err;
3188
3189 mutex_lock(&module_mutex);
3190
3191 /* Find duplicate symbols (must be called under lock). */
3192 err = verify_export_symbols(mod);
3193 if (err < 0)
3194 goto out;
3195
3196 /* This relies on module_mutex for list integrity. */
3197 module_bug_finalize(info->hdr, info->sechdrs, mod);
3198
3199 /* Mark state as coming so strong_try_module_get() ignores us,
3200 * but kallsyms etc. can see us. */
3201 mod->state = MODULE_STATE_COMING;
3202
3203out:
3204 mutex_unlock(&module_mutex);
3205 return err;
3206}
3207
3093/* Allocate and load the module: note that size of section 0 is always 3208/* Allocate and load the module: note that size of section 0 is always
3094 zero, and we rely on this for optional sections. */ 3209 zero, and we rely on this for optional sections. */
3095static int load_module(struct load_info *info, const char __user *uargs, 3210static int load_module(struct load_info *info, const char __user *uargs,
3096 int flags) 3211 int flags)
3097{ 3212{
3098 struct module *mod, *old; 3213 struct module *mod;
3099 long err; 3214 long err;
3100 3215
3101 err = module_sig_check(info); 3216 err = module_sig_check(info);
@@ -3113,16 +3228,26 @@ static int load_module(struct load_info *info, const char __user *uargs,
3113 goto free_copy; 3228 goto free_copy;
3114 } 3229 }
3115 3230
3231 /* Reserve our place in the list. */
3232 err = add_unformed_module(mod);
3233 if (err)
3234 goto free_module;
3235
3116#ifdef CONFIG_MODULE_SIG 3236#ifdef CONFIG_MODULE_SIG
3117 mod->sig_ok = info->sig_ok; 3237 mod->sig_ok = info->sig_ok;
3118 if (!mod->sig_ok) 3238 if (!mod->sig_ok) {
3119 add_taint_module(mod, TAINT_FORCED_MODULE); 3239 printk_once(KERN_NOTICE
3240 "%s: module verification failed: signature and/or"
3241 " required key missing - tainting kernel\n",
3242 mod->name);
3243 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3244 }
3120#endif 3245#endif
3121 3246
3122 /* Now module is in final location, initialize linked lists, etc. */ 3247 /* Now module is in final location, initialize linked lists, etc. */
3123 err = module_unload_init(mod); 3248 err = module_unload_init(mod);
3124 if (err) 3249 if (err)
3125 goto free_module; 3250 goto unlink_mod;
3126 3251
3127 /* Now we've got everything in the final locations, we can 3252 /* Now we've got everything in the final locations, we can
3128 * find optional sections. */ 3253 * find optional sections. */
@@ -3157,54 +3282,23 @@ static int load_module(struct load_info *info, const char __user *uargs,
3157 goto free_arch_cleanup; 3282 goto free_arch_cleanup;
3158 } 3283 }
3159 3284
3160 /* Mark state as coming so strong_try_module_get() ignores us. */
3161 mod->state = MODULE_STATE_COMING;
3162
3163 /* Now sew it into the lists so we can get lockdep and oops
3164 * info during argument parsing. No one should access us, since
3165 * strong_try_module_get() will fail.
3166 * lockdep/oops can run asynchronous, so use the RCU list insertion
3167 * function to insert in a way safe to concurrent readers.
3168 * The mutex protects against concurrent writers.
3169 */
3170again:
3171 mutex_lock(&module_mutex);
3172 if ((old = find_module(mod->name)) != NULL) {
3173 if (old->state == MODULE_STATE_COMING) {
3174 /* Wait in case it fails to load. */
3175 mutex_unlock(&module_mutex);
3176 err = wait_event_interruptible(module_wq,
3177 finished_loading(mod->name));
3178 if (err)
3179 goto free_arch_cleanup;
3180 goto again;
3181 }
3182 err = -EEXIST;
3183 goto unlock;
3184 }
3185
3186 /* This has to be done once we're sure module name is unique. */
3187 dynamic_debug_setup(info->debug, info->num_debug); 3285 dynamic_debug_setup(info->debug, info->num_debug);
3188 3286
3189 /* Find duplicate symbols */ 3287 /* Finally it's fully formed, ready to start executing. */
3190 err = verify_export_symbols(mod); 3288 err = complete_formation(mod, info);
3191 if (err < 0) 3289 if (err)
3192 goto ddebug; 3290 goto ddebug_cleanup;
3193
3194 module_bug_finalize(info->hdr, info->sechdrs, mod);
3195 list_add_rcu(&mod->list, &modules);
3196 mutex_unlock(&module_mutex);
3197 3291
3198 /* Module is ready to execute: parsing args may do that. */ 3292 /* Module is ready to execute: parsing args may do that. */
3199 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3293 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3200 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3294 -32768, 32767, &ddebug_dyndbg_module_param_cb);
3201 if (err < 0) 3295 if (err < 0)
3202 goto unlink; 3296 goto bug_cleanup;
3203 3297
 3204 /* Link in to sysfs. */ 3298 /* Link in to sysfs. */
3205 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); 3299 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
3206 if (err < 0) 3300 if (err < 0)
3207 goto unlink; 3301 goto bug_cleanup;
3208 3302
3209 /* Get rid of temporary copy. */ 3303 /* Get rid of temporary copy. */
3210 free_copy(info); 3304 free_copy(info);
@@ -3214,16 +3308,13 @@ again:
3214 3308
3215 return do_init_module(mod); 3309 return do_init_module(mod);
3216 3310
3217 unlink: 3311 bug_cleanup:
3312 /* module_bug_cleanup needs module_mutex protection */
3218 mutex_lock(&module_mutex); 3313 mutex_lock(&module_mutex);
3219 /* Unlink carefully: kallsyms could be walking list. */
3220 list_del_rcu(&mod->list);
3221 module_bug_cleanup(mod); 3314 module_bug_cleanup(mod);
3222 wake_up_all(&module_wq);
3223 ddebug:
3224 dynamic_debug_remove(info->debug);
3225 unlock:
3226 mutex_unlock(&module_mutex); 3315 mutex_unlock(&module_mutex);
3316 ddebug_cleanup:
3317 dynamic_debug_remove(info->debug);
3227 synchronize_sched(); 3318 synchronize_sched();
3228 kfree(mod->args); 3319 kfree(mod->args);
3229 free_arch_cleanup: 3320 free_arch_cleanup:
@@ -3232,6 +3323,12 @@ again:
3232 free_modinfo(mod); 3323 free_modinfo(mod);
3233 free_unload: 3324 free_unload:
3234 module_unload_free(mod); 3325 module_unload_free(mod);
3326 unlink_mod:
3327 mutex_lock(&module_mutex);
3328 /* Unlink carefully: kallsyms could be walking list. */
3329 list_del_rcu(&mod->list);
3330 wake_up_all(&module_wq);
3331 mutex_unlock(&module_mutex);
3235 free_module: 3332 free_module:
3236 module_deallocate(mod, info); 3333 module_deallocate(mod, info);
3237 free_copy: 3334 free_copy:
@@ -3354,6 +3451,8 @@ const char *module_address_lookup(unsigned long addr,
3354 3451
3355 preempt_disable(); 3452 preempt_disable();
3356 list_for_each_entry_rcu(mod, &modules, list) { 3453 list_for_each_entry_rcu(mod, &modules, list) {
3454 if (mod->state == MODULE_STATE_UNFORMED)
3455 continue;
3357 if (within_module_init(addr, mod) || 3456 if (within_module_init(addr, mod) ||
3358 within_module_core(addr, mod)) { 3457 within_module_core(addr, mod)) {
3359 if (modname) 3458 if (modname)
@@ -3377,6 +3476,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
3377 3476
3378 preempt_disable(); 3477 preempt_disable();
3379 list_for_each_entry_rcu(mod, &modules, list) { 3478 list_for_each_entry_rcu(mod, &modules, list) {
3479 if (mod->state == MODULE_STATE_UNFORMED)
3480 continue;
3380 if (within_module_init(addr, mod) || 3481 if (within_module_init(addr, mod) ||
3381 within_module_core(addr, mod)) { 3482 within_module_core(addr, mod)) {
3382 const char *sym; 3483 const char *sym;
@@ -3401,6 +3502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
3401 3502
3402 preempt_disable(); 3503 preempt_disable();
3403 list_for_each_entry_rcu(mod, &modules, list) { 3504 list_for_each_entry_rcu(mod, &modules, list) {
3505 if (mod->state == MODULE_STATE_UNFORMED)
3506 continue;
3404 if (within_module_init(addr, mod) || 3507 if (within_module_init(addr, mod) ||
3405 within_module_core(addr, mod)) { 3508 within_module_core(addr, mod)) {
3406 const char *sym; 3509 const char *sym;
@@ -3428,6 +3531,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
3428 3531
3429 preempt_disable(); 3532 preempt_disable();
3430 list_for_each_entry_rcu(mod, &modules, list) { 3533 list_for_each_entry_rcu(mod, &modules, list) {
3534 if (mod->state == MODULE_STATE_UNFORMED)
3535 continue;
3431 if (symnum < mod->num_symtab) { 3536 if (symnum < mod->num_symtab) {
3432 *value = mod->symtab[symnum].st_value; 3537 *value = mod->symtab[symnum].st_value;
3433 *type = mod->symtab[symnum].st_info; 3538 *type = mod->symtab[symnum].st_info;
@@ -3470,9 +3575,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
3470 ret = mod_find_symname(mod, colon+1); 3575 ret = mod_find_symname(mod, colon+1);
3471 *colon = ':'; 3576 *colon = ':';
3472 } else { 3577 } else {
3473 list_for_each_entry_rcu(mod, &modules, list) 3578 list_for_each_entry_rcu(mod, &modules, list) {
3579 if (mod->state == MODULE_STATE_UNFORMED)
3580 continue;
3474 if ((ret = mod_find_symname(mod, name)) != 0) 3581 if ((ret = mod_find_symname(mod, name)) != 0)
3475 break; 3582 break;
3583 }
3476 } 3584 }
3477 preempt_enable(); 3585 preempt_enable();
3478 return ret; 3586 return ret;
@@ -3487,6 +3595,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
3487 int ret; 3595 int ret;
3488 3596
3489 list_for_each_entry(mod, &modules, list) { 3597 list_for_each_entry(mod, &modules, list) {
3598 if (mod->state == MODULE_STATE_UNFORMED)
3599 continue;
3490 for (i = 0; i < mod->num_symtab; i++) { 3600 for (i = 0; i < mod->num_symtab; i++) {
3491 ret = fn(data, mod->strtab + mod->symtab[i].st_name, 3601 ret = fn(data, mod->strtab + mod->symtab[i].st_name,
3492 mod, mod->symtab[i].st_value); 3602 mod, mod->symtab[i].st_value);
@@ -3502,6 +3612,7 @@ static char *module_flags(struct module *mod, char *buf)
3502{ 3612{
3503 int bx = 0; 3613 int bx = 0;
3504 3614
3615 BUG_ON(mod->state == MODULE_STATE_UNFORMED);
3505 if (mod->taints || 3616 if (mod->taints ||
3506 mod->state == MODULE_STATE_GOING || 3617 mod->state == MODULE_STATE_GOING ||
3507 mod->state == MODULE_STATE_COMING) { 3618 mod->state == MODULE_STATE_COMING) {
@@ -3543,6 +3654,10 @@ static int m_show(struct seq_file *m, void *p)
3543 struct module *mod = list_entry(p, struct module, list); 3654 struct module *mod = list_entry(p, struct module, list);
3544 char buf[8]; 3655 char buf[8];
3545 3656
3657 /* We always ignore unformed modules. */
3658 if (mod->state == MODULE_STATE_UNFORMED)
3659 return 0;
3660
3546 seq_printf(m, "%s %u", 3661 seq_printf(m, "%s %u",
3547 mod->name, mod->init_size + mod->core_size); 3662 mod->name, mod->init_size + mod->core_size);
3548 print_unload_info(m, mod); 3663 print_unload_info(m, mod);
@@ -3603,6 +3718,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
3603 3718
3604 preempt_disable(); 3719 preempt_disable();
3605 list_for_each_entry_rcu(mod, &modules, list) { 3720 list_for_each_entry_rcu(mod, &modules, list) {
3721 if (mod->state == MODULE_STATE_UNFORMED)
3722 continue;
3606 if (mod->num_exentries == 0) 3723 if (mod->num_exentries == 0)
3607 continue; 3724 continue;
3608 3725
@@ -3651,10 +3768,13 @@ struct module *__module_address(unsigned long addr)
3651 if (addr < module_addr_min || addr > module_addr_max) 3768 if (addr < module_addr_min || addr > module_addr_max)
3652 return NULL; 3769 return NULL;
3653 3770
3654 list_for_each_entry_rcu(mod, &modules, list) 3771 list_for_each_entry_rcu(mod, &modules, list) {
3772 if (mod->state == MODULE_STATE_UNFORMED)
3773 continue;
3655 if (within_module_core(addr, mod) 3774 if (within_module_core(addr, mod)
3656 || within_module_init(addr, mod)) 3775 || within_module_init(addr, mod))
3657 return mod; 3776 return mod;
3777 }
3658 return NULL; 3778 return NULL;
3659} 3779}
3660EXPORT_SYMBOL_GPL(__module_address); 3780EXPORT_SYMBOL_GPL(__module_address);
@@ -3707,8 +3827,11 @@ void print_modules(void)
3707 printk(KERN_DEFAULT "Modules linked in:"); 3827 printk(KERN_DEFAULT "Modules linked in:");
3708 /* Most callers should already have preempt disabled, but make sure */ 3828 /* Most callers should already have preempt disabled, but make sure */
3709 preempt_disable(); 3829 preempt_disable();
3710 list_for_each_entry_rcu(mod, &modules, list) 3830 list_for_each_entry_rcu(mod, &modules, list) {
3831 if (mod->state == MODULE_STATE_UNFORMED)
3832 continue;
3711 printk(" %s%s", mod->name, module_flags(mod, buf)); 3833 printk(" %s%s", mod->name, module_flags(mod, buf));
3834 }
3712 preempt_enable(); 3835 preempt_enable();
3713 if (last_unloaded_module[0]) 3836 if (last_unloaded_module[0])
3714 printk(" [last unloaded: %s]", last_unloaded_module); 3837 printk(" [last unloaded: %s]", last_unloaded_module);
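A hypothetical module init illustrating the PF_USED_ASYNC logic described in the comments above (sketch only; the callback and names are invented): if init schedules async work, do_init_module() still runs async_synchronize_full() before the load completes, while an init that queues nothing no longer pays for that synchronization.

#include <linux/async.h>
#include <linux/module.h>

static void my_late_setup(void *data, async_cookie_t cookie)
{
	/* slow probing work runs here, off the module-loading task */
}

static int __init my_init(void)
{
	/*
	 * async_schedule() sets PF_USED_ASYNC on the loading task, so
	 * do_init_module() will still call async_synchronize_full()
	 * before finishing the load.
	 */
	async_schedule(my_late_setup, NULL);
	return 0;
}
module_init(my_init);
MODULE_LICENSE("GPL");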
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..52f23011b6e0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/sched/rt.h>
22#include <linux/export.h> 23#include <linux/export.h>
23#include <linux/spinlock.h> 24#include <linux/spinlock.h>
24#include <linux/interrupt.h> 25#include <linux/interrupt.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb20165..afc0456f227a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
153 goto out; 153 goto out;
154 } 154 }
155 155
156 new_ns = create_new_namespaces(flags, tsk, 156 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
157 task_cred_xxx(tsk, user_ns), tsk->fs);
158 if (IS_ERR(new_ns)) { 157 if (IS_ERR(new_ns)) {
159 err = PTR_ERR(new_ns); 158 err = PTR_ERR(new_ns);
160 goto out; 159 goto out;
@@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
251 return PTR_ERR(file); 250 return PTR_ERR(file);
252 251
253 err = -EINVAL; 252 err = -EINVAL;
254 ei = PROC_I(file->f_dentry->d_inode); 253 ei = PROC_I(file_inode(file));
255 ops = ei->ns_ops; 254 ops = ei->ns_ops;
256 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
257 goto out; 256 goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff97..7c57cc9eee2c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -259,26 +259,19 @@ unsigned long get_taint(void)
259 return tainted_mask; 259 return tainted_mask;
260} 260}
261 261
262void add_taint(unsigned flag) 262/**
263 * add_taint: add a taint flag if not already set.
264 * @flag: one of the TAINT_* constants.
265 * @lockdep_ok: whether lock debugging is still OK.
266 *
 267 * If something bad has gone wrong, you'll want @lockdep_ok = LOCKDEP_NOW_UNRELIABLE,
 268 * but for some noteworthy-but-not-corrupting cases it can be LOCKDEP_STILL_OK.
269 */
270void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
263{ 271{
264 /* 272 if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
265 * Can't trust the integrity of the kernel anymore. 273 printk(KERN_WARNING
266 * We don't call directly debug_locks_off() because the issue 274 "Disabling lock debugging due to kernel taint\n");
267 * is not necessarily serious enough to set oops_in_progress to 1
268 * Also we want to keep up lockdep for staging/out-of-tree
269 * development and post-warning case.
270 */
271 switch (flag) {
272 case TAINT_CRAP:
273 case TAINT_OOT_MODULE:
274 case TAINT_WARN:
275 case TAINT_FIRMWARE_WORKAROUND:
276 break;
277
278 default:
279 if (__debug_locks_off())
280 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
281 }
282 275
283 set_bit(flag, &tainted_mask); 276 set_bit(flag, &tainted_mask);
284} 277}
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
421 print_modules(); 414 print_modules();
422 dump_stack(); 415 dump_stack();
423 print_oops_end_marker(); 416 print_oops_end_marker();
424 add_taint(taint); 417 /* Just a warning, don't kill lockdep. */
418 add_taint(taint, LOCKDEP_STILL_OK);
425} 419}
426 420
427void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 421void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
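For callers, the add_taint() change amounts to passing one extra enum lockdep_ok value; a short sketch of the two cases the kernel-doc above distinguishes (call sites are illustrative, not from this patch):

	/* possible corruption: lock debugging becomes unreliable and is turned off */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

	/* noteworthy but non-corrupting: lockdep stays fully usable */
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);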
diff --git a/kernel/pid.c b/kernel/pid.c
index de9af600006f..047dc6264638 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -331,7 +331,7 @@ out:
331 return pid; 331 return pid;
332 332
333out_unlock: 333out_unlock:
334 spin_unlock(&pidmap_lock); 334 spin_unlock_irq(&pidmap_lock);
335out_free: 335out_free:
336 while (++i <= ns->level) 336 while (++i <= ns->level)
337 free_pidmap(pid->numbers + i); 337 free_pidmap(pid->numbers + i);
@@ -350,10 +350,9 @@ void disable_pid_allocation(struct pid_namespace *ns)
350 350
351struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 351struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
352{ 352{
353 struct hlist_node *elem;
354 struct upid *pnr; 353 struct upid *pnr;
355 354
356 hlist_for_each_entry_rcu(pnr, elem, 355 hlist_for_each_entry_rcu(pnr,
357 &pid_hash[pid_hashfn(nr, ns)], pid_chain) 356 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
358 if (pnr->nr == nr && pnr->ns == ns) 357 if (pnr->nr == nr && pnr->ns == ns)
359 return container_of(pnr, struct pid, 358 return container_of(pnr, struct pid,
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad1d5d6..8fd709c9bb58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,
155 155
156static inline cputime_t prof_ticks(struct task_struct *p) 156static inline cputime_t prof_ticks(struct task_struct *p)
157{ 157{
158 return p->utime + p->stime; 158 cputime_t utime, stime;
159
160 task_cputime(p, &utime, &stime);
161
162 return utime + stime;
159} 163}
160static inline cputime_t virt_ticks(struct task_struct *p) 164static inline cputime_t virt_ticks(struct task_struct *p)
161{ 165{
162 return p->utime; 166 cputime_t utime;
167
168 task_cputime(p, &utime, NULL);
169
170 return utime;
163} 171}
164 172
165static int 173static int
@@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head,
471 */ 479 */
472void posix_cpu_timers_exit(struct task_struct *tsk) 480void posix_cpu_timers_exit(struct task_struct *tsk)
473{ 481{
482 cputime_t utime, stime;
483
474 add_device_randomness((const void*) &tsk->se.sum_exec_runtime, 484 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
475 sizeof(unsigned long long)); 485 sizeof(unsigned long long));
486 task_cputime(tsk, &utime, &stime);
476 cleanup_timers(tsk->cpu_timers, 487 cleanup_timers(tsk->cpu_timers,
477 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 488 utime, stime, tsk->se.sum_exec_runtime);
478 489
479} 490}
480void posix_cpu_timers_exit_group(struct task_struct *tsk) 491void posix_cpu_timers_exit_group(struct task_struct *tsk)
481{ 492{
482 struct signal_struct *const sig = tsk->signal; 493 struct signal_struct *const sig = tsk->signal;
494 cputime_t utime, stime;
483 495
496 task_cputime(tsk, &utime, &stime);
484 cleanup_timers(tsk->signal->cpu_timers, 497 cleanup_timers(tsk->signal->cpu_timers,
485 tsk->utime + sig->utime, tsk->stime + sig->stime, 498 utime + sig->utime, stime + sig->stime,
486 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 499 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
487} 500}
488 501
@@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1226static inline int fastpath_timer_check(struct task_struct *tsk) 1239static inline int fastpath_timer_check(struct task_struct *tsk)
1227{ 1240{
1228 struct signal_struct *sig; 1241 struct signal_struct *sig;
1242 cputime_t utime, stime;
1243
1244 task_cputime(tsk, &utime, &stime);
1229 1245
1230 if (!task_cputime_zero(&tsk->cputime_expires)) { 1246 if (!task_cputime_zero(&tsk->cputime_expires)) {
1231 struct task_cputime task_sample = { 1247 struct task_cputime task_sample = {
1232 .utime = tsk->utime, 1248 .utime = utime,
1233 .stime = tsk->stime, 1249 .stime = stime,
1234 .sum_exec_runtime = tsk->se.sum_exec_runtime 1250 .sum_exec_runtime = tsk->se.sum_exec_runtime
1235 }; 1251 };
1236 1252
@@ -1401,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1401 while (!signal_pending(current)) { 1417 while (!signal_pending(current)) {
1402 if (timer.it.cpu.expires.sched == 0) { 1418 if (timer.it.cpu.expires.sched == 0) {
1403 /* 1419 /*
1404 * Our timer fired and was reset. 1420 * Our timer fired and was reset, below
1421 * deletion can not fail.
1405 */ 1422 */
1423 posix_cpu_timer_del(&timer);
1406 spin_unlock_irq(&timer.it_lock); 1424 spin_unlock_irq(&timer.it_lock);
1407 return 0; 1425 return 0;
1408 } 1426 }
@@ -1420,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1420 * We were interrupted by a signal. 1438 * We were interrupted by a signal.
1421 */ 1439 */
1422 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1440 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1423 posix_cpu_timer_set(&timer, 0, &zero_it, it); 1441 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1442 if (!error) {
1443 /*
1444 * Timer is now unarmed, deletion can not fail.
1445 */
1446 posix_cpu_timer_del(&timer);
1447 }
1424 spin_unlock_irq(&timer.it_lock); 1448 spin_unlock_irq(&timer.it_lock);
1425 1449
1450 while (error == TIMER_RETRY) {
1451 /*
1452 * We need to handle case when timer was or is in the
1453 * middle of firing. In other cases we already freed
1454 * resources.
1455 */
1456 spin_lock_irq(&timer.it_lock);
1457 error = posix_cpu_timer_del(&timer);
1458 spin_unlock_irq(&timer.it_lock);
1459 }
1460
1426 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { 1461 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1427 /* 1462 /*
1428 * It actually did fire already. 1463 * It actually did fire already.
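The do_cpu_nanosleep() changes make the timer teardown explicit: posix_cpu_timer_set() now reports whether disarming succeeded, and posix_cpu_timer_del() must be retried while it returns TIMER_RETRY because the timer may be in the middle of firing. The retry idiom in isolation, as a fragment that only makes sense inside posix-cpu-timers.c (the example_* name is a placeholder):

    /* Hedged sketch: delete a CPU timer that may currently be firing. */
    static void example_del_cpu_timer(struct k_itimer *timer)
    {
    	int err;

    	do {
    		spin_lock_irq(&timer->it_lock);
    		err = posix_cpu_timer_del(timer);	/* TIMER_RETRY while mid-firing */
    		spin_unlock_irq(&timer->it_lock);
    	} while (err == TIMER_RETRY);
    }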
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..6edbb2c55c22 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 552 return -EAGAIN;
553 553
554 spin_lock_init(&new_timer->it_lock); 554 spin_lock_init(&new_timer->it_lock);
555 retry: 555
556 if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { 556 idr_preload(GFP_KERNEL);
557 error = -EAGAIN;
558 goto out;
559 }
560 spin_lock_irq(&idr_lock); 557 spin_lock_irq(&idr_lock);
561 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); 558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
562 spin_unlock_irq(&idr_lock); 559 spin_unlock_irq(&idr_lock);
563 if (error) { 560 idr_preload_end();
564 if (error == -EAGAIN) 561 if (error < 0) {
565 goto retry;
566 /* 562 /*
567 * Weird looking, but we return EAGAIN if the IDR is 563 * Weird looking, but we return EAGAIN if the IDR is
568 * full (proper POSIX return value for this) 564 * full (proper POSIX return value for this)
569 */ 565 */
570 error = -EAGAIN; 566 if (error == -ENOSPC)
567 error = -EAGAIN;
571 goto out; 568 goto out;
572 } 569 }
570 new_timer_id = error;
573 571
574 it_id_set = IT_ID_SET; 572 it_id_set = IT_ID_SET;
575 new_timer->it_id = (timer_t) new_timer_id; 573 new_timer->it_id = (timer_t) new_timer_id;
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
639{ 637{
640 struct k_itimer *timr; 638 struct k_itimer *timr;
641 639
640 /*
641 * timer_t could be any type >= int and we want to make sure any
642 * @timer_id outside positive int range fails lookup.
643 */
644 if ((unsigned long long)timer_id > INT_MAX)
645 return NULL;
646
642 rcu_read_lock(); 647 rcu_read_lock();
643 timr = idr_find(&posix_timers_id, (int)timer_id); 648 timr = idr_find(&posix_timers_id, (int)timer_id);
644 if (timr) { 649 if (timr) {
@@ -997,7 +1002,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
997 1002
998 err = kc->clock_adj(which_clock, &ktx); 1003 err = kc->clock_adj(which_clock, &ktx);
999 1004
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) 1005 if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT; 1006 return -EFAULT;
1002 1007
1003 return err; 1008 return err;
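timer_create() is converted from the old idr_pre_get()/idr_get_new() retry dance to the newer idr_preload()/idr_alloc() API: preallocate outside the spinlock, allocate with GFP_NOWAIT while holding it, then close the preload section. The generic pattern, with placeholder names (my_idr, my_lock, example_alloc_id):

    #include <linux/idr.h>
    #include <linux/spinlock.h>

    static DEFINE_IDR(my_idr);
    static DEFINE_SPINLOCK(my_lock);

    /* Returns the new id (>= 0) or a negative errno such as -ENOSPC. */
    static int example_alloc_id(void *obj)
    {
    	int id;

    	idr_preload(GFP_KERNEL);		/* may sleep; fills per-CPU reserves */
    	spin_lock(&my_lock);
    	id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
    	spin_unlock(&my_lock);
    	idr_preload_end();

    	return id;
    }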
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
66 66
67void queue_up_suspend_work(void) 67void queue_up_suspend_work(void)
68{ 68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) 69 if (autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work); 70 queue_work(autosleep_wq, &suspend_work);
71} 71}
72 72
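The work_pending() test is dropped because queue_work() already refuses to queue a work item that is still pending and simply returns false, so the extra check only widened a race window. A minimal sketch of relying on that behaviour (the example_* name is a placeholder):

    #include <linux/workqueue.h>

    static void example_kick(struct workqueue_struct *wq, struct work_struct *work)
    {
    	if (!queue_work(wq, work))
    		pr_debug("work was already pending, nothing queued\n");
    }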
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de1..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
313static suspend_state_t decode_state(const char *buf, size_t n) 313static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_STANDBY; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 const char * const *s;
318#endif 318#endif
319 char *p; 319 char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
553 553
554#endif /* CONFIG_PM_TRACE */ 554#endif /* CONFIG_PM_TRACE */
555 555
556#ifdef CONFIG_FREEZER
557static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
558 struct kobj_attribute *attr, char *buf)
559{
560 return sprintf(buf, "%u\n", freeze_timeout_msecs);
561}
562
563static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
564 struct kobj_attribute *attr,
565 const char *buf, size_t n)
566{
567 unsigned long val;
568
569 if (kstrtoul(buf, 10, &val))
570 return -EINVAL;
571
572 freeze_timeout_msecs = val;
573 return n;
574}
575
576power_attr(pm_freeze_timeout);
577
578#endif /* CONFIG_FREEZER*/
579
556static struct attribute * g[] = { 580static struct attribute * g[] = {
557 &state_attr.attr, 581 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 582#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
576 &pm_print_times_attr.attr, 600 &pm_print_times_attr.attr,
577#endif 601#endif
578#endif 602#endif
603#ifdef CONFIG_FREEZER
604 &pm_freeze_timeout_attr.attr,
605#endif
579 NULL, 606 NULL,
580}; 607};
581 608
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6f..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
25 25
26static int try_to_freeze_tasks(bool user_only) 26static int try_to_freeze_tasks(bool user_only)
27{ 27{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
36 36
37 do_gettimeofday(&start); 37 do_gettimeofday(&start);
38 38
39 end_time = jiffies + TIMEOUT; 39 end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
40 40
41 if (!user_only) 41 if (!user_only)
42 freeze_workqueues_begin(); 42 freeze_workqueues_begin();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7eaad6..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
359 return; 359 return;
360 } 360 }
361 361
362 if (delayed_work_pending(&req->work)) 362 cancel_delayed_work_sync(&req->work);
363 cancel_delayed_work_sync(&req->work);
364 363
365 if (new_value != req->node.prio) 364 if (new_value != req->node.prio)
366 pm_qos_update_target( 365 pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
386 "%s called for unknown object.", __func__)) 385 "%s called for unknown object.", __func__))
387 return; 386 return;
388 387
389 if (delayed_work_pending(&req->work)) 388 cancel_delayed_work_sync(&req->work);
390 cancel_delayed_work_sync(&req->work);
391 389
392 if (new_value != req->node.prio) 390 if (new_value != req->node.prio)
393 pm_qos_update_target( 391 pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 return; 414 return;
417 } 415 }
418 416
419 if (delayed_work_pending(&req->work)) 417 cancel_delayed_work_sync(&req->work);
420 cancel_delayed_work_sync(&req->work);
421 418
422 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
423 &req->node, PM_QOS_REMOVE_REQ, 420 &req->node, PM_QOS_REMOVE_REQ,
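Same idea in pm_qos: guarding cancel_delayed_work_sync() with delayed_work_pending() is racy and unnecessary, since the cancel helper copes with work that is not pending and returns whether it actually was. Hedged sketch (example_stop is a placeholder):

    #include <linux/workqueue.h>

    static void example_stop(struct delayed_work *dwork)
    {
    	bool was_pending = cancel_delayed_work_sync(dwork);

    	pr_debug("delayed work %s pending\n", was_pending ? "was" : "was not");
    }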
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
30#include "power.h" 30#include "power.h"
31 31
32const char *const pm_states[PM_SUSPEND_MAX] = { 32const char *const pm_states[PM_SUSPEND_MAX] = {
33 [PM_SUSPEND_FREEZE] = "freeze",
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
36 37
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
40static bool need_suspend_ops(suspend_state_t state)
41{
42 return !!(state > PM_SUSPEND_FREEZE);
43}
44
45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
46static bool suspend_freeze_wake;
47
48static void freeze_begin(void)
49{
50 suspend_freeze_wake = false;
51}
52
53static void freeze_enter(void)
54{
55 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
56}
57
58void freeze_wake(void)
59{
60 suspend_freeze_wake = true;
61 wake_up(&suspend_freeze_wait_head);
62}
63EXPORT_SYMBOL_GPL(freeze_wake);
64
39/** 65/**
40 * suspend_set_ops - Set the global suspend method table. 66 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 67 * @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
50 76
51bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
52{ 78{
79 if (state == PM_SUSPEND_FREEZE)
80 return true;
53 /* 81 /*
54 * All states need lowlevel support and need to be valid to the lowlevel 82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel
55 * implementation, no valid callback implies that none are valid. 84 * implementation, no valid callback implies that none are valid.
56 */ 85 */
57 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); 86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 118 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes. 119 * freeze processes.
91 */ 120 */
92static int suspend_prepare(void) 121static int suspend_prepare(suspend_state_t state)
93{ 122{
94 int error; 123 int error;
95 124
96 if (!suspend_ops || !suspend_ops->enter) 125 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
97 return -EPERM; 126 return -EPERM;
98 127
99 pm_prepare_console(); 128 pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
137{ 166{
138 int error; 167 int error;
139 168
140 if (suspend_ops->prepare) { 169 if (need_suspend_ops(state) && suspend_ops->prepare) {
141 error = suspend_ops->prepare(); 170 error = suspend_ops->prepare();
142 if (error) 171 if (error)
143 goto Platform_finish; 172 goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
149 goto Platform_finish; 178 goto Platform_finish;
150 } 179 }
151 180
152 if (suspend_ops->prepare_late) { 181 if (need_suspend_ops(state) && suspend_ops->prepare_late) {
153 error = suspend_ops->prepare_late(); 182 error = suspend_ops->prepare_late();
154 if (error) 183 if (error)
155 goto Platform_wake; 184 goto Platform_wake;
156 } 185 }
157 186
187 /*
188 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors.
190 * Thus we should invoke freeze_enter() soon after
191 * all the devices are suspended.
192 */
193 if (state == PM_SUSPEND_FREEZE) {
194 freeze_enter();
195 goto Platform_wake;
196 }
197
158 if (suspend_test(TEST_PLATFORM)) 198 if (suspend_test(TEST_PLATFORM))
159 goto Platform_wake; 199 goto Platform_wake;
160 200
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
182 enable_nonboot_cpus(); 222 enable_nonboot_cpus();
183 223
184 Platform_wake: 224 Platform_wake:
185 if (suspend_ops->wake) 225 if (need_suspend_ops(state) && suspend_ops->wake)
186 suspend_ops->wake(); 226 suspend_ops->wake();
187 227
188 dpm_resume_start(PMSG_RESUME); 228 dpm_resume_start(PMSG_RESUME);
189 229
190 Platform_finish: 230 Platform_finish:
191 if (suspend_ops->finish) 231 if (need_suspend_ops(state) && suspend_ops->finish)
192 suspend_ops->finish(); 232 suspend_ops->finish();
193 233
194 return error; 234 return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
203 int error; 243 int error;
204 bool wakeup = false; 244 bool wakeup = false;
205 245
206 if (!suspend_ops) 246 if (need_suspend_ops(state) && !suspend_ops)
207 return -ENOSYS; 247 return -ENOSYS;
208 248
209 trace_machine_suspend(state); 249 trace_machine_suspend(state);
210 if (suspend_ops->begin) { 250 if (need_suspend_ops(state) && suspend_ops->begin) {
211 error = suspend_ops->begin(state); 251 error = suspend_ops->begin(state);
212 if (error) 252 if (error)
213 goto Close; 253 goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
226 266
227 do { 267 do {
228 error = suspend_enter(state, &wakeup); 268 error = suspend_enter(state, &wakeup);
229 } while (!error && !wakeup 269 } while (!error && !wakeup && need_suspend_ops(state)
230 && suspend_ops->suspend_again && suspend_ops->suspend_again()); 270 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 271
232 Resume_devices: 272 Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
236 ftrace_start(); 276 ftrace_start();
237 resume_console(); 277 resume_console();
238 Close: 278 Close:
239 if (suspend_ops->end) 279 if (need_suspend_ops(state) && suspend_ops->end)
240 suspend_ops->end(); 280 suspend_ops->end();
241 trace_machine_suspend(PWR_EVENT_EXIT); 281 trace_machine_suspend(PWR_EVENT_EXIT);
242 return error; 282 return error;
243 283
244 Recover_platform: 284 Recover_platform:
245 if (suspend_ops->recover) 285 if (need_suspend_ops(state) && suspend_ops->recover)
246 suspend_ops->recover(); 286 suspend_ops->recover();
247 goto Resume_devices; 287 goto Resume_devices;
248} 288}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
278 if (!mutex_trylock(&pm_mutex)) 318 if (!mutex_trylock(&pm_mutex))
279 return -EBUSY; 319 return -EBUSY;
280 320
321 if (state == PM_SUSPEND_FREEZE)
322 freeze_begin();
323
281 printk(KERN_INFO "PM: Syncing filesystems ... "); 324 printk(KERN_INFO "PM: Syncing filesystems ... ");
282 sys_sync(); 325 sys_sync();
283 printk("done.\n"); 326 printk("done.\n");
284 327
285 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 328 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
286 error = suspend_prepare(); 329 error = suspend_prepare(state);
287 if (error) 330 if (error)
288 goto Unlock; 331 goto Unlock;
289 332
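The new PM_SUSPEND_FREEZE state parks the suspending task on a wait queue via freeze_enter() until something calls freeze_wake(), and enter_state() resets the flag with freeze_begin() before each attempt. The same flag-plus-waitqueue idiom in isolation, with placeholder names rather than the real power code:

    static DECLARE_WAIT_QUEUE_HEAD(example_wq);
    static bool example_done;

    static void example_begin(void)
    {
    	example_done = false;
    }

    static void example_wait(void)
    {
    	wait_event(example_wq, example_done);	/* sleep until the flag is set */
    }

    static void example_wake(void)
    {
    	example_done = true;			/* set the condition first ... */
    	wake_up(&example_wq);			/* ... then wake the waiter */
    }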
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac7..9b2a1d58558d 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
112 rtc_set_alarm(rtc, &alm); 112 rtc_set_alarm(rtc, &alm);
113} 113}
114 114
115static int __init has_wakealarm(struct device *dev, void *name_ptr) 115static int __init has_wakealarm(struct device *dev, const void *data)
116{ 116{
117 struct rtc_device *candidate = to_rtc_device(dev); 117 struct rtc_device *candidate = to_rtc_device(dev);
118 118
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
121 if (!device_may_wakeup(candidate->dev.parent)) 121 if (!device_may_wakeup(candidate->dev.parent))
122 return 0; 122 return 0;
123 123
124 *(const char **)name_ptr = dev_name(dev);
125 return 1; 124 return 1;
126} 125}
127 126
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
159 static char warn_no_rtc[] __initdata = 158 static char warn_no_rtc[] __initdata =
160 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; 159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
161 160
162 char *pony = NULL;
163 struct rtc_device *rtc = NULL; 161 struct rtc_device *rtc = NULL;
162 struct device *dev;
164 163
165 /* PM is initialized by now; is that state testable? */ 164 /* PM is initialized by now; is that state testable? */
166 if (test_state == PM_SUSPEND_ON) 165 if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
171 } 170 }
172 171
173 /* RTCs have initialized by now too ... can we use one? */ 172 /* RTCs have initialized by now too ... can we use one? */
174 class_find_device(rtc_class, NULL, &pony, has_wakealarm); 173 dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
175 if (pony) 174 if (dev)
176 rtc = rtc_class_open(pony); 175 rtc = rtc_class_open(dev_name(dev));
177 if (!rtc) { 176 if (!rtc) {
178 printk(warn_no_rtc); 177 printk(warn_no_rtc);
179 goto done; 178 goto done;
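suspend_test.c is adapted to the new class_find_device() signature: the match callback now takes a const void *data cookie, and the caller receives the matching struct device * directly instead of smuggling a name out through the cookie. The core of the new lookup, condensed from the hunk above:

    /* Condensed sketch of the new lookup; error handling trimmed. */
    static struct rtc_device * __init example_find_wakealarm_rtc(void)
    {
    	struct device *dev;

    	dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
    	return dev ? rtc_class_open(dev_name(dev)) : NULL;
    }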
diff --git a/kernel/printk.c b/kernel/printk.c
index 357f714ddd49..0b31715f335a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h>
45 46
46#include <asm/uaccess.h> 47#include <asm/uaccess.h>
47 48
@@ -1967,30 +1968,32 @@ int is_console_locked(void)
1967static DEFINE_PER_CPU(int, printk_pending); 1968static DEFINE_PER_CPU(int, printk_pending);
1968static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); 1969static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1969 1970
1970void printk_tick(void) 1971static void wake_up_klogd_work_func(struct irq_work *irq_work)
1971{ 1972{
1972 if (__this_cpu_read(printk_pending)) { 1973 int pending = __this_cpu_xchg(printk_pending, 0);
1973 int pending = __this_cpu_xchg(printk_pending, 0); 1974
1974 if (pending & PRINTK_PENDING_SCHED) { 1975 if (pending & PRINTK_PENDING_SCHED) {
1975 char *buf = __get_cpu_var(printk_sched_buf); 1976 char *buf = __get_cpu_var(printk_sched_buf);
1976 printk(KERN_WARNING "[sched_delayed] %s", buf); 1977 printk(KERN_WARNING "[sched_delayed] %s", buf);
1977 }
1978 if (pending & PRINTK_PENDING_WAKEUP)
1979 wake_up_interruptible(&log_wait);
1980 } 1978 }
1981}
1982 1979
1983int printk_needs_cpu(int cpu) 1980 if (pending & PRINTK_PENDING_WAKEUP)
1984{ 1981 wake_up_interruptible(&log_wait);
1985 if (cpu_is_offline(cpu))
1986 printk_tick();
1987 return __this_cpu_read(printk_pending);
1988} 1982}
1989 1983
1984static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
1985 .func = wake_up_klogd_work_func,
1986 .flags = IRQ_WORK_LAZY,
1987};
1988
1990void wake_up_klogd(void) 1989void wake_up_klogd(void)
1991{ 1990{
1992 if (waitqueue_active(&log_wait)) 1991 preempt_disable();
1992 if (waitqueue_active(&log_wait)) {
1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1994 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
1995 }
1996 preempt_enable();
1994} 1997}
1995 1998
1996static void console_cont_flush(char *text, size_t size) 1999static void console_cont_flush(char *text, size_t size)
@@ -2471,6 +2474,7 @@ int printk_sched(const char *fmt, ...)
2471 va_end(args); 2474 va_end(args);
2472 2475
2473 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2476 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
2477 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2474 local_irq_restore(flags); 2478 local_irq_restore(flags);
2475 2479
2476 return r; 2480 return r;
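printk_tick()/printk_needs_cpu() give way to a lazy per-CPU irq_work: the wakeup reason is recorded with this_cpu_or(), the work is queued with irq_work_queue(), and IRQ_WORK_LAZY means it runs from the next timer tick rather than raising an IPI, which is safe even deep inside printk paths. A reduced sketch of the same pattern (the example_* names are placeholders):

    #include <linux/irq_work.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(int, example_pending);

    static void example_work_func(struct irq_work *work)
    {
    	int pending = __this_cpu_xchg(example_pending, 0);

    	if (pending)
    		pr_info("deferred wakeup handled\n");
    }

    static DEFINE_PER_CPU(struct irq_work, example_work) = {
    	.func	= example_work_func,
    	.flags	= IRQ_WORK_LAZY,	/* run at the next tick, no IPI */
    };

    /* Callable from any context. */
    static void example_poke(void)
    {
    	preempt_disable();
    	__this_cpu_or(example_pending, 1);
    	irq_work_queue(&__get_cpu_var(example_work));
    	preempt_enable();
    }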
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42f..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@ struct profile_hit {
37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) 37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) 38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
39 39
40/* Oprofile timer tick hook */
41static int (*timer_hook)(struct pt_regs *) __read_mostly;
42
43static atomic_t *prof_buffer; 40static atomic_t *prof_buffer;
44static unsigned long prof_len, prof_shift; 41static unsigned long prof_len, prof_shift;
45 42
@@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
208} 205}
209EXPORT_SYMBOL_GPL(profile_event_unregister); 206EXPORT_SYMBOL_GPL(profile_event_unregister);
210 207
211int register_timer_hook(int (*hook)(struct pt_regs *))
212{
213 if (timer_hook)
214 return -EBUSY;
215 timer_hook = hook;
216 return 0;
217}
218EXPORT_SYMBOL_GPL(register_timer_hook);
219
220void unregister_timer_hook(int (*hook)(struct pt_regs *))
221{
222 WARN_ON(hook != timer_hook);
223 timer_hook = NULL;
224 /* make sure all CPUs see the NULL hook */
225 synchronize_sched(); /* Allow ongoing interrupts to complete. */
226}
227EXPORT_SYMBOL_GPL(unregister_timer_hook);
228
229
230#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
231/* 209/*
232 * Each cpu has a pair of open-addressed hashtables for pending 210 * Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@ void profile_tick(int type)
436{ 414{
437 struct pt_regs *regs = get_irq_regs(); 415 struct pt_regs *regs = get_irq_regs();
438 416
439 if (type == CPU_PROFILING && timer_hook)
440 timer_hook(regs);
441 if (!user_mode(regs) && prof_cpu_mask != NULL && 417 if (!user_mode(regs) && prof_cpu_mask != NULL &&
442 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) 418 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
443 profile_hit(type, (void *)profile_pc(regs)); 419 profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1599157336a6..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child)
117 * TASK_KILLABLE sleeps. 117 * TASK_KILLABLE sleeps.
118 */ 118 */
119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) 119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
120 signal_wake_up(child, task_is_traced(child)); 120 ptrace_signal_wake_up(child, true);
121 121
122 spin_unlock(&child->sighand->siglock); 122 spin_unlock(&child->sighand->siglock);
123} 123}
124 124
125/* Ensure that nothing can wake it up, even SIGKILL */
126static bool ptrace_freeze_traced(struct task_struct *task)
127{
128 bool ret = false;
129
130 /* Lockless, nobody but us can set this flag */
131 if (task->jobctl & JOBCTL_LISTENING)
132 return ret;
133
134 spin_lock_irq(&task->sighand->siglock);
135 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
136 task->state = __TASK_TRACED;
137 ret = true;
138 }
139 spin_unlock_irq(&task->sighand->siglock);
140
141 return ret;
142}
143
144static void ptrace_unfreeze_traced(struct task_struct *task)
145{
146 if (task->state != __TASK_TRACED)
147 return;
148
149 WARN_ON(!task->ptrace || task->parent != current);
150
151 spin_lock_irq(&task->sighand->siglock);
152 if (__fatal_signal_pending(task))
153 wake_up_state(task, __TASK_TRACED);
154 else
155 task->state = TASK_TRACED;
156 spin_unlock_irq(&task->sighand->siglock);
157}
158
125/** 159/**
126 * ptrace_check_attach - check whether ptracee is ready for ptrace operation 160 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
127 * @child: ptracee to check for 161 * @child: ptracee to check for
@@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child)
139 * RETURNS: 173 * RETURNS:
140 * 0 on success, -ESRCH if %child is not ready. 174 * 0 on success, -ESRCH if %child is not ready.
141 */ 175 */
142int ptrace_check_attach(struct task_struct *child, bool ignore_state) 176static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
143{ 177{
144 int ret = -ESRCH; 178 int ret = -ESRCH;
145 179
@@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
151 * be changed by us so it's not changing right after this. 185 * be changed by us so it's not changing right after this.
152 */ 186 */
153 read_lock(&tasklist_lock); 187 read_lock(&tasklist_lock);
154 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 188 if (child->ptrace && child->parent == current) {
189 WARN_ON(child->state == __TASK_TRACED);
155 /* 190 /*
156 * child->sighand can't be NULL, release_task() 191 * child->sighand can't be NULL, release_task()
157 * does ptrace_unlink() before __exit_signal(). 192 * does ptrace_unlink() before __exit_signal().
158 */ 193 */
159 spin_lock_irq(&child->sighand->siglock); 194 if (ignore_state || ptrace_freeze_traced(child))
160 WARN_ON_ONCE(task_is_stopped(child));
161 if (ignore_state || (task_is_traced(child) &&
162 !(child->jobctl & JOBCTL_LISTENING)))
163 ret = 0; 195 ret = 0;
164 spin_unlock_irq(&child->sighand->siglock);
165 } 196 }
166 read_unlock(&tasklist_lock); 197 read_unlock(&tasklist_lock);
167 198
168 if (!ret && !ignore_state) 199 if (!ret && !ignore_state) {
169 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 200 if (!wait_task_inactive(child, __TASK_TRACED)) {
201 /*
202 * This can only happen if may_ptrace_stop() fails and
203 * ptrace_stop() changes ->state back to TASK_RUNNING,
204 * so we should not worry about leaking __TASK_TRACED.
205 */
206 WARN_ON(child->state == __TASK_TRACED);
207 ret = -ESRCH;
208 }
209 }
170 210
171 /* All systems go.. */
172 return ret; 211 return ret;
173} 212}
174 213
@@ -317,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request,
317 */ 356 */
318 if (task_is_stopped(task) && 357 if (task_is_stopped(task) &&
319 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) 358 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
320 signal_wake_up(task, 1); 359 signal_wake_up_state(task, __TASK_STOPPED);
321 360
322 spin_unlock(&task->sighand->siglock); 361 spin_unlock(&task->sighand->siglock);
323 362
@@ -673,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
673 kiov->iov_len, kiov->iov_base); 712 kiov->iov_len, kiov->iov_base);
674} 713}
675 714
715/*
716 * This is declared in linux/regset.h and defined in machine-dependent
717 * code. We put the export here, near the primary machine-neutral use,
718 * to ensure no machine forgets it.
719 */
720EXPORT_SYMBOL_GPL(task_user_regset_view);
676#endif 721#endif
677 722
678int ptrace_request(struct task_struct *child, long request, 723int ptrace_request(struct task_struct *child, long request,
@@ -737,7 +782,7 @@ int ptrace_request(struct task_struct *child, long request,
737 * tracee into STOP. 782 * tracee into STOP.
738 */ 783 */
739 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) 784 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
740 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); 785 ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
741 786
742 unlock_task_sighand(child, &flags); 787 unlock_task_sighand(child, &flags);
743 ret = 0; 788 ret = 0;
@@ -763,7 +808,7 @@ int ptrace_request(struct task_struct *child, long request,
763 * start of this trap and now. Trigger re-trap. 808 * start of this trap and now. Trigger re-trap.
764 */ 809 */
765 if (child->jobctl & JOBCTL_TRAP_NOTIFY) 810 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
766 signal_wake_up(child, true); 811 ptrace_signal_wake_up(child, true);
767 ret = 0; 812 ret = 0;
768 } 813 }
769 unlock_task_sighand(child, &flags); 814 unlock_task_sighand(child, &flags);
@@ -900,6 +945,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
900 goto out_put_task_struct; 945 goto out_put_task_struct;
901 946
902 ret = arch_ptrace(child, request, addr, data); 947 ret = arch_ptrace(child, request, addr, data);
948 if (ret || request != PTRACE_DETACH)
949 ptrace_unfreeze_traced(child);
903 950
904 out_put_task_struct: 951 out_put_task_struct:
905 put_task_struct(child); 952 put_task_struct(child);
@@ -1039,8 +1086,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1039 1086
1040 ret = ptrace_check_attach(child, request == PTRACE_KILL || 1087 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1041 request == PTRACE_INTERRUPT); 1088 request == PTRACE_INTERRUPT);
1042 if (!ret) 1089 if (!ret) {
1043 ret = compat_arch_ptrace(child, request, addr, data); 1090 ret = compat_arch_ptrace(child, request, addr, data);
1091 if (ret || request != PTRACE_DETACH)
1092 ptrace_unfreeze_traced(child);
1093 }
1044 1094
1045 out_put_task_struct: 1095 out_put_task_struct:
1046 put_task_struct(child); 1096 put_task_struct(child);
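ptrace_check_attach() now freezes a traced child in __TASK_TRACED so that nothing, not even SIGKILL, can wake it while the tracer operates on it, and every successful check must be paired with ptrace_unfreeze_traced() once the request finishes (PTRACE_DETACH excepted). The pairing, schematically, as it appears in the compat syscall above (a fragment, not standalone code):

    /* Schematic pairing around a ptrace request (see the hunks above). */
    ret = ptrace_check_attach(child, ignore_state);	/* may freeze to __TASK_TRACED */
    if (!ret) {
    	ret = compat_arch_ptrace(child, request, addr, data);
    	if (ret || request != PTRACE_DETACH)
    		ptrace_unfreeze_traced(child);		/* allow SIGKILL again */
    }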
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 20dfba576c2b..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
111 111
112extern int rcu_expedited; 112extern int rcu_expedited;
113 113
114#ifdef CONFIG_RCU_STALL_COMMON
115
116extern int rcu_cpu_stall_suppress;
117int rcu_jiffies_till_stall_check(void);
118
119#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
120
114#endif /* __LINUX_RCU_H */ 121#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b44..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
405 405
406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) 407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
408 unsigned long secs,
409 unsigned long c_old, unsigned long c)
408{ 410{
409 trace_rcu_torture_read(rcutorturename, rhp); 411 trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
410} 412}
411EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); 413EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
412#else 414#else
413#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) 415#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
416 do { } while (0)
414#endif 417#endif
418
419#ifdef CONFIG_RCU_STALL_COMMON
420
421#ifdef CONFIG_PROVE_RCU
422#define RCU_STALL_DELAY_DELTA (5 * HZ)
423#else
424#define RCU_STALL_DELAY_DELTA 0
425#endif
426
427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
428int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
429
430module_param(rcu_cpu_stall_suppress, int, 0644);
431module_param(rcu_cpu_stall_timeout, int, 0644);
432
433int rcu_jiffies_till_stall_check(void)
434{
435 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
436
437 /*
438 * Limit check must be consistent with the Kconfig limits
439 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
440 */
441 if (till_stall_check < 3) {
442 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
443 till_stall_check = 3;
444 } else if (till_stall_check > 300) {
445 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
446 till_stall_check = 300;
447 }
448 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
449}
450
451static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
452{
453 rcu_cpu_stall_suppress = 1;
454 return NOTIFY_DONE;
455}
456
457static struct notifier_block rcu_panic_block = {
458 .notifier_call = rcu_panic,
459};
460
461static int __init check_cpu_stall_init(void)
462{
463 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
464 return 0;
465}
466early_initcall(check_cpu_stall_init);
467
468#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
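rcu_jiffies_till_stall_check(), now shared by all RCU flavours, clamps the rcu_cpu_stall_timeout module parameter to the 3..300 second range and converts it to jiffies, adding a 5*HZ slack only under CONFIG_PROVE_RCU. A worked example of the arithmetic, assuming HZ == 1000 and PROVE_RCU disabled:

    /* Assumes HZ == 1000 and CONFIG_PROVE_RCU=n (delta == 0). */
    static int example_stall_jiffies(void)
    {
    	/* rcu_cpu_stall_timeout = 21  ->  21 * HZ             ==  21000 jiffies */
    	/* rcu_cpu_stall_timeout = 0   ->  clamped up to 3     ==   3000 jiffies */
    	/* rcu_cpu_stall_timeout = 999 ->  clamped down to 300 == 300000 jiffies */
    	return rcu_jiffies_till_stall_check();
    }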
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2a..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h"
55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 55
56#include "rcutiny_plugin.h"
57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
60{ 60{
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
193 * interrupts don't count, we must be running at the first interrupt 193 * interrupts don't count, we must be running at the first interrupt
194 * level. 194 * level.
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196static int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 1; 198 return rcu_dynticks_nesting <= 1;
199} 199}
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
205 */ 205 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 207{
208 reset_cpu_stall_ticks(rcp);
208 if (rcp->rcucblist != NULL && 209 if (rcp->rcucblist != NULL &&
209 rcp->donetail != rcp->curtail) { 210 rcp->donetail != rcp->curtail) {
210 rcp->donetail = rcp->curtail; 211 rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
251 */ 252 */
252void rcu_check_callbacks(int cpu, int user) 253void rcu_check_callbacks(int cpu, int user)
253{ 254{
255 check_cpu_stalls();
254 if (user || rcu_is_cpu_rrupt_from_idle()) 256 if (user || rcu_is_cpu_rrupt_from_idle())
255 rcu_sched_qs(cpu); 257 rcu_sched_qs(cpu);
256 else if (!in_softirq()) 258 else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309b..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
36 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(char *name); /* Name of RCU type. */
37}; 40};
38 41
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
54EXPORT_SYMBOL_GPL(rcu_scheduler_active); 57EXPORT_SYMBOL_GPL(rcu_scheduler_active);
55#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
56 59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
57#ifdef CONFIG_TINY_PREEMPT_RCU 105#ifdef CONFIG_TINY_PREEMPT_RCU
58 106
59#include <linux/delay.h> 107#include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
448 /* Official start of GP. */ 496 /* Official start of GP. */
449 rcu_preempt_ctrlblk.gpnum++; 497 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); 498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
451 500
452 /* Any blocked RCU readers block new GP. */ 501 /* Any blocked RCU readers block new GP. */
453 if (rcu_preempt_blocked_readers_any()) 502 if (rcu_preempt_blocked_readers_any())
@@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
1054MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1055MODULE_LICENSE("GPL"); 1104MODULE_LICENSE("GPL");
1056 1105
1106static void check_cpu_stall_preempt(void)
1107{
1108#ifdef CONFIG_TINY_PREEMPT_RCU
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
1111}
1112
1057#endif /* #ifdef CONFIG_RCU_TRACE */ 1113#endif /* #ifdef CONFIG_RCU_TRACE */
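The tiny stall checker compares jiffies with ULONG_CMP_GE() rather than a plain >=, so the test keeps working across jiffies wraparound. Quoting the macro from memory (treat the exact definition as an assumption), it turns the comparison into a bounded unsigned difference:

    /* Wrap-safe "a is at or past b" for jiffies-style counters (assumed form). */
    #define EXAMPLE_ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

    /* e.g. a = 5, b = ULONG_MAX - 10: (a) - (b) == 16, so a counts as later. */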
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85fd..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h>
49#include <asm/byteorder.h> 50#include <asm/byteorder.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
207#define rcu_can_boost() 0 208#define rcu_can_boost() 0
208#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 209#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
209 210
211#ifdef CONFIG_RCU_TRACE
212static u64 notrace rcu_trace_clock_local(void)
213{
214 u64 ts = trace_clock_local();
215 unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
216 return ts;
217}
218#else /* #ifdef CONFIG_RCU_TRACE */
219static u64 notrace rcu_trace_clock_local(void)
220{
221 return 0ULL;
222}
223#endif /* #else #ifdef CONFIG_RCU_TRACE */
224
210static unsigned long shutdown_time; /* jiffies to system shutdown. */ 225static unsigned long shutdown_time; /* jiffies to system shutdown. */
211static unsigned long boost_starttime; /* jiffies of next boost test start. */ 226static unsigned long boost_starttime; /* jiffies of next boost test start. */
212DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 227DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
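rcu_trace_clock_local() above converts the nanosecond trace clock to microseconds with do_div(), the helper required for 64-bit division on 32-bit architectures: it divides its first argument in place and returns the remainder. Hedged sketch (example_ns_to_us is a placeholder):

    #include <linux/time.h>
    #include <asm/div64.h>

    static u64 example_ns_to_us(u64 ns)
    {
    	u32 rem = do_div(ns, NSEC_PER_USEC);	/* ns now holds microseconds */

    	(void)rem;	/* sub-microsecond remainder, unused here */
    	return ns;
    }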
@@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg)
845 /* Wait for the next test interval. */ 860 /* Wait for the next test interval. */
846 oldstarttime = boost_starttime; 861 oldstarttime = boost_starttime;
847 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 862 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
848 schedule_timeout_uninterruptible(1); 863 schedule_timeout_interruptible(oldstarttime - jiffies);
849 rcu_stutter_wait("rcu_torture_boost"); 864 rcu_stutter_wait("rcu_torture_boost");
850 if (kthread_should_stop() || 865 if (kthread_should_stop() ||
851 fullstop != FULLSTOP_DONTSTOP) 866 fullstop != FULLSTOP_DONTSTOP)
@@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void)
1028 return; 1043 return;
1029 if (atomic_xchg(&beenhere, 1) != 0) 1044 if (atomic_xchg(&beenhere, 1) != 0)
1030 return; 1045 return;
1031 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
1032 ftrace_dump(DUMP_ALL); 1046 ftrace_dump(DUMP_ALL);
1033} 1047}
1034 1048
@@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
1042{ 1056{
1043 int idx; 1057 int idx;
1044 int completed; 1058 int completed;
1059 int completed_end;
1045 static DEFINE_RCU_RANDOM(rand); 1060 static DEFINE_RCU_RANDOM(rand);
1046 static DEFINE_SPINLOCK(rand_lock); 1061 static DEFINE_SPINLOCK(rand_lock);
1047 struct rcu_torture *p; 1062 struct rcu_torture *p;
1048 int pipe_count; 1063 int pipe_count;
1064 unsigned long long ts;
1049 1065
1050 idx = cur_ops->readlock(); 1066 idx = cur_ops->readlock();
1051 completed = cur_ops->completed(); 1067 completed = cur_ops->completed();
1068 ts = rcu_trace_clock_local();
1052 p = rcu_dereference_check(rcu_torture_current, 1069 p = rcu_dereference_check(rcu_torture_current,
1053 rcu_read_lock_bh_held() || 1070 rcu_read_lock_bh_held() ||
1054 rcu_read_lock_sched_held() || 1071 rcu_read_lock_sched_held() ||
@@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
1058 cur_ops->readunlock(idx); 1075 cur_ops->readunlock(idx);
1059 return; 1076 return;
1060 } 1077 }
1061 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1062 if (p->rtort_mbtest == 0) 1078 if (p->rtort_mbtest == 0)
1063 atomic_inc(&n_rcu_torture_mberror); 1079 atomic_inc(&n_rcu_torture_mberror);
1064 spin_lock(&rand_lock); 1080 spin_lock(&rand_lock);
@@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
1071 /* Should not happen, but... */ 1087 /* Should not happen, but... */
1072 pipe_count = RCU_TORTURE_PIPE_LEN; 1088 pipe_count = RCU_TORTURE_PIPE_LEN;
1073 } 1089 }
1074 if (pipe_count > 1) 1090 completed_end = cur_ops->completed();
1091 if (pipe_count > 1) {
1092 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1093 completed, completed_end);
1075 rcutorture_trace_dump(); 1094 rcutorture_trace_dump();
1095 }
1076 __this_cpu_inc(rcu_torture_count[pipe_count]); 1096 __this_cpu_inc(rcu_torture_count[pipe_count]);
1077 completed = cur_ops->completed() - completed; 1097 completed = completed_end - completed;
1078 if (completed > RCU_TORTURE_PIPE_LEN) { 1098 if (completed > RCU_TORTURE_PIPE_LEN) {
1079 /* Should not happen, but... */ 1099 /* Should not happen, but... */
1080 completed = RCU_TORTURE_PIPE_LEN; 1100 completed = RCU_TORTURE_PIPE_LEN;
@@ -1094,11 +1114,13 @@ static int
1094rcu_torture_reader(void *arg) 1114rcu_torture_reader(void *arg)
1095{ 1115{
1096 int completed; 1116 int completed;
1117 int completed_end;
1097 int idx; 1118 int idx;
1098 DEFINE_RCU_RANDOM(rand); 1119 DEFINE_RCU_RANDOM(rand);
1099 struct rcu_torture *p; 1120 struct rcu_torture *p;
1100 int pipe_count; 1121 int pipe_count;
1101 struct timer_list t; 1122 struct timer_list t;
1123 unsigned long long ts;
1102 1124
1103 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 1125 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
1104 set_user_nice(current, 19); 1126 set_user_nice(current, 19);
@@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg)
1112 } 1134 }
1113 idx = cur_ops->readlock(); 1135 idx = cur_ops->readlock();
1114 completed = cur_ops->completed(); 1136 completed = cur_ops->completed();
1137 ts = rcu_trace_clock_local();
1115 p = rcu_dereference_check(rcu_torture_current, 1138 p = rcu_dereference_check(rcu_torture_current,
1116 rcu_read_lock_bh_held() || 1139 rcu_read_lock_bh_held() ||
1117 rcu_read_lock_sched_held() || 1140 rcu_read_lock_sched_held() ||
@@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg)
1122 schedule_timeout_interruptible(HZ); 1145 schedule_timeout_interruptible(HZ);
1123 continue; 1146 continue;
1124 } 1147 }
1125 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1126 if (p->rtort_mbtest == 0) 1148 if (p->rtort_mbtest == 0)
1127 atomic_inc(&n_rcu_torture_mberror); 1149 atomic_inc(&n_rcu_torture_mberror);
1128 cur_ops->read_delay(&rand); 1150 cur_ops->read_delay(&rand);
@@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg)
1132 /* Should not happen, but... */ 1154 /* Should not happen, but... */
1133 pipe_count = RCU_TORTURE_PIPE_LEN; 1155 pipe_count = RCU_TORTURE_PIPE_LEN;
1134 } 1156 }
1135 if (pipe_count > 1) 1157 completed_end = cur_ops->completed();
1158 if (pipe_count > 1) {
1159 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1160 ts, completed, completed_end);
1136 rcutorture_trace_dump(); 1161 rcutorture_trace_dump();
1162 }
1137 __this_cpu_inc(rcu_torture_count[pipe_count]); 1163 __this_cpu_inc(rcu_torture_count[pipe_count]);
1138 completed = cur_ops->completed() - completed; 1164 completed = completed_end - completed;
1139 if (completed > RCU_TORTURE_PIPE_LEN) { 1165 if (completed > RCU_TORTURE_PIPE_LEN) {
1140 /* Should not happen, but... */ 1166 /* Should not happen, but... */
1141 completed = RCU_TORTURE_PIPE_LEN; 1167 completed = RCU_TORTURE_PIPE_LEN;
@@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
1301 set_cpus_allowed_ptr(reader_tasks[i], 1327 set_cpus_allowed_ptr(reader_tasks[i],
1302 shuffle_tmp_mask); 1328 shuffle_tmp_mask);
1303 } 1329 }
1304
1305 if (fakewriter_tasks) { 1330 if (fakewriter_tasks) {
1306 for (i = 0; i < nfakewriters; i++) 1331 for (i = 0; i < nfakewriters; i++)
1307 if (fakewriter_tasks[i]) 1332 if (fakewriter_tasks[i])
1308 set_cpus_allowed_ptr(fakewriter_tasks[i], 1333 set_cpus_allowed_ptr(fakewriter_tasks[i],
1309 shuffle_tmp_mask); 1334 shuffle_tmp_mask);
1310 } 1335 }
1311
1312 if (writer_task) 1336 if (writer_task)
1313 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); 1337 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1314
1315 if (stats_task) 1338 if (stats_task)
1316 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); 1339 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1340 if (stutter_task)
1341 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1342 if (fqs_task)
1343 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1344 if (shutdown_task)
1345 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1346#ifdef CONFIG_HOTPLUG_CPU
1347 if (onoff_task)
1348 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1349#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1350 if (stall_task)
1351 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1352 if (barrier_cbs_tasks)
1353 for (i = 0; i < n_barrier_cbs; i++)
1354 if (barrier_cbs_tasks[i])
1355 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1356 shuffle_tmp_mask);
1357 if (barrier_task)
1358 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1317 1359
1318 if (rcu_idle_cpu == -1) 1360 if (rcu_idle_cpu == -1)
1319 rcu_idle_cpu = num_online_cpus() - 1; 1361 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
1749 barrier_cbs_wq = 1791 barrier_cbs_wq =
1750 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), 1792 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1751 GFP_KERNEL); 1793 GFP_KERNEL);
1752 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) 1794 if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
1753 return -ENOMEM; 1795 return -ENOMEM;
1754 for (i = 0; i < n_barrier_cbs; i++) { 1796 for (i = 0; i < n_barrier_cbs; i++) {
1755 init_waitqueue_head(&barrier_cbs_wq[i]); 1797 init_waitqueue_head(&barrier_cbs_wq[i]);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614e..5b8ad827fd86 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
105 * The rcu_scheduler_active variable transitions from zero to one just 105 * The rcu_scheduler_active variable transitions from zero to one just
106 * before the first task is spawned. So when this variable is zero, RCU 106 * before the first task is spawned. So when this variable is zero, RCU
107 * can assume that there is but one task, allowing RCU to (for example) 107 * can assume that there is but one task, allowing RCU to (for example)
108 * optimized synchronize_sched() to a simple barrier(). When this variable 108 * optimize synchronize_sched() to a simple barrier(). When this variable
109 * is one, RCU must actually do all the hard work required to detect real 109 * is one, RCU must actually do all the hard work required to detect real
110 * grace periods. This variable is also used to suppress boot-time false 110 * grace periods. This variable is also used to suppress boot-time false
111 * positives from lockdep-RCU error checking. 111 * positives from lockdep-RCU error checking.
@@ -217,12 +217,6 @@ module_param(blimit, long, 0444);
217module_param(qhimark, long, 0444); 217module_param(qhimark, long, 0444);
218module_param(qlowmark, long, 0444); 218module_param(qlowmark, long, 0444);
219 219
220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
222
223module_param(rcu_cpu_stall_suppress, int, 0644);
224module_param(rcu_cpu_stall_timeout, int, 0644);
225
226static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 220static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
227static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
228 222
@@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305} 299}
306 300
307/* 301/*
308 * Does the current CPU require a yet-as-unscheduled grace period? 302 * Does the current CPU require a not-yet-started grace period?
303 * The caller must have disabled interrupts to prevent races with
304 * normal callback registry.
309 */ 305 */
310static int 306static int
311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 307cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
312{ 308{
313 struct rcu_head **ntp; 309 int i;
314 310
315 ntp = rdp->nxttail[RCU_DONE_TAIL + 311 if (rcu_gp_in_progress(rsp))
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)]; 312 return 0; /* No, a grace period is already in progress. */
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && 313 if (!rdp->nxttail[RCU_NEXT_TAIL])
318 !rcu_gp_in_progress(rsp); 314 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
316 return 1; /* Yes, this CPU has newly registered callbacks. */
317 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
318 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
319 ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
320 rdp->nxtcompleted[i]))
321 return 1; /* Yes, CBs for future grace period. */
322 return 0; /* No grace period needed. */
319} 323}
320 324
321/* 325/*
@@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
336static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 340static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
337 bool user) 341 bool user)
338{ 342{
339 trace_rcu_dyntick("Start", oldval, 0); 343 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
340 if (!user && !is_idle_task(current)) { 344 if (!user && !is_idle_task(current)) {
341 struct task_struct *idle = idle_task(smp_processor_id()); 345 struct task_struct *idle = idle_task(smp_processor_id());
342 346
@@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
727 * interrupt from idle, return true. The caller must have at least 731 * interrupt from idle, return true. The caller must have at least
728 * disabled preemption. 732 * disabled preemption.
729 */ 733 */
730int rcu_is_cpu_rrupt_from_idle(void) 734static int rcu_is_cpu_rrupt_from_idle(void)
731{ 735{
732 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 736 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
733} 737}
@@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
793 return 0; 797 return 0;
794} 798}
795 799
796static int jiffies_till_stall_check(void)
797{
798 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
799
800 /*
801 * Limit check must be consistent with the Kconfig limits
802 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
803 */
804 if (till_stall_check < 3) {
805 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
806 till_stall_check = 3;
807 } else if (till_stall_check > 300) {
808 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
809 till_stall_check = 300;
810 }
811 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
812}
813
814static void record_gp_stall_check_time(struct rcu_state *rsp) 800static void record_gp_stall_check_time(struct rcu_state *rsp)
815{ 801{
816 rsp->gp_start = jiffies; 802 rsp->gp_start = jiffies;
817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 803 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
818} 804}
819 805
820/* 806/*
@@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
857 raw_spin_unlock_irqrestore(&rnp->lock, flags); 843 raw_spin_unlock_irqrestore(&rnp->lock, flags);
858 return; 844 return;
859 } 845 }
860 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; 846 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
861 raw_spin_unlock_irqrestore(&rnp->lock, flags); 847 raw_spin_unlock_irqrestore(&rnp->lock, flags);
862 848
863 /* 849 /*
@@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
935 raw_spin_lock_irqsave(&rnp->lock, flags); 921 raw_spin_lock_irqsave(&rnp->lock, flags);
936 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 922 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
937 rsp->jiffies_stall = jiffies + 923 rsp->jiffies_stall = jiffies +
938 3 * jiffies_till_stall_check() + 3; 924 3 * rcu_jiffies_till_stall_check() + 3;
939 raw_spin_unlock_irqrestore(&rnp->lock, flags); 925 raw_spin_unlock_irqrestore(&rnp->lock, flags);
940 926
941 set_need_resched(); /* kick ourselves to get things going. */ 927 set_need_resched(); /* kick ourselves to get things going. */
@@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
966 } 952 }
967} 953}
968 954
969static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
970{
971 rcu_cpu_stall_suppress = 1;
972 return NOTIFY_DONE;
973}
974
975/** 955/**
976 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 956 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
977 * 957 *
@@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void)
989 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 969 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
990} 970}
991 971
992static struct notifier_block rcu_panic_block = {
993 .notifier_call = rcu_panic,
994};
995
996static void __init check_cpu_stall_init(void)
997{
998 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
999}
1000
1001/* 972/*
1002 * Update CPU-local rcu_data state to record the newly noticed grace period. 973 * Update CPU-local rcu_data state to record the newly noticed grace period.
1003 * This is used both when we started the grace period and when we notice 974 * This is used both when we started the grace period and when we notice
@@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp)
1071} 1042}
1072 1043
1073/* 1044/*
1045 * Determine the value that ->completed will have at the end of the
1046 * next subsequent grace period. This is used to tag callbacks so that
1047 * a CPU can invoke callbacks in a timely fashion even if that CPU has
1048 * been dyntick-idle for an extended period with callbacks under the
1049 * influence of RCU_FAST_NO_HZ.
1050 *
1051 * The caller must hold rnp->lock with interrupts disabled.
1052 */
1053static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1054 struct rcu_node *rnp)
1055{
1056 /*
1057 * If RCU is idle, we just wait for the next grace period.
1058 * But we can only be sure that RCU is idle if we are looking
1059 * at the root rcu_node structure -- otherwise, a new grace
1060 * period might have started, but just not yet gotten around
1061 * to initializing the current non-root rcu_node structure.
1062 */
1063 if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
1064 return rnp->completed + 1;
1065
1066 /*
1067 * Otherwise, wait for a possible partial grace period and
1068 * then the subsequent full grace period.
1069 */
1070 return rnp->completed + 2;
1071}
1072
1073/*
1074 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has
1077 * since proven to be too conservative, which can happen if callbacks get
1078 * assigned a ->completed number while RCU is idle, but with reference to
1079 * a non-root rcu_node structure. This function is idempotent, so it does
1080 * not hurt to call it repeatedly.
1081 *
1082 * The caller must hold rnp->lock with interrupts disabled.
1083 */
1084static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1085 struct rcu_data *rdp)
1086{
1087 unsigned long c;
1088 int i;
1089
1090 /* If the CPU has no callbacks, nothing to do. */
1091 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1092 return;
1093
1094 /*
1095 * Starting from the sublist containing the callbacks most
1096 * recently assigned a ->completed number and working down, find the
1097 * first sublist that is not assignable to an upcoming grace period.
1098 * Such a sublist has something in it (first two tests) and has
1099 * a ->completed number assigned that will complete sooner than
1100 * the ->completed number for newly arrived callbacks (last test).
1101 *
1102 * The key point is that any later sublist can be assigned the
1103 * same ->completed number as the newly arrived callbacks, which
1104 * means that the callbacks in any of these later sublists can be
1105 * grouped into a single sublist, whether or not they have already
1106 * been assigned a ->completed number.
1107 */
1108 c = rcu_cbs_completed(rsp, rnp);
1109 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1110 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1111 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1112 break;
1113
1114 /*
1115 * If there is no sublist left for unassigned callbacks, leave.
1116 * At the same time, advance "i" one sublist, so that "i" indexes
1117 * the sublist into which all the remaining callbacks should be
1118 * grouped.
1119 */
1120 if (++i >= RCU_NEXT_TAIL)
1121 return;
1122
1123 /*
1124 * Assign all subsequent callbacks' ->completed number to the next
1125 * full grace period and group them all in the sublist initially
1126 * indexed by "i".
1127 */
1128 for (; i <= RCU_NEXT_TAIL; i++) {
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c;
1131 }
1132
1133 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1135 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
1136 else
1137 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
1138}
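The sublist bookkeeping above is easier to follow with a toy model. The sketch below is plain userspace C, not kernel code: seg_count[] and seg_tag[] are invented stand-ins for the nxttail[]/nxtcompleted[] pointer machinery, and the wraparound-safe ULONG_CMP_GE() comparison is reduced to a plain <. It walks back from the NEXT slot, stops at the first non-empty sublist whose tag is older than the freshly computed value c, and folds every later sublist under the single tag c, mirroring the grouping step described in the comment.

#include <stdio.h>

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEG };

static int seg_count[NSEG]         = { 0, 2, 1, 3 };    /* callbacks per sublist */
static unsigned long seg_tag[NSEG] = { 0, 5, 6, 0 };    /* assigned ->completed  */

static void toy_accelerate(unsigned long c)
{
        int i, j;

        /* Walk back from NEXT; stop at the first non-empty sublist whose
         * tag completes sooner than c, since it cannot be re-tagged. */
        for (i = SEG_NEXT - 1; i > SEG_DONE; i--)
                if (seg_count[i] && seg_tag[i] < c)
                        break;
        if (++i >= SEG_NEXT)
                return;                 /* nothing can be accelerated */

        /* Fold every later sublist (including NEXT) into slot i under tag c. */
        for (j = i + 1; j < NSEG; j++) {
                seg_count[i] += seg_count[j];
                seg_count[j] = 0;
        }
        for (j = i; j < NSEG; j++)
                seg_tag[j] = c;
}

int main(void)
{
        toy_accelerate(6);      /* rcu_cbs_completed() still reports 6 */
        for (int i = 0; i < NSEG; i++)
                printf("seg %d: %d callbacks, tag %lu\n",
                       i, seg_count[i], seg_tag[i]);
        /* WAIT keeps tag 5; NEXT's 3 callbacks join NEXT_READY under tag 6. */
        return 0;
}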
1139
1140/*
1141 * Move any callbacks whose grace period has completed to the
1142 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1143 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1144 * sublist. This function is idempotent, so it does not hurt to
1145 * invoke it repeatedly. As long as it is not invoked -too- often...
1146 *
1147 * The caller must hold rnp->lock with interrupts disabled.
1148 */
1149static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1150 struct rcu_data *rdp)
1151{
1152 int i, j;
1153
1154 /* If the CPU has no callbacks, nothing to do. */
1155 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1156 return;
1157
1158 /*
1159 * Find all callbacks whose ->completed numbers indicate that they
1160 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1161 */
1162 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
1163 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1164 break;
1165 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1166 }
1167 /* Clean up any sublist tail pointers that were misordered above. */
1168 for (j = RCU_WAIT_TAIL; j < i; j++)
1169 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1170
1171 /* Copy down callbacks to fill in empty sublists. */
1172 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1173 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1174 break;
1175 rdp->nxttail[j] = rdp->nxttail[i];
1176 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1177 }
1178
1179 /* Classify any remaining callbacks. */
1180 rcu_accelerate_cbs(rsp, rnp, rdp);
1181}
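rcu_advance_cbs() is the counterpart, and the same toy representation (again userspace C with invented names, not the kernel's tail-pointer surgery) makes the two passes explicit: promote every sublist whose tag has been reached into DONE, then slide the remaining sublists down so the NEXT slot is left holding only the still-unassigned arrivals.

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEG };

static int seg_count[NSEG];             /* callbacks per sublist */
static unsigned long seg_tag[NSEG];     /* assigned ->completed  */

static void toy_advance(unsigned long rnp_completed)
{
        int i, j;

        /* Pass 1: any sublist tagged <= rnp->completed is now invocable. */
        for (i = SEG_WAIT; i < SEG_NEXT; i++) {
                if (seg_tag[i] > rnp_completed)
                        break;
                seg_count[SEG_DONE] += seg_count[i];
                seg_count[i] = 0;
        }

        /* Pass 2: copy the not-yet-ready sublists down to fill the holes. */
        for (j = SEG_WAIT; i < SEG_NEXT; i++, j++) {
                if (i == j)
                        continue;       /* nothing was promoted, nothing moves */
                seg_count[j] = seg_count[i];
                seg_tag[j] = seg_tag[i];
                seg_count[i] = 0;
        }
        /* The real function finishes by calling rcu_accelerate_cbs() to
         * classify whatever is still unassigned in the NEXT slot. */
}

int main(void)
{
        seg_count[SEG_WAIT] = 2;        seg_tag[SEG_WAIT] = 5;
        seg_count[SEG_NEXT_READY] = 4;  seg_tag[SEG_NEXT_READY] = 6;
        seg_count[SEG_NEXT] = 1;        /* not yet assigned a tag */
        toy_advance(5);                 /* rnp->completed has reached 5 */
        /* Result: DONE holds 2, WAIT holds the old NEXT_READY batch (4, tag 6). */
        return 0;
}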
1182
1183/*
1074 * Advance this CPU's callbacks, but only if the current grace period 1184 * Advance this CPU's callbacks, but only if the current grace period
1075 * has ended. This may be called only from the CPU to whom the rdp 1185 * has ended. This may be called only from the CPU to whom the rdp
1076 * belongs. In addition, the corresponding leaf rcu_node structure's 1186 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1080,12 +1190,15 @@ static void
1080__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1190__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1081{ 1191{
1082 /* Did another grace period end? */ 1192 /* Did another grace period end? */
1083 if (rdp->completed != rnp->completed) { 1193 if (rdp->completed == rnp->completed) {
1084 1194
1085 /* Advance callbacks. No harm if list empty. */ 1195 /* No, so just accelerate recent callbacks. */
1086 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; 1196 rcu_accelerate_cbs(rsp, rnp, rdp);
1087 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; 1197
1088 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1198 } else {
1199
1200 /* Advance callbacks. */
1201 rcu_advance_cbs(rsp, rnp, rdp);
1089 1202
1090 /* Remember that we saw this grace-period completion. */ 1203 /* Remember that we saw this grace-period completion. */
1091 rdp->completed = rnp->completed; 1204 rdp->completed = rnp->completed;
@@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1392 /* 1505 /*
1393 * Because there is no grace period in progress right now, 1506 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied 1507 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be 1508 * by the next grace period. So this is a good place to
1396 * handled after the end of the next grace period. If the 1509 * assign a grace period number to recently posted callbacks.
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */ 1510 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406 1512
1407 rsp->gp_flags = RCU_GP_FLAG_INIT; 1513 rsp->gp_flags = RCU_GP_FLAG_INIT;
1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ 1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
@@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1527 * This GP can't end until cpu checks in, so all of our 1633 * This GP can't end until cpu checks in, so all of our
1528 * callbacks can be processed during the next GP. 1634 * callbacks can be processed during the next GP.
1529 */ 1635 */
1530 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1636 rcu_accelerate_cbs(rsp, rnp, rdp);
1531 1637
1532 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1638 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1533 } 1639 }
@@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1779 long bl, count, count_lazy; 1885 long bl, count, count_lazy;
1780 int i; 1886 int i;
1781 1887
1782 /* If no callbacks are ready, just return.*/ 1888 /* If no callbacks are ready, just return. */
1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1889 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1784 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 1890 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1785 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1891 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2008 2114
2009 WARN_ON_ONCE(rdp->beenonline == 0); 2115 WARN_ON_ONCE(rdp->beenonline == 0);
2010 2116
2011 /* 2117 /* Handle the end of a grace period that some other CPU ended. */
2012 * Advance callbacks in response to end of earlier grace
2013 * period that some other CPU ended.
2014 */
2015 rcu_process_gp_end(rsp, rdp); 2118 rcu_process_gp_end(rsp, rdp);
2016 2119
2017 /* Update RCU state based on any recent quiescent states. */ 2120 /* Update RCU state based on any recent quiescent states. */
2018 rcu_check_quiescent_state(rsp, rdp); 2121 rcu_check_quiescent_state(rsp, rdp);
2019 2122
2020 /* Does this CPU require a not-yet-started grace period? */ 2123 /* Does this CPU require a not-yet-started grace period? */
2124 local_irq_save(flags);
2021 if (cpu_needs_another_gp(rsp, rdp)) { 2125 if (cpu_needs_another_gp(rsp, rdp)) {
2022 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2023 rcu_start_gp(rsp, flags); /* releases above lock */ 2127 rcu_start_gp(rsp, flags); /* releases above lock */
2128 } else {
2129 local_irq_restore(flags);
2024 } 2130 }
2025 2131
2026 /* If there are callbacks ready, invoke them. */ 2132 /* If there are callbacks ready, invoke them. */
@@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2719 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2825 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2720 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2826 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2721 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2827 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2722#ifdef CONFIG_RCU_USER_QS
2723 WARN_ON_ONCE(rdp->dynticks->in_user);
2724#endif
2725 rdp->cpu = cpu; 2828 rdp->cpu = cpu;
2726 rdp->rsp = rsp; 2829 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp); 2830 rcu_boot_init_nocb_percpu_data(rdp);
@@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2938 3041
2939 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 3042 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
2940 3043
3044 /* Silence gcc 4.8 warning about array index out of range. */
3045 if (rcu_num_lvls > RCU_NUM_LVLS)
3046 panic("rcu_init_one: rcu_num_lvls overflow");
3047
2941 /* Initialize the level-tracking arrays. */ 3048 /* Initialize the level-tracking arrays. */
2942 3049
2943 for (i = 0; i < rcu_num_lvls; i++) 3050 for (i = 0; i < rcu_num_lvls; i++)
@@ -3074,7 +3181,6 @@ void __init rcu_init(void)
3074 cpu_notifier(rcu_cpu_notify, 0); 3181 cpu_notifier(rcu_cpu_notify, 0);
3075 for_each_online_cpu(cpu) 3182 for_each_online_cpu(cpu)
3076 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3183 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3077 check_cpu_stall_init();
3078} 3184}
3079 3185
3080#include "rcutree_plugin.h" 3186#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093d..c896b5045d9d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@ struct rcu_dynticks {
102 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
109}; 105};
110 106
111/* RCU's kthread states for tracing. */ 107/* RCU's kthread states for tracing. */
@@ -282,6 +278,8 @@ struct rcu_data {
282 */ 278 */
283 struct rcu_head *nxtlist; 279 struct rcu_head *nxtlist;
284 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 280 struct rcu_head **nxttail[RCU_NEXT_SIZE];
281 unsigned long nxtcompleted[RCU_NEXT_SIZE];
282 /* grace periods for sublists. */
285 long qlen_lazy; /* # of lazy queued callbacks */ 283 long qlen_lazy; /* # of lazy queued callbacks */
286 long qlen; /* # of queued callbacks, incl lazy */ 284 long qlen; /* # of queued callbacks, incl lazy */
287 long qlen_last_fqs_check; 285 long qlen_last_fqs_check;
@@ -343,11 +341,6 @@ struct rcu_data {
343 341
344#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 342#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
345 343
346#ifdef CONFIG_PROVE_RCU
347#define RCU_STALL_DELAY_DELTA (5 * HZ)
348#else
349#define RCU_STALL_DELAY_DELTA 0
350#endif
351#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 344#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
352 /* to take at least one */ 345 /* to take at least one */
353 /* scheduling clock irq */ 346 /* scheduling clock irq */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f6e5ec2932b4..c1cc7e17ff9d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -40,8 +40,7 @@
40#ifdef CONFIG_RCU_NOCB_CPU 40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ 42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
43static bool rcu_nocb_poll; /* Offload kthreads are to poll. */ 43static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
44module_param(rcu_nocb_poll, bool, 0444);
45static char __initdata nocb_buf[NR_CPUS * 5]; 44static char __initdata nocb_buf[NR_CPUS * 5];
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 45#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47 46
@@ -2159,6 +2158,13 @@ static int __init rcu_nocb_setup(char *str)
2159} 2158}
2160__setup("rcu_nocbs=", rcu_nocb_setup); 2159__setup("rcu_nocbs=", rcu_nocb_setup);
2161 2160
2161static int __init parse_rcu_nocb_poll(char *arg)
2162{
2163 rcu_nocb_poll = 1;
2164 return 0;
2165}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167
2162/* Is the specified CPU a no-CPUs CPU? */ 2168/* Is the specified CPU a no-CPUs CPU? */
2163static bool is_nocb_cpu(int cpu) 2169static bool is_nocb_cpu(int cpu)
2164{ 2170{
@@ -2366,10 +2372,11 @@ static int rcu_nocb_kthread(void *arg)
2366 for (;;) { 2372 for (;;) {
2367 /* If not polling, wait for next batch of callbacks. */ 2373 /* If not polling, wait for next batch of callbacks. */
2368 if (!rcu_nocb_poll) 2374 if (!rcu_nocb_poll)
2369 wait_event(rdp->nocb_wq, rdp->nocb_head); 2375 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2370 list = ACCESS_ONCE(rdp->nocb_head); 2376 list = ACCESS_ONCE(rdp->nocb_head);
2371 if (!list) { 2377 if (!list) {
2372 schedule_timeout_interruptible(1); 2378 schedule_timeout_interruptible(1);
2379 flush_signals(current);
2373 continue; 2380 continue;
2374 } 2381 }
2375 2382
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abbd..01ab081ac53a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1139 if (!desc->count) 1139 if (!desc->count)
1140 return 0; 1140 return 0;
1141 1141
1142 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 1142 mutex_lock(&file_inode(filp)->i_mutex);
1143 do { 1143 do {
1144 if (!relay_file_read_avail(buf, *ppos)) 1144 if (!relay_file_read_avail(buf, *ppos))
1145 break; 1145 break;
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1159 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1159 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1160 } 1160 }
1161 } while (desc->count && ret); 1161 } while (desc->count && ret);
1162 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 1162 mutex_unlock(&file_inode(filp)->i_mutex);
1163 1163
1164 return desc->written; 1164 return desc->written;
1165} 1165}
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
17 * See rt.c in preempt-rt for proper credits and further information 17 * See rt.c in preempt-rt for proper credits and further information
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sched/rt.h>
20#include <linux/delay.h> 21#include <linux/delay.h>
21#include <linux/export.h> 22#include <linux/export.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..7890b10084a7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/sched/rt.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14#include <linux/timer.h> 15#include <linux/timer.h>
15#include <linux/freezer.h> 16#include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h>
16#include <linux/timer.h> 17#include <linux/timer.h>
17 18
18#include "rtmutex_common.h" 19#include "rtmutex_common.h"
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 6850f53e02d8..b3c6c3fcd847 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
116 116
117EXPORT_SYMBOL(down_read_nested); 117EXPORT_SYMBOL(down_read_nested);
118 118
119void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
120{
121 might_sleep();
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
125}
126
127EXPORT_SYMBOL(_down_write_nest_lock);
128
119void down_write_nested(struct rw_semaphore *sem, int subclass) 129void down_write_nested(struct rw_semaphore *sem, int subclass)
120{ 130{
121 might_sleep(); 131 might_sleep();
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
35 ag->tg->rt_se = NULL; 35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL; 36 ag->tg->rt_rq = NULL;
37#endif 37#endif
38 sched_offline_group(ag->tg);
38 sched_destroy_group(ag->tg); 39 sched_destroy_group(ag->tg);
39} 40}
40 41
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
76 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
77 goto out_free; 78 goto out_free;
78 79
80 sched_online_group(tg, &root_task_group);
81
79 kref_init(&ag->kref); 82 kref_init(&ag->kref);
80 init_rwsem(&ag->lock); 83 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr); 84 ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6fdcdcbb9b1..7f12624a393c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
83#endif 83#endif
84 84
85#include "sched.h" 85#include "sched.h"
86#include "../workqueue_sched.h" 86#include "../workqueue_internal.h"
87#include "../smpboot.h" 87#include "../smpboot.h"
88 88
89#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
1132 */ 1132 */
1133static int select_fallback_rq(int cpu, struct task_struct *p) 1133static int select_fallback_rq(int cpu, struct task_struct *p)
1134{ 1134{
1135 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1135 int nid = cpu_to_node(cpu);
1136 const struct cpumask *nodemask = NULL;
1136 enum { cpuset, possible, fail } state = cpuset; 1137 enum { cpuset, possible, fail } state = cpuset;
1137 int dest_cpu; 1138 int dest_cpu;
1138 1139
1139 /* Look for allowed, online CPU in same node. */ 1140 /*
1140 for_each_cpu(dest_cpu, nodemask) { 1141 * If the node that the cpu is on has been offlined, cpu_to_node()
1141 if (!cpu_online(dest_cpu)) 1142 * will return -1. There is no cpu on the node, and we should
1142 continue; 1143 * select the cpu on the other node.
1143 if (!cpu_active(dest_cpu)) 1144 */
1144 continue; 1145 if (nid != -1) {
1145 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1146 nodemask = cpumask_of_node(nid);
1146 return dest_cpu; 1147
1148 /* Look for allowed, online CPU in same node. */
1149 for_each_cpu(dest_cpu, nodemask) {
1150 if (!cpu_online(dest_cpu))
1151 continue;
1152 if (!cpu_active(dest_cpu))
1153 continue;
1154 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1155 return dest_cpu;
1156 }
1147 } 1157 }
1148 1158
1149 for (;;) { 1159 for (;;) {
@@ -1523,7 +1533,8 @@ out:
1523 */ 1533 */
1524int wake_up_process(struct task_struct *p) 1534int wake_up_process(struct task_struct *p)
1525{ 1535{
1526 return try_to_wake_up(p, TASK_ALL, 0); 1536 WARN_ON(task_is_stopped_or_traced(p));
1537 return try_to_wake_up(p, TASK_NORMAL, 0);
1527} 1538}
1528EXPORT_SYMBOL(wake_up_process); 1539EXPORT_SYMBOL(wake_up_process);
1529 1540
@@ -1741,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1741static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1752static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1742{ 1753{
1743 struct preempt_notifier *notifier; 1754 struct preempt_notifier *notifier;
1744 struct hlist_node *node;
1745 1755
1746 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1756 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1747 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1757 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1748} 1758}
1749 1759
@@ -1752,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
1752 struct task_struct *next) 1762 struct task_struct *next)
1753{ 1763{
1754 struct preempt_notifier *notifier; 1764 struct preempt_notifier *notifier;
1755 struct hlist_node *node;
1756 1765
1757 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1766 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1758 notifier->ops->sched_out(notifier, next); 1767 notifier->ops->sched_out(notifier, next);
1759} 1768}
1760 1769
@@ -1968,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
1968} 1977}
1969 1978
1970/* 1979/*
1971 * nr_running, nr_uninterruptible and nr_context_switches: 1980 * nr_running and nr_context_switches:
1972 * 1981 *
1973 * externally visible scheduler statistics: current number of runnable 1982 * externally visible scheduler statistics: current number of runnable
1974 * threads, current number of uninterruptible-sleeping threads, total 1983 * threads, total number of context switches performed since bootup.
1975 * number of context switches performed since bootup.
1976 */ 1984 */
1977unsigned long nr_running(void) 1985unsigned long nr_running(void)
1978{ 1986{
@@ -1984,23 +1992,6 @@ unsigned long nr_running(void)
1984 return sum; 1992 return sum;
1985} 1993}
1986 1994
1987unsigned long nr_uninterruptible(void)
1988{
1989 unsigned long i, sum = 0;
1990
1991 for_each_possible_cpu(i)
1992 sum += cpu_rq(i)->nr_uninterruptible;
1993
1994 /*
1995 * Since we read the counters lockless, it might be slightly
1996 * inaccurate. Do not allow it to go below zero though:
1997 */
1998 if (unlikely((long)sum < 0))
1999 sum = 0;
2000
2001 return sum;
2002}
2003
2004unsigned long long nr_context_switches(void) 1995unsigned long long nr_context_switches(void)
2005{ 1996{
2006 int i; 1997 int i;
@@ -2785,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
2785 if (irqs_disabled()) 2776 if (irqs_disabled())
2786 print_irqtrace_events(prev); 2777 print_irqtrace_events(prev);
2787 dump_stack(); 2778 dump_stack();
2788 add_taint(TAINT_WARN); 2779 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2789} 2780}
2790 2781
2791/* 2782/*
@@ -4410,20 +4401,32 @@ EXPORT_SYMBOL(yield);
4410 * It's the caller's job to ensure that the target task struct 4401 * It's the caller's job to ensure that the target task struct
4411 * can't go away on us before we can do any checks. 4402 * can't go away on us before we can do any checks.
4412 * 4403 *
4413 * Returns true if we indeed boosted the target task. 4404 * Returns:
4405 * true (>0) if we indeed boosted the target task.
4406 * false (0) if we failed to boost the target.
4407 * -ESRCH if there's no task to yield to.
4414 */ 4408 */
4415bool __sched yield_to(struct task_struct *p, bool preempt) 4409bool __sched yield_to(struct task_struct *p, bool preempt)
4416{ 4410{
4417 struct task_struct *curr = current; 4411 struct task_struct *curr = current;
4418 struct rq *rq, *p_rq; 4412 struct rq *rq, *p_rq;
4419 unsigned long flags; 4413 unsigned long flags;
4420 bool yielded = 0; 4414 int yielded = 0;
4421 4415
4422 local_irq_save(flags); 4416 local_irq_save(flags);
4423 rq = this_rq(); 4417 rq = this_rq();
4424 4418
4425again: 4419again:
4426 p_rq = task_rq(p); 4420 p_rq = task_rq(p);
4421 /*
4422 * If we're the only runnable task on the rq and target rq also
4423 * has only one task, there's absolutely no point in yielding.
4424 */
4425 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4426 yielded = -ESRCH;
4427 goto out_irq;
4428 }
4429
4427 double_rq_lock(rq, p_rq); 4430 double_rq_lock(rq, p_rq);
4428 while (task_rq(p) != p_rq) { 4431 while (task_rq(p) != p_rq) {
4429 double_rq_unlock(rq, p_rq); 4432 double_rq_unlock(rq, p_rq);
@@ -4431,13 +4434,13 @@ again:
4431 } 4434 }
4432 4435
4433 if (!curr->sched_class->yield_to_task) 4436 if (!curr->sched_class->yield_to_task)
4434 goto out; 4437 goto out_unlock;
4435 4438
4436 if (curr->sched_class != p->sched_class) 4439 if (curr->sched_class != p->sched_class)
4437 goto out; 4440 goto out_unlock;
4438 4441
4439 if (task_running(p_rq, p) || p->state) 4442 if (task_running(p_rq, p) || p->state)
4440 goto out; 4443 goto out_unlock;
4441 4444
4442 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4445 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4443 if (yielded) { 4446 if (yielded) {
@@ -4450,11 +4453,12 @@ again:
4450 resched_task(p_rq->curr); 4453 resched_task(p_rq->curr);
4451 } 4454 }
4452 4455
4453out: 4456out_unlock:
4454 double_rq_unlock(rq, p_rq); 4457 double_rq_unlock(rq, p_rq);
4458out_irq:
4455 local_irq_restore(flags); 4459 local_irq_restore(flags);
4456 4460
4457 if (yielded) 4461 if (yielded > 0)
4458 schedule(); 4462 schedule();
4459 4463
4460 return yielded; 4464 return yielded;
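With the tri-state result documented above, callers have to tell "boosted", "refused", and "nothing to yield to" apart. The fragment below is a hypothetical consumer, not part of this patch, and note one caveat: the declared return type in the context line above is still bool, so the -ESRCH value only reaches the caller intact once the prototype is widened to int; treat this as the intended contract rather than guaranteed behaviour in this exact tree.

        int ret = yield_to(target, false);      /* "target" is whatever task the caller picked */

        if (ret > 0) {
                /* Target was boosted; yield_to() already went through schedule(). */
        } else if (ret == -ESRCH) {
                /* Both runqueues had a single task: yielding is pointless,
                 * so stop retrying on this CPU pair. */
        } else {
                /* Boost refused: wrong sched class, target already running, ... */
        }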
@@ -4713,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4713 */ 4717 */
4714 idle->sched_class = &idle_sched_class; 4718 idle->sched_class = &idle_sched_class;
4715 ftrace_graph_init_idle_task(idle, cpu); 4719 ftrace_graph_init_idle_task(idle, cpu);
4720 vtime_init_idle(idle);
4716#if defined(CONFIG_SMP) 4721#if defined(CONFIG_SMP)
4717 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4722 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4718#endif 4723#endif
@@ -7206,7 +7211,6 @@ static void free_sched_group(struct task_group *tg)
7206struct task_group *sched_create_group(struct task_group *parent) 7211struct task_group *sched_create_group(struct task_group *parent)
7207{ 7212{
7208 struct task_group *tg; 7213 struct task_group *tg;
7209 unsigned long flags;
7210 7214
7211 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7215 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7212 if (!tg) 7216 if (!tg)
@@ -7218,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent)
7218 if (!alloc_rt_sched_group(tg, parent)) 7222 if (!alloc_rt_sched_group(tg, parent))
7219 goto err; 7223 goto err;
7220 7224
7225 return tg;
7226
7227err:
7228 free_sched_group(tg);
7229 return ERR_PTR(-ENOMEM);
7230}
7231
7232void sched_online_group(struct task_group *tg, struct task_group *parent)
7233{
7234 unsigned long flags;
7235
7221 spin_lock_irqsave(&task_group_lock, flags); 7236 spin_lock_irqsave(&task_group_lock, flags);
7222 list_add_rcu(&tg->list, &task_groups); 7237 list_add_rcu(&tg->list, &task_groups);
7223 7238
@@ -7227,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
7227 INIT_LIST_HEAD(&tg->children); 7242 INIT_LIST_HEAD(&tg->children);
7228 list_add_rcu(&tg->siblings, &parent->children); 7243 list_add_rcu(&tg->siblings, &parent->children);
7229 spin_unlock_irqrestore(&task_group_lock, flags); 7244 spin_unlock_irqrestore(&task_group_lock, flags);
7230
7231 return tg;
7232
7233err:
7234 free_sched_group(tg);
7235 return ERR_PTR(-ENOMEM);
7236} 7245}
7237 7246
7238/* rcu callback to free various structures associated with a task group */ 7247/* rcu callback to free various structures associated with a task group */
@@ -7245,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7245/* Destroy runqueue etc associated with a task group */ 7254/* Destroy runqueue etc associated with a task group */
7246void sched_destroy_group(struct task_group *tg) 7255void sched_destroy_group(struct task_group *tg)
7247{ 7256{
7257 /* wait for possible concurrent references to cfs_rqs complete */
7258 call_rcu(&tg->rcu, free_sched_group_rcu);
7259}
7260
7261void sched_offline_group(struct task_group *tg)
7262{
7248 unsigned long flags; 7263 unsigned long flags;
7249 int i; 7264 int i;
7250 7265
@@ -7256,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg)
7256 list_del_rcu(&tg->list); 7271 list_del_rcu(&tg->list);
7257 list_del_rcu(&tg->siblings); 7272 list_del_rcu(&tg->siblings);
7258 spin_unlock_irqrestore(&task_group_lock, flags); 7273 spin_unlock_irqrestore(&task_group_lock, flags);
7259
7260 /* wait for possible concurrent references to cfs_rqs complete */
7261 call_rcu(&tg->rcu, free_sched_group_rcu);
7262} 7274}
7263 7275
7264/* change task's runqueue when it moves between groups. 7276/* change task's runqueue when it moves between groups.
@@ -7554,6 +7566,25 @@ static int sched_rt_global_constraints(void)
7554} 7566}
7555#endif /* CONFIG_RT_GROUP_SCHED */ 7567#endif /* CONFIG_RT_GROUP_SCHED */
7556 7568
7569int sched_rr_handler(struct ctl_table *table, int write,
7570 void __user *buffer, size_t *lenp,
7571 loff_t *ppos)
7572{
7573 int ret;
7574 static DEFINE_MUTEX(mutex);
7575
7576 mutex_lock(&mutex);
7577 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7578 /* make sure that internally we keep jiffies */
7579 /* also, writing zero resets timeslice to default */
7580 if (!ret && write) {
7581 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7582 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7583 }
7584 mutex_unlock(&mutex);
7585 return ret;
7586}
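The unit handling deserves a concrete illustration: the sysctl is meant to be written in milliseconds, proc_dointvec() stores the value as written, and the handler then rewrites it in jiffies, with zero or a negative value falling back to the built-in default. A minimal sketch of that conversion (userspace C; the HZ=1000 and 100 ms RR_TIMESLICE values are assumptions for the example):

#include <stdio.h>

#define HZ 1000                         /* assumed for the example */
#define RR_TIMESLICE (100 * HZ / 1000)  /* assumed default: 100 ms */

static int msecs_to_jiffies_toy(int ms) /* stand-in for msecs_to_jiffies() */
{
        return ms * HZ / 1000;
}

int main(void)
{
        int written[] = { 30, 0, -5 };  /* values a user might write */

        for (int i = 0; i < 3; i++) {
                int ms = written[i];
                int slice = ms <= 0 ? RR_TIMESLICE : msecs_to_jiffies_toy(ms);

                printf("write %4d -> sched_rr_timeslice = %d jiffies\n", ms, slice);
        }
        return 0;
}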
7587
7557int sched_rt_handler(struct ctl_table *table, int write, 7588int sched_rt_handler(struct ctl_table *table, int write,
7558 void __user *buffer, size_t *lenp, 7589 void __user *buffer, size_t *lenp,
7559 loff_t *ppos) 7590 loff_t *ppos)
@@ -7610,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7610 return &tg->css; 7641 return &tg->css;
7611} 7642}
7612 7643
7644static int cpu_cgroup_css_online(struct cgroup *cgrp)
7645{
7646 struct task_group *tg = cgroup_tg(cgrp);
7647 struct task_group *parent;
7648
7649 if (!cgrp->parent)
7650 return 0;
7651
7652 parent = cgroup_tg(cgrp->parent);
7653 sched_online_group(tg, parent);
7654 return 0;
7655}
7656
7613static void cpu_cgroup_css_free(struct cgroup *cgrp) 7657static void cpu_cgroup_css_free(struct cgroup *cgrp)
7614{ 7658{
7615 struct task_group *tg = cgroup_tg(cgrp); 7659 struct task_group *tg = cgroup_tg(cgrp);
@@ -7617,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
7617 sched_destroy_group(tg); 7661 sched_destroy_group(tg);
7618} 7662}
7619 7663
7664static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7665{
7666 struct task_group *tg = cgroup_tg(cgrp);
7667
7668 sched_offline_group(tg);
7669}
7670
7620static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7671static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7621 struct cgroup_taskset *tset) 7672 struct cgroup_taskset *tset)
7622{ 7673{
@@ -7972,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7972 .name = "cpu", 8023 .name = "cpu",
7973 .css_alloc = cpu_cgroup_css_alloc, 8024 .css_alloc = cpu_cgroup_css_alloc,
7974 .css_free = cpu_cgroup_css_free, 8025 .css_free = cpu_cgroup_css_free,
8026 .css_online = cpu_cgroup_css_online,
8027 .css_offline = cpu_cgroup_css_offline,
7975 .can_attach = cpu_cgroup_can_attach, 8028 .can_attach = cpu_cgroup_can_attach,
7976 .attach = cpu_cgroup_attach, 8029 .attach = cpu_cgroup_attach,
7977 .exit = cpu_cgroup_exit, 8030 .exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
31#include "cpupri.h" 33#include "cpupri.h"
32 34
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 35/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..ed12cbb135f4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
3#include <linux/tsacct_kern.h> 3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h> 4#include <linux/kernel_stat.h>
5#include <linux/static_key.h> 5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
6#include "sched.h" 7#include "sched.h"
7 8
8 9
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
163 task_group_account_field(p, index, (__force u64) cputime); 164 task_group_account_field(p, index, (__force u64) cputime);
164 165
165 /* Account for user time used */ 166 /* Account for user time used */
166 acct_update_integrals(p); 167 acct_account_cputime(p);
167} 168}
168 169
169/* 170/*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
213 task_group_account_field(p, index, (__force u64) cputime); 214 task_group_account_field(p, index, (__force u64) cputime);
214 215
215 /* Account for system time used */ 216 /* Account for system time used */
216 acct_update_integrals(p); 217 acct_account_cputime(p);
217} 218}
218 219
219/* 220/*
@@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 296void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{ 297{
297 struct signal_struct *sig = tsk->signal; 298 struct signal_struct *sig = tsk->signal;
299 cputime_t utime, stime;
298 struct task_struct *t; 300 struct task_struct *t;
299 301
300 times->utime = sig->utime; 302 times->utime = sig->utime;
@@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
308 310
309 t = tsk; 311 t = tsk;
310 do { 312 do {
311 times->utime += t->utime; 313 task_cputime(tsk, &utime, &stime);
312 times->stime += t->stime; 314 times->utime += utime;
315 times->stime += stime;
313 times->sum_exec_runtime += task_sched_runtime(t); 316 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t); 317 } while_each_thread(tsk, t);
315out: 318out:
316 rcu_read_unlock(); 319 rcu_read_unlock();
317} 320}
318 321
319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
320
321#ifdef CONFIG_IRQ_TIME_ACCOUNTING 322#ifdef CONFIG_IRQ_TIME_ACCOUNTING
322/* 323/*
323 * Account a tick to a process and cpustat 324 * Account a tick to a process and cpustat
@@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
382 irqtime_account_process_tick(current, 0, rq); 383 irqtime_account_process_tick(current, 0, rq);
383} 384}
384#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 385#else /* CONFIG_IRQ_TIME_ACCOUNTING */
385static void irqtime_account_idle_ticks(int ticks) {} 386static inline void irqtime_account_idle_ticks(int ticks) {}
386static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 387static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
387 struct rq *rq) {} 388 struct rq *rq) {}
388#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389 390
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
390/* 392/*
391 * Account a single tick of cpu time. 393 * Account a single tick of cpu time.
392 * @p: the process that the cpu time gets accounted to 394 * @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
398 struct rq *rq = this_rq(); 400 struct rq *rq = this_rq();
399 401
402 if (vtime_accounting_enabled())
403 return;
404
400 if (sched_clock_irqtime) { 405 if (sched_clock_irqtime) {
401 irqtime_account_process_tick(p, user_tick, rq); 406 irqtime_account_process_tick(p, user_tick, rq);
402 return; 407 return;
@@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)
438 443
439 account_idle_time(jiffies_to_cputime(ticks)); 444 account_idle_time(jiffies_to_cputime(ticks));
440} 445}
441 446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
442#endif
443 447
444/* 448/*
445 * Use precise platform statistics if available: 449 * Use precise platform statistics if available:
@@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
461 *st = cputime.stime; 465 *st = cputime.stime;
462} 466}
463 467
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev) 469void vtime_task_switch(struct task_struct *prev)
476{ 470{
471 if (!vtime_accounting_enabled())
472 return;
473
477 if (is_idle_task(prev)) 474 if (is_idle_task(prev))
478 vtime_account_idle(prev); 475 vtime_account_idle(prev);
479 else 476 else
480 vtime_account_system(prev); 477 vtime_account_system(prev);
481 478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
482 vtime_account_user(prev); 480 vtime_account_user(prev);
481#endif
483 arch_vtime_task_switch(prev); 482 arch_vtime_task_switch(prev);
484} 483}
485#endif 484#endif
@@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)
493 * vtime_account(). 492 * vtime_account().
494 */ 493 */
495#ifndef __ARCH_HAS_VTIME_ACCOUNT 494#ifndef __ARCH_HAS_VTIME_ACCOUNT
496void vtime_account(struct task_struct *tsk) 495void vtime_account_irq_enter(struct task_struct *tsk)
497{ 496{
498 if (in_interrupt() || !is_idle_task(tsk)) 497 if (!vtime_accounting_enabled())
499 vtime_account_system(tsk); 498 return;
500 else 499
501 vtime_account_idle(tsk); 500 if (!in_interrupt()) {
501 /*
502 * If we interrupted user, context_tracking_in_user()
503 * is 1 because the context tracking code doesn't hook
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 }
511
512 if (is_idle_task(tsk)) {
513 vtime_account_idle(tsk);
514 return;
515 }
516 }
517 vtime_account_system(tsk);
502} 518}
503EXPORT_SYMBOL_GPL(vtime_account); 519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
504#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
505 521
506#else 522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
507
508#ifndef nsecs_to_cputime
509# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
510#endif
511 523
512static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) 524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
513{ 525{
514 u64 temp = (__force u64) rtime; 526 u64 temp = (__force u64) rtime;
515 527
516 temp *= (__force u64) utime; 528 temp *= (__force u64) stime;
517 529
518 if (sizeof(cputime_t) == 4) 530 if (sizeof(cputime_t) == 4)
519 temp = div_u64(temp, (__force u32) total); 531 temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev, 543 struct cputime *prev,
532 cputime_t *ut, cputime_t *st) 544 cputime_t *ut, cputime_t *st)
533{ 545{
534 cputime_t rtime, utime, total; 546 cputime_t rtime, stime, total;
535 547
536 utime = curr->utime; 548 stime = curr->stime;
537 total = utime + curr->stime; 549 total = stime + curr->utime;
538 550
539 /* 551 /*
540 * Tick based cputime accounting depends on random scheduling 552
@@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,
549 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 561 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
550 562
551 if (total) 563 if (total)
552 utime = scale_utime(utime, rtime, total); 564 stime = scale_stime(stime, rtime, total);
553 else 565 else
554 utime = rtime; 566 stime = rtime;
555 567
556 /* 568 /*
557 * If the tick based count grows faster than the scheduler one, 569 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward. 570 * the result of the scaling may go backward.
559 * Let's enforce monotonicity. 571 * Let's enforce monotonicity.
560 */ 572 */
561 prev->utime = max(prev->utime, utime); 573 prev->stime = max(prev->stime, stime);
562 prev->stime = max(prev->stime, rtime - prev->utime); 574 prev->utime = max(prev->utime, rtime - prev->stime);
563 575
564 *ut = prev->utime; 576 *ut = prev->utime;
565 *st = prev->stime; 577 *st = prev->stime;
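The scaling and monotonicity rules above are worth a worked example. The sketch below is userspace C with cputime_t reduced to unsigned long long (jiffy granularity and wraparound ignored): system time is scaled as stime * rtime / total, user time becomes the remainder, and the prev-> fields are clamped so that repeated calls can never report a value going backwards.

#include <stdio.h>

typedef unsigned long long cputime_t;

struct prev_cputime { cputime_t utime, stime; };

static cputime_t max_c(cputime_t a, cputime_t b) { return a > b ? a : b; }

/* Same arithmetic as cputime_adjust(): scale the tick-sampled stime by
 * the precise runtime, then enforce monotonicity against prev. */
static void toy_cputime_adjust(cputime_t utime, cputime_t stime, cputime_t rtime,
                               struct prev_cputime *prev,
                               cputime_t *ut, cputime_t *st)
{
        cputime_t total = stime + utime;

        if (total)
                stime = stime * rtime / total;  /* scale_stime() */
        else
                stime = rtime;

        prev->stime = max_c(prev->stime, stime);
        prev->utime = max_c(prev->utime, rtime - prev->stime);
        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct prev_cputime prev = { 0, 0 };
        cputime_t ut, st;

        /* Ticks sampled 30 user + 10 system, but the scheduler measured 60
         * units of runtime: stime scales to 10 * 60 / 40 = 15, utime = 45. */
        toy_cputime_adjust(30, 10, 60, &prev, &ut, &st);
        printf("ut=%llu st=%llu\n", ut, st);
        return 0;
}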
@@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 580void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{ 581{
570 struct task_cputime cputime = { 582 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime, 583 .sum_exec_runtime = p->se.sum_exec_runtime,
574 }; 584 };
575 585
586 task_cputime(p, &cputime.utime, &cputime.stime);
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 587 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
577} 588}
578 589
@@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
586 thread_group_cputime(p, &cputime); 597 thread_group_cputime(p, &cputime);
587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
588} 599}
589#endif 600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
601
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk)
604{
605 unsigned long long clock;
606
607 clock = local_clock();
608 if (clock < tsk->vtime_snap)
609 return 0;
610
611 return clock - tsk->vtime_snap;
612}
613
614static cputime_t get_vtime_delta(struct task_struct *tsk)
615{
616 unsigned long long delta = vtime_delta(tsk);
617
618 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
619 tsk->vtime_snap += delta;
620
621 /* CHECKME: always safe to convert nsecs to cputime? */
622 return nsecs_to_cputime(delta);
623}
624
625static void __vtime_account_system(struct task_struct *tsk)
626{
627 cputime_t delta_cpu = get_vtime_delta(tsk);
628
629 account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
630}
631
632void vtime_account_system(struct task_struct *tsk)
633{
634 if (!vtime_accounting_enabled())
635 return;
636
637 write_seqlock(&tsk->vtime_seqlock);
638 __vtime_account_system(tsk);
639 write_sequnlock(&tsk->vtime_seqlock);
640}
641
642void vtime_account_irq_exit(struct task_struct *tsk)
643{
644 if (!vtime_accounting_enabled())
645 return;
646
647 write_seqlock(&tsk->vtime_seqlock);
648 if (context_tracking_in_user())
649 tsk->vtime_snap_whence = VTIME_USER;
650 __vtime_account_system(tsk);
651 write_sequnlock(&tsk->vtime_seqlock);
652}
653
654void vtime_account_user(struct task_struct *tsk)
655{
656 cputime_t delta_cpu;
657
658 if (!vtime_accounting_enabled())
659 return;
660
661 delta_cpu = get_vtime_delta(tsk);
662
663 write_seqlock(&tsk->vtime_seqlock);
664 tsk->vtime_snap_whence = VTIME_SYS;
665 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
666 write_sequnlock(&tsk->vtime_seqlock);
667}
668
669void vtime_user_enter(struct task_struct *tsk)
670{
671 if (!vtime_accounting_enabled())
672 return;
673
674 write_seqlock(&tsk->vtime_seqlock);
675 tsk->vtime_snap_whence = VTIME_USER;
676 __vtime_account_system(tsk);
677 write_sequnlock(&tsk->vtime_seqlock);
678}
679
680void vtime_guest_enter(struct task_struct *tsk)
681{
682 write_seqlock(&tsk->vtime_seqlock);
683 __vtime_account_system(tsk);
684 current->flags |= PF_VCPU;
685 write_sequnlock(&tsk->vtime_seqlock);
686}
687
688void vtime_guest_exit(struct task_struct *tsk)
689{
690 write_seqlock(&tsk->vtime_seqlock);
691 __vtime_account_system(tsk);
692 current->flags &= ~PF_VCPU;
693 write_sequnlock(&tsk->vtime_seqlock);
694}
695
696void vtime_account_idle(struct task_struct *tsk)
697{
698 cputime_t delta_cpu = get_vtime_delta(tsk);
699
700 account_idle_time(delta_cpu);
701}
702
703bool vtime_accounting_enabled(void)
704{
705 return context_tracking_active();
706}
707
708void arch_vtime_task_switch(struct task_struct *prev)
709{
710 write_seqlock(&prev->vtime_seqlock);
711 prev->vtime_snap_whence = VTIME_SLEEPING;
712 write_sequnlock(&prev->vtime_seqlock);
713
714 write_seqlock(&current->vtime_seqlock);
715 current->vtime_snap_whence = VTIME_SYS;
716 current->vtime_snap = sched_clock();
717 write_sequnlock(&current->vtime_seqlock);
718}
719
720void vtime_init_idle(struct task_struct *t)
721{
722 unsigned long flags;
723
724 write_seqlock_irqsave(&t->vtime_seqlock, flags);
725 t->vtime_snap_whence = VTIME_SYS;
726 t->vtime_snap = sched_clock();
727 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
728}
729
730cputime_t task_gtime(struct task_struct *t)
731{
732 unsigned int seq;
733 cputime_t gtime;
734
735 do {
736 seq = read_seqbegin(&t->vtime_seqlock);
737
738 gtime = t->gtime;
739 if (t->flags & PF_VCPU)
740 gtime += vtime_delta(t);
741
742 } while (read_seqretry(&t->vtime_seqlock, seq));
743
744 return gtime;
745}
746
747/*
748 * Fetch cputime raw values from fields of task_struct and
749 * add up the pending nohz execution time since the last
750 * cputime snapshot.
751 */
752static void
753fetch_task_cputime(struct task_struct *t,
754 cputime_t *u_dst, cputime_t *s_dst,
755 cputime_t *u_src, cputime_t *s_src,
756 cputime_t *udelta, cputime_t *sdelta)
757{
758 unsigned int seq;
759 unsigned long long delta;
760
761 do {
762 *udelta = 0;
763 *sdelta = 0;
764
765 seq = read_seqbegin(&t->vtime_seqlock);
766
767 if (u_dst)
768 *u_dst = *u_src;
769 if (s_dst)
770 *s_dst = *s_src;
771
772 /* Task is sleeping, nothing to add */
773 if (t->vtime_snap_whence == VTIME_SLEEPING ||
774 is_idle_task(t))
775 continue;
776
777 delta = vtime_delta(t);
778
779 /*
780 * Task runs either in user or kernel space; add the pending nohz
781 * time to the right place.
782 */
783 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
784 *udelta = delta;
785 } else {
786 if (t->vtime_snap_whence == VTIME_SYS)
787 *sdelta = delta;
788 }
789 } while (read_seqretry(&t->vtime_seqlock, seq));
790}
791
792
793void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
794{
795 cputime_t udelta, sdelta;
796
797 fetch_task_cputime(t, utime, stime, &t->utime,
798 &t->stime, &udelta, &sdelta);
799 if (utime)
800 *utime += udelta;
801 if (stime)
802 *stime += sdelta;
803}
804
805void task_cputime_scaled(struct task_struct *t,
806 cputime_t *utimescaled, cputime_t *stimescaled)
807{
808 cputime_t udelta, sdelta;
809
810 fetch_task_cputime(t, utimescaled, stimescaled,
811 &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
812 if (utimescaled)
813 *utimescaled += cputime_to_scaled(udelta);
814 if (stimescaled)
815 *stimescaled += cputime_to_scaled(sdelta);
816}
817#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
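The point of the seqlock machinery above is on the reader side: with full-dynticks accounting, peeking at p->utime and p->stime directly would miss whatever has accumulated since the last snapshot, so readers are expected to go through the accessor. A minimal consumer sketch follows; it is not part of this diff, report_task_times() is an invented helper, and it assumes the caller already holds a reference on the task.

static void report_task_times(struct task_struct *p)
{
        cputime_t utime, stime;

        /* task_cputime() folds in the pending nohz delta under
         * p->vtime_seqlock, retrying if a writer raced with us. */
        task_cputime(p, &utime, &stime);
        pr_info("%s: utime=%llu stime=%llu\n", p->comm,
                (unsigned long long)utime, (unsigned long long)stime);
}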
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2cd3c1b4e582..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
110 if (autogroup_path(tg, group_path, PATH_MAX)) 110 if (autogroup_path(tg, group_path, PATH_MAX))
111 return group_path; 111 return group_path;
112 112
113 /*
114 * May be NULL if the underlying cgroup isn't fully-created yet
115 */
116 if (!tg->css.cgroup) {
117 group_path[0] = '\0';
118 return group_path;
119 }
120 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 113 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
121 return group_path; 114 return group_path;
122} 115}
@@ -222,8 +215,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
222 cfs_rq->runnable_load_avg); 215 cfs_rq->runnable_load_avg);
223 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", 216 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
224 cfs_rq->blocked_load_avg); 217 cfs_rq->blocked_load_avg);
225 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", 218 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
226 atomic64_read(&cfs_rq->tg->load_avg)); 219 (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
227 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", 220 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
228 cfs_rq->tg_load_contrib); 221 cfs_rq->tg_load_contrib);
229 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", 222 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
269 { 262 {
270 unsigned int freq = cpu_khz ? : 1; 263 unsigned int freq = cpu_khz ? : 1;
271 264
272 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", 265 SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
273 cpu, freq / 1000, (freq % 1000)); 266 cpu, freq / 1000, (freq % 1000));
274 } 267 }
275#else 268#else
276 SEQ_printf(m, "\ncpu#%d\n", cpu); 269 SEQ_printf(m, "cpu#%d\n", cpu);
277#endif 270#endif
278 271
279#define P(x) \ 272#define P(x) \
@@ -330,6 +323,7 @@ do { \
330 print_rq(m, rq, cpu); 323 print_rq(m, rq, cpu);
331 rcu_read_unlock(); 324 rcu_read_unlock();
332 spin_unlock_irqrestore(&sched_debug_lock, flags); 325 spin_unlock_irqrestore(&sched_debug_lock, flags);
326 SEQ_printf(m, "\n");
333} 327}
334 328
335static const char *sched_tunable_scaling_names[] = { 329static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
338 "linear" 332 "linear"
339}; 333};
340 334
341static int sched_debug_show(struct seq_file *m, void *v) 335static void sched_debug_header(struct seq_file *m)
342{ 336{
343 u64 ktime, sched_clk, cpu_clk; 337 u64 ktime, sched_clk, cpu_clk;
344 unsigned long flags; 338 unsigned long flags;
345 int cpu;
346 339
347 local_irq_save(flags); 340 local_irq_save(flags);
348 ktime = ktime_to_ns(ktime_get()); 341 ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
384#undef PN 377#undef PN
385#undef P 378#undef P
386 379
387 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", 380 SEQ_printf(m, " .%-40s: %d (%s)\n",
381 "sysctl_sched_tunable_scaling",
388 sysctl_sched_tunable_scaling, 382 sysctl_sched_tunable_scaling,
389 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); 383 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
384 SEQ_printf(m, "\n");
385}
390 386
391 for_each_online_cpu(cpu) 387static int sched_debug_show(struct seq_file *m, void *v)
392 print_cpu(m, cpu); 388{
389 int cpu = (unsigned long)(v - 2);
393 390
394 SEQ_printf(m, "\n"); 391 if (cpu != -1)
392 print_cpu(m, cpu);
393 else
394 sched_debug_header(m);
395 395
396 return 0; 396 return 0;
397} 397}
398 398
399void sysrq_sched_debug_show(void) 399void sysrq_sched_debug_show(void)
400{ 400{
401 sched_debug_show(NULL, NULL); 401 int cpu;
402
403 sched_debug_header(NULL);
404 for_each_online_cpu(cpu)
405 print_cpu(NULL, cpu);
406
407}
408
409/*
410 * This iterator needs some explanation.
411 * It returns 1 for the header position.
412 * This means 2 is cpu 0.
413 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
414 * to use cpumask_* to iterate over the cpus.
415 */
416static void *sched_debug_start(struct seq_file *file, loff_t *offset)
417{
418 unsigned long n = *offset;
419
420 if (n == 0)
421 return (void *) 1;
422
423 n--;
424
425 if (n > 0)
426 n = cpumask_next(n - 1, cpu_online_mask);
427 else
428 n = cpumask_first(cpu_online_mask);
429
430 *offset = n + 1;
431
432 if (n < nr_cpu_ids)
433 return (void *)(unsigned long)(n + 2);
434 return NULL;
435}
436
437static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
438{
439 (*offset)++;
440 return sched_debug_start(file, offset);
441}
442
443static void sched_debug_stop(struct seq_file *file, void *data)
444{
445}
446
447static const struct seq_operations sched_debug_sops = {
448 .start = sched_debug_start,
449 .next = sched_debug_next,
450 .stop = sched_debug_stop,
451 .show = sched_debug_show,
452};
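The offset encoding in the iterator above can be puzzling, so here is the mapping spelled out as a userspace toy with a fake online map standing in for cpu_online_mask: offset 0 yields the header token 1, later offsets resume from the previously returned cpu and yield cpu + 2, which is exactly why sched_debug_show() recovers the cpu as (v - 2) and treats -1 as "print the header".

#include <stdio.h>

#define NR_CPUS 8
static int online[NR_CPUS] = { 1, 0, 1, 1, 0, 0, 1, 0 };  /* fake hotplug map */

static long next_online(long cpu)       /* stand-in for cpumask_next() */
{
        for (cpu++; cpu < NR_CPUS; cpu++)
                if (online[cpu])
                        return cpu;
        return NR_CPUS;
}

/* Same shape as sched_debug_start(): map *offset to an opaque token and
 * rewrite *offset to cpu + 1 so the next call resumes after that cpu. */
static long toy_start(long *offset)
{
        long n = *offset;

        if (n == 0)
                return 1;                       /* token 1: the header */
        n--;
        if (n > 0)
                n = next_online(n - 1);         /* resume after previous cpu */
        else
                n = next_online(-1);            /* first online cpu */
        *offset = n + 1;
        if (n < NR_CPUS)
                return n + 2;                   /* token cpu + 2 */
        return 0;                               /* NULL: iteration done */
}

int main(void)
{
        long off = 0, tok;

        while ((tok = toy_start(&off)) != 0) {
                if (tok == 1)
                        printf("offset 0 -> header\n");
                else
                        printf("token %ld -> cpu %ld\n", tok, tok - 2);
                off++;                          /* what sched_debug_next() does */
        }
        return 0;
}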
453
454static int sched_debug_release(struct inode *inode, struct file *file)
455{
456 seq_release(inode, file);
457
458 return 0;
402} 459}
403 460
404static int sched_debug_open(struct inode *inode, struct file *filp) 461static int sched_debug_open(struct inode *inode, struct file *filp)
405{ 462{
406 return single_open(filp, sched_debug_show, NULL); 463 int ret = 0;
464
465 ret = seq_open(filp, &sched_debug_sops);
466
467 return ret;
407} 468}
408 469
409static const struct file_operations sched_debug_fops = { 470static const struct file_operations sched_debug_fops = {
410 .open = sched_debug_open, 471 .open = sched_debug_open,
411 .read = seq_read, 472 .read = seq_read,
412 .llseek = seq_lseek, 473 .llseek = seq_lseek,
413 .release = single_release, 474 .release = sched_debug_release,
414}; 475};
415 476
416static int __init init_sched_debug_procfs(void) 477static int __init init_sched_debug_procfs(void)
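The new /proc/sched_debug iterator above encodes its position as 1 for the header record and n + 2 for cpu n, skipping offline cpus. Below is a self-contained user-space sketch of that offset mapping only; the fake_online mask, next_online() and demo_start() names are invented for illustration and are not kernel APIs.

/*
 * Editor's sketch (not kernel code): the offset mapping used by the new
 * /proc/sched_debug iterator. Offset 0 yields a header token (1) and
 * offset n + 1 yields cpu n, skipping holes in a made-up "online" mask.
 */
#include <stdio.h>

#define NR_CPUS 8
static const int fake_online[NR_CPUS] = { 1, 0, 1, 1, 0, 0, 1, 0 };

static long next_online(long n)              /* like cpumask_next(n - 1, ...) */
{
	for (long i = n; i < NR_CPUS; i++)
		if (fake_online[i])
			return i;
	return NR_CPUS;                      /* like nr_cpu_ids: no more cpus */
}

static void *demo_start(long *offset)        /* mirrors sched_debug_start() */
{
	long n = *offset;

	if (n == 0)
		return (void *)1;            /* record 1 is the header */
	n = next_online(n - 1);
	*offset = n + 1;
	if (n < NR_CPUS)
		return (void *)(n + 2);      /* record n + 2 is cpu n */
	return NULL;
}

int main(void)
{
	long off = 0;
	void *v;

	while ((v = demo_start(&off)) != NULL) {
		long pos = (long)v;

		if (pos == 1)
			printf("header\n");
		else
			printf("cpu %ld\n", pos - 2);
		off++;                       /* what sched_debug_next() does */
	}
	return 0;
}

Compiled with any C compiler it prints the header record followed by cpus 0, 2, 3 and 6, mirroring how sched_debug_start()/sched_debug_next() walk cpu_online_mask.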
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eea8707234a..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1680 } 1680 }
1681 1681
1682 /* ensure we never gain time by being placed backwards. */ 1682 /* ensure we never gain time by being placed backwards. */
1683 vruntime = max_vruntime(se->vruntime, vruntime); 1683 se->vruntime = max_vruntime(se->vruntime, vruntime);
1684
1685 se->vruntime = vruntime;
1686} 1684}
1687 1685
1688static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 1686static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -2663,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2663 hrtimer_cancel(&cfs_b->slack_timer); 2661 hrtimer_cancel(&cfs_b->slack_timer);
2664} 2662}
2665 2663
2666static void unthrottle_offline_cfs_rqs(struct rq *rq) 2664static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2667{ 2665{
2668 struct cfs_rq *cfs_rq; 2666 struct cfs_rq *cfs_rq;
2669 2667
@@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3254 */ 3252 */
3255static int select_idle_sibling(struct task_struct *p, int target) 3253static int select_idle_sibling(struct task_struct *p, int target)
3256{ 3254{
3257 int cpu = smp_processor_id();
3258 int prev_cpu = task_cpu(p);
3259 struct sched_domain *sd; 3255 struct sched_domain *sd;
3260 struct sched_group *sg; 3256 struct sched_group *sg;
3261 int i; 3257 int i = task_cpu(p);
3262 3258
3263 /* 3259 if (idle_cpu(target))
3264 * If the task is going to be woken-up on this cpu and if it is 3260 return target;
3265 * already idle, then it is the right target.
3266 */
3267 if (target == cpu && idle_cpu(cpu))
3268 return cpu;
3269 3261
3270 /* 3262 /*
3271 * If the task is going to be woken-up on the cpu where it previously 3263 * If the previous cpu is cache affine and idle, don't be stupid.
3272 * ran and if it is currently idle, then it the right target.
3273 */ 3264 */
3274 if (target == prev_cpu && idle_cpu(prev_cpu)) 3265 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3275 return prev_cpu; 3266 return i;
3276 3267
3277 /* 3268 /*
3278 * Otherwise, iterate the domains and find an eligible idle cpu. 3269 * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
3286 goto next; 3277 goto next;
3287 3278
3288 for_each_cpu(i, sched_group_cpus(sg)) { 3279 for_each_cpu(i, sched_group_cpus(sg)) {
3289 if (!idle_cpu(i)) 3280 if (i == target || !idle_cpu(i))
3290 goto next; 3281 goto next;
3291 } 3282 }
3292 3283
@@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
6101 * idle runqueue: 6092 * idle runqueue:
6102 */ 6093 */
6103 if (rq->cfs.load.weight) 6094 if (rq->cfs.load.weight)
6104 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 6095 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
6105 6096
6106 return rr_interval; 6097 return rr_interval;
6107} 6098}
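The rewritten fast path of select_idle_sibling() above reads as: take the wake-up target if it is idle, otherwise fall back to the task's previous cpu when that cpu is idle and shares a cache with the target, and only then scan the domain. A hedged user-space sketch of just that decision follows; idle_cpu() and cpus_share_cache() are stubbed here purely for illustration.

/* Editor's sketch of the simplified wake-up fast path (not kernel code). */
#include <stdio.h>
#include <stdbool.h>

static bool idle_cpu(int cpu)              { return cpu == 2 || cpu == 5; }
static bool cpus_share_cache(int a, int b) { return (a / 4) == (b / 4); }

static int pick_wake_cpu(int prev_cpu, int target)
{
	if (idle_cpu(target))
		return target;             /* wake-up target is already idle */

	/* previous cpu is cache affine and idle: don't be stupid */
	if (prev_cpu != target &&
	    cpus_share_cache(prev_cpu, target) && idle_cpu(prev_cpu))
		return prev_cpu;

	return target;                     /* real code scans the LLC domain here */
}

int main(void)
{
	printf("prev=2 target=1 -> cpu %d\n", pick_wake_cpu(2, 1)); /* 2 */
	printf("prev=5 target=1 -> cpu %d\n", pick_wake_cpu(5, 1)); /* 1: not affine */
	printf("prev=3 target=2 -> cpu %d\n", pick_wake_cpu(3, 2)); /* 2: target idle */
	return 0;
}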
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344e..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10int sched_rr_timeslice = RR_TIMESLICE;
11
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 12static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11 13
12struct rt_bandwidth def_rt_bandwidth; 14struct rt_bandwidth def_rt_bandwidth;
@@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
566static int do_balance_runtime(struct rt_rq *rt_rq) 568static int do_balance_runtime(struct rt_rq *rt_rq)
567{ 569{
568 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 570 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
569 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 571 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
570 int i, weight, more = 0; 572 int i, weight, more = 0;
571 u64 rt_period; 573 u64 rt_period;
572 574
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
925 return; 927 return;
926 928
927 delta_exec = rq->clock_task - curr->se.exec_start; 929 delta_exec = rq->clock_task - curr->se.exec_start;
928 if (unlikely((s64)delta_exec < 0)) 930 if (unlikely((s64)delta_exec <= 0))
929 delta_exec = 0; 931 return;
930 932
931 schedstat_set(curr->se.statistics.exec_max, 933 schedstat_set(curr->se.statistics.exec_max,
932 max(curr->se.statistics.exec_max, delta_exec)); 934 max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1429static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{ 1430{
1429 if (!task_running(rq, p) && 1431 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1432 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1431 (p->nr_cpus_allowed > 1))
1432 return 1; 1433 return 1;
1433 return 0; 1434 return 0;
1434} 1435}
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1889 * we may need to handle the pulling of RT tasks 1890 * we may need to handle the pulling of RT tasks
1890 * now. 1891 * now.
1891 */ 1892 */
1892 if (p->on_rq && !rq->rt.rt_nr_running) 1893 if (!p->on_rq || rq->rt.rt_nr_running)
1893 pull_rt_task(rq); 1894 return;
1895
1896 if (pull_rt_task(rq))
1897 resched_task(rq->curr);
1894} 1898}
1895 1899
1896void init_sched_rt_class(void) 1900void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1985 if (soft != RLIM_INFINITY) { 1989 if (soft != RLIM_INFINITY) {
1986 unsigned long next; 1990 unsigned long next;
1987 1991
1988 p->rt.timeout++; 1992 if (p->rt.watchdog_stamp != jiffies) {
1993 p->rt.timeout++;
1994 p->rt.watchdog_stamp = jiffies;
1995 }
1996
1989 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1997 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1990 if (p->rt.timeout > next) 1998 if (p->rt.timeout > next)
1991 p->cputime_expires.sched_exp = p->se.sum_exec_runtime; 1999 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2010 if (--p->rt.time_slice) 2018 if (--p->rt.time_slice)
2011 return; 2019 return;
2012 2020
2013 p->rt.time_slice = RR_TIMESLICE; 2021 p->rt.time_slice = sched_rr_timeslice;
2014 2022
2015 /* 2023 /*
2016 * Requeue to the end of queue if we (and all of our ancestors) are the 2024 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2041 * Time slice is 0 for SCHED_FIFO tasks 2049 * Time slice is 0 for SCHED_FIFO tasks
2042 */ 2050 */
2043 if (task->policy == SCHED_RR) 2051 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE; 2052 return sched_rr_timeslice;
2045 else 2053 else
2046 return 0; 2054 return 0;
2047} 2055}
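With get_rr_interval_rt() now returning the sched_rr_timeslice tunable, the value is observable from user space through the standard sched_rr_get_interval() syscall. A small runnable check (needs root to switch to SCHED_RR; without that it simply reports whatever the current policy's get_rr_interval hook returns):

/* Runnable user-space probe of the round-robin timeslice. */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 1 };
	struct timespec ts;

	if (sched_setscheduler(0, SCHED_RR, &sp))
		perror("sched_setscheduler (need root?)");

	if (sched_rr_get_interval(0, &ts)) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("RR timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}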
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc886441436a..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
1 1
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
4#include <linux/spinlock.h> 6#include <linux/spinlock.h>
5#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
21 if (mask_str == NULL) 21 if (mask_str == NULL)
22 return -ENOMEM; 22 return -ENOMEM;
23 23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 24 if (v == (void *)1) {
25 seq_printf(seq, "timestamp %lu\n", jiffies); 25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
26 for_each_online_cpu(cpu) { 26 seq_printf(seq, "timestamp %lu\n", jiffies);
27 struct rq *rq = cpu_rq(cpu); 27 } else {
28 struct rq *rq;
28#ifdef CONFIG_SMP 29#ifdef CONFIG_SMP
29 struct sched_domain *sd; 30 struct sched_domain *sd;
30 int dcount = 0; 31 int dcount = 0;
31#endif 32#endif
33 cpu = (unsigned long)(v - 2);
34 rq = cpu_rq(cpu);
32 35
33 /* runqueue-specific stats */ 36 /* runqueue-specific stats */
34 seq_printf(seq, 37 seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
77 return 0; 80 return 0;
78} 81}
79 82
80static int schedstat_open(struct inode *inode, struct file *file) 83/*
84 * This iterator needs some explanation.
85 * It returns 1 for the header position.
86 * This means 2 is cpu 0.
87 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
88 * to use cpumask_* to iterate over the cpus.
89 */
90static void *schedstat_start(struct seq_file *file, loff_t *offset)
81{ 91{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 92 unsigned long n = *offset;
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86 93
87 if (!buf) 94 if (n == 0)
88 return -ENOMEM; 95 return (void *) 1;
89 res = single_open(file, show_schedstat, NULL); 96
90 if (!res) { 97 n--;
91 m = file->private_data; 98
92 m->buf = buf; 99 if (n > 0)
93 m->size = size; 100 n = cpumask_next(n - 1, cpu_online_mask);
94 } else 101 else
95 kfree(buf); 102 n = cpumask_first(cpu_online_mask);
96 return res; 103
104 *offset = n + 1;
105
106 if (n < nr_cpu_ids)
107 return (void *)(unsigned long)(n + 2);
108 return NULL;
109}
110
111static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
112{
113 (*offset)++;
114 return schedstat_start(file, offset);
115}
116
117static void schedstat_stop(struct seq_file *file, void *data)
118{
119}
120
121static const struct seq_operations schedstat_sops = {
122 .start = schedstat_start,
123 .next = schedstat_next,
124 .stop = schedstat_stop,
125 .show = show_schedstat,
126};
127
128static int schedstat_open(struct inode *inode, struct file *file)
129{
130 return seq_open(file, &schedstat_sops);
97} 131}
98 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
99static const struct file_operations proc_schedstat_operations = { 138static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open, 139 .open = schedstat_open,
101 .read = seq_read, 140 .read = seq_read,
102 .llseek = seq_lseek, 141 .llseek = seq_lseek,
103 .release = single_release, 142 .release = schedstat_release,
104}; 143};
105 144
106static int __init proc_schedstat_init(void) 145static int __init proc_schedstat_init(void)
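/proc/schedstat keeps its output format; only the way the records are produced changes (one seq_file record for the version/timestamp header and one per online cpu). A minimal runnable reader that picks out exactly those records:

/* Minimal /proc/schedstat reader: header lines and per-cpu lines only. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "version", 7) ||
		    !strncmp(line, "timestamp", 9) ||
		    !strncmp(line, "cpu", 3))
			fputs(line, stdout);     /* skip per-domain lines */
	}
	fclose(f);
	return 0;
}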
diff --git a/kernel/signal.c b/kernel/signal.c
index 372771e948c2..2676aac4103d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -680,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
680 * No need to set need_resched since signal event passing 680 * No need to set need_resched since signal event passing
681 * goes through ->blocked 681 * goes through ->blocked
682 */ 682 */
683void signal_wake_up(struct task_struct *t, int resume) 683void signal_wake_up_state(struct task_struct *t, unsigned int state)
684{ 684{
685 unsigned int mask;
686
687 set_tsk_thread_flag(t, TIF_SIGPENDING); 685 set_tsk_thread_flag(t, TIF_SIGPENDING);
688
689 /* 686 /*
690 * For SIGKILL, we want to wake it up in the stopped/traced/killable 687 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
691 * case. We don't check t->state here because there is a race with it 688 * case. We don't check t->state here because there is a race with it
692 * executing on another processor and just now entering stopped state. 689 * executing on another processor and just now entering stopped state.
693 * By using wake_up_state, we ensure the process will wake up and 690 * By using wake_up_state, we ensure the process will wake up and
694 * handle its death signal. 691 * handle its death signal.
695 */ 692 */
696 mask = TASK_INTERRUPTIBLE; 693 if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
697 if (resume)
698 mask |= TASK_WAKEKILL;
699 if (!wake_up_state(t, mask))
700 kick_process(t); 694 kick_process(t);
701} 695}
702 696
@@ -844,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t)
844 assert_spin_locked(&t->sighand->siglock); 838 assert_spin_locked(&t->sighand->siglock);
845 839
846 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); 840 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
847 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); 841 ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
848} 842}
849 843
850/* 844/*
@@ -1163,11 +1157,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1163static void print_fatal_signal(int signr) 1157static void print_fatal_signal(int signr)
1164{ 1158{
1165 struct pt_regs *regs = signal_pt_regs(); 1159 struct pt_regs *regs = signal_pt_regs();
1166 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1160 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
1167 current->comm, task_pid_nr(current), signr); 1161 current->comm, task_pid_nr(current), signr);
1168 1162
1169#if defined(__i386__) && !defined(__arch_um__) 1163#if defined(__i386__) && !defined(__arch_um__)
1170 printk("code at %08lx: ", regs->ip); 1164 printk(KERN_INFO "code at %08lx: ", regs->ip);
1171 { 1165 {
1172 int i; 1166 int i;
1173 for (i = 0; i < 16; i++) { 1167 for (i = 0; i < 16; i++) {
@@ -1175,11 +1169,11 @@ static void print_fatal_signal(int signr)
1175 1169
1176 if (get_user(insn, (unsigned char *)(regs->ip + i))) 1170 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1177 break; 1171 break;
1178 printk("%02x ", insn); 1172 printk(KERN_CONT "%02x ", insn);
1179 } 1173 }
1180 } 1174 }
1175 printk(KERN_CONT "\n");
1181#endif 1176#endif
1182 printk("\n");
1183 preempt_disable(); 1177 preempt_disable();
1184 show_regs(regs); 1178 show_regs(regs);
1185 preempt_enable(); 1179 preempt_enable();
@@ -1638,6 +1632,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1638 unsigned long flags; 1632 unsigned long flags;
1639 struct sighand_struct *psig; 1633 struct sighand_struct *psig;
1640 bool autoreap = false; 1634 bool autoreap = false;
1635 cputime_t utime, stime;
1641 1636
1642 BUG_ON(sig == -1); 1637 BUG_ON(sig == -1);
1643 1638
@@ -1675,8 +1670,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1675 task_uid(tsk)); 1670 task_uid(tsk));
1676 rcu_read_unlock(); 1671 rcu_read_unlock();
1677 1672
1678 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1673 task_cputime(tsk, &utime, &stime);
1679 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); 1674 info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
1675 info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
1680 1676
1681 info.si_status = tsk->exit_code & 0x7f; 1677 info.si_status = tsk->exit_code & 0x7f;
1682 if (tsk->exit_code & 0x80) 1678 if (tsk->exit_code & 0x80)
@@ -1740,6 +1736,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1740 unsigned long flags; 1736 unsigned long flags;
1741 struct task_struct *parent; 1737 struct task_struct *parent;
1742 struct sighand_struct *sighand; 1738 struct sighand_struct *sighand;
1739 cputime_t utime, stime;
1743 1740
1744 if (for_ptracer) { 1741 if (for_ptracer) {
1745 parent = tsk->parent; 1742 parent = tsk->parent;
@@ -1758,8 +1755,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1755 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1759 rcu_read_unlock(); 1756 rcu_read_unlock();
1760 1757
1761 info.si_utime = cputime_to_clock_t(tsk->utime); 1758 task_cputime(tsk, &utime, &stime);
1762 info.si_stime = cputime_to_clock_t(tsk->stime); 1759 info.si_utime = cputime_to_clock_t(utime);
1760 info.si_stime = cputime_to_clock_t(stime);
1763 1761
1764 info.si_code = why; 1762 info.si_code = why;
1765 switch (why) { 1763 switch (why) {
@@ -1800,6 +1798,10 @@ static inline int may_ptrace_stop(void)
1800 * If SIGKILL was already sent before the caller unlocked 1798 * If SIGKILL was already sent before the caller unlocked
1801 * ->siglock we must see ->core_state != NULL. Otherwise it 1799 * ->siglock we must see ->core_state != NULL. Otherwise it
1802 * is safe to enter schedule(). 1800 * is safe to enter schedule().
1801 *
1802 * This is almost outdated, a task with the pending SIGKILL can't
1803 * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
1804 * after SIGKILL was already dequeued.
1803 */ 1805 */
1804 if (unlikely(current->mm->core_state) && 1806 if (unlikely(current->mm->core_state) &&
1805 unlikely(current->mm == current->parent->mm)) 1807 unlikely(current->mm == current->parent->mm))
@@ -1925,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1925 if (gstop_done) 1927 if (gstop_done)
1926 do_notify_parent_cldstop(current, false, why); 1928 do_notify_parent_cldstop(current, false, why);
1927 1929
1930 /* tasklist protects us from ptrace_freeze_traced() */
1928 __set_current_state(TASK_RUNNING); 1931 __set_current_state(TASK_RUNNING);
1929 if (clear_code) 1932 if (clear_code)
1930 current->exit_code = 0; 1933 current->exit_code = 0;
@@ -2396,6 +2399,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2396 tracehook_signal_handler(sig, info, ka, regs, stepping); 2399 tracehook_signal_handler(sig, info, ka, regs, stepping);
2397} 2400}
2398 2401
2402void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2403{
2404 if (failed)
2405 force_sigsegv(ksig->sig, current);
2406 else
2407 signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
2408 signal_pt_regs(), stepping);
2409}
2410
2399/* 2411/*
2400 * It could be that complete_signal() picked us to notify about the 2412 * It could be that complete_signal() picked us to notify about the
2401 * group-wide signal. Other threads should be notified now to take 2413 * group-wide signal. Other threads should be notified now to take
@@ -2613,28 +2625,58 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2613 return 0; 2625 return 0;
2614} 2626}
2615 2627
2616long do_sigpending(void __user *set, unsigned long sigsetsize) 2628#ifdef CONFIG_COMPAT
2629COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
2630 compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
2617{ 2631{
2618 long error = -EINVAL; 2632#ifdef __BIG_ENDIAN
2619 sigset_t pending; 2633 sigset_t old_set = current->blocked;
2620 2634
2635 /* XXX: Don't preclude handling different sized sigset_t's. */
2636 if (sigsetsize != sizeof(sigset_t))
2637 return -EINVAL;
2638
2639 if (nset) {
2640 compat_sigset_t new32;
2641 sigset_t new_set;
2642 int error;
2643 if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
2644 return -EFAULT;
2645
2646 sigset_from_compat(&new_set, &new32);
2647 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2648
2649 error = sigprocmask(how, &new_set, NULL);
2650 if (error)
2651 return error;
2652 }
2653 if (oset) {
2654 compat_sigset_t old32;
2655 sigset_to_compat(&old32, &old_set);
2656 if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
2657 return -EFAULT;
2658 }
2659 return 0;
2660#else
2661 return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
2662 (sigset_t __user *)oset, sigsetsize);
2663#endif
2664}
2665#endif
2666
2667static int do_sigpending(void *set, unsigned long sigsetsize)
2668{
2621 if (sigsetsize > sizeof(sigset_t)) 2669 if (sigsetsize > sizeof(sigset_t))
2622 goto out; 2670 return -EINVAL;
2623 2671
2624 spin_lock_irq(&current->sighand->siglock); 2672 spin_lock_irq(&current->sighand->siglock);
2625 sigorsets(&pending, &current->pending.signal, 2673 sigorsets(set, &current->pending.signal,
2626 &current->signal->shared_pending.signal); 2674 &current->signal->shared_pending.signal);
2627 spin_unlock_irq(&current->sighand->siglock); 2675 spin_unlock_irq(&current->sighand->siglock);
2628 2676
2629 /* Outside the lock because only this thread touches it. */ 2677 /* Outside the lock because only this thread touches it. */
2630 sigandsets(&pending, &current->blocked, &pending); 2678 sigandsets(set, &current->blocked, set);
2631 2679 return 0;
2632 error = -EFAULT;
2633 if (!copy_to_user(set, &pending, sigsetsize))
2634 error = 0;
2635
2636out:
2637 return error;
2638} 2680}
2639 2681
2640/** 2682/**
@@ -2643,11 +2685,36 @@ out:
2643 * @set: stores pending signals 2685 * @set: stores pending signals
2644 * @sigsetsize: size of sigset_t type or larger 2686 * @sigsetsize: size of sigset_t type or larger
2645 */ 2687 */
2646SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2688SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
2647{ 2689{
2648 return do_sigpending(set, sigsetsize); 2690 sigset_t set;
2691 int err = do_sigpending(&set, sigsetsize);
2692 if (!err && copy_to_user(uset, &set, sigsetsize))
2693 err = -EFAULT;
2694 return err;
2649} 2695}
2650 2696
2697#ifdef CONFIG_COMPAT
2698COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2699 compat_size_t, sigsetsize)
2700{
2701#ifdef __BIG_ENDIAN
2702 sigset_t set;
2703 int err = do_sigpending(&set, sigsetsize);
2704 if (!err) {
2705 compat_sigset_t set32;
2706 sigset_to_compat(&set32, &set);
2707 /* we can get here only if sigsetsize <= sizeof(set) */
2708 if (copy_to_user(uset, &set32, sigsetsize))
2709 err = -EFAULT;
2710 }
2711 return err;
2712#else
2713 return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
2714#endif
2715}
2716#endif
2717
2651#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2718#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2652 2719
2653int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) 2720int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
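The do_sigpending() helper above now feeds both rt_sigpending and its compat wrapper; from user space the behaviour is unchanged and can be exercised through the plain sigpending() libc wrapper. A small runnable demonstration:

/* Block SIGUSR1, raise it, and let sigpending() report it as pending. */
#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t block, pending;

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, NULL);   /* rt_sigprocmask under the hood */

	raise(SIGUSR1);                         /* queued, not delivered */

	sigpending(&pending);                   /* rt_sigpending under the hood */
	printf("SIGUSR1 pending: %s\n",
	       sigismember(&pending, SIGUSR1) ? "yes" : "no");
	return 0;
}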
@@ -2924,6 +2991,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2924 return do_tkill(0, pid, sig); 2991 return do_tkill(0, pid, sig);
2925} 2992}
2926 2993
2994static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
2995{
2996 /* Not even root can pretend to send signals from the kernel.
2997 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2998 */
2999 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
3000 (task_pid_vnr(current) != pid)) {
3001 /* We used to allow any < 0 si_code */
3002 WARN_ON_ONCE(info->si_code < 0);
3003 return -EPERM;
3004 }
3005 info->si_signo = sig;
3006
3007 /* POSIX.1b doesn't mention process groups. */
3008 return kill_proc_info(sig, info, pid);
3009}
3010
2927/** 3011/**
2928 * sys_rt_sigqueueinfo - send signal information to a signal 3012 * sys_rt_sigqueueinfo - send signal information to a signal
2929 * @pid: the PID of the thread 3013 * @pid: the PID of the thread
@@ -2934,25 +3018,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2934 siginfo_t __user *, uinfo) 3018 siginfo_t __user *, uinfo)
2935{ 3019{
2936 siginfo_t info; 3020 siginfo_t info;
2937
2938 if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) 3021 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2939 return -EFAULT; 3022 return -EFAULT;
3023 return do_rt_sigqueueinfo(pid, sig, &info);
3024}
2940 3025
2941 /* Not even root can pretend to send signals from the kernel. 3026#ifdef CONFIG_COMPAT
2942 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3027COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
2943 */ 3028 compat_pid_t, pid,
2944 if (info.si_code >= 0 || info.si_code == SI_TKILL) { 3029 int, sig,
2945 /* We used to allow any < 0 si_code */ 3030 struct compat_siginfo __user *, uinfo)
2946 WARN_ON_ONCE(info.si_code < 0); 3031{
2947 return -EPERM; 3032 siginfo_t info;
2948 } 3033 int ret = copy_siginfo_from_user32(&info, uinfo);
2949 info.si_signo = sig; 3034 if (unlikely(ret))
2950 3035 return ret;
2951 /* POSIX.1b doesn't mention process groups. */ 3036 return do_rt_sigqueueinfo(pid, sig, &info);
2952 return kill_proc_info(sig, &info, pid);
2953} 3037}
3038#endif
2954 3039
2955long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) 3040static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2956{ 3041{
2957 /* This is only valid for single tasks */ 3042 /* This is only valid for single tasks */
2958 if (pid <= 0 || tgid <= 0) 3043 if (pid <= 0 || tgid <= 0)
@@ -2961,7 +3046,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2961 /* Not even root can pretend to send signals from the kernel. 3046 /* Not even root can pretend to send signals from the kernel.
2962 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3047 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2963 */ 3048 */
2964 if (info->si_code >= 0 || info->si_code == SI_TKILL) { 3049 if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
3050 (task_pid_vnr(current) != pid)) {
2965 /* We used to allow any < 0 si_code */ 3051 /* We used to allow any < 0 si_code */
2966 WARN_ON_ONCE(info->si_code < 0); 3052 WARN_ON_ONCE(info->si_code < 0);
2967 return -EPERM; 3053 return -EPERM;
@@ -2982,6 +3068,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2982 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); 3068 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2983} 3069}
2984 3070
3071#ifdef CONFIG_COMPAT
3072COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3073 compat_pid_t, tgid,
3074 compat_pid_t, pid,
3075 int, sig,
3076 struct compat_siginfo __user *, uinfo)
3077{
3078 siginfo_t info;
3079
3080 if (copy_siginfo_from_user32(&info, uinfo))
3081 return -EFAULT;
3082 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
3083}
3084#endif
3085
2985int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3086int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2986{ 3087{
2987 struct task_struct *t = current; 3088 struct task_struct *t = current;
@@ -3027,7 +3128,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3027 return 0; 3128 return 0;
3028} 3129}
3029 3130
3030int 3131static int
3031do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3132do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3032{ 3133{
3033 stack_t oss; 3134 stack_t oss;
@@ -3092,12 +3193,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3092out: 3193out:
3093 return error; 3194 return error;
3094} 3195}
3095#ifdef CONFIG_GENERIC_SIGALTSTACK
3096SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) 3196SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3097{ 3197{
3098 return do_sigaltstack(uss, uoss, current_user_stack_pointer()); 3198 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3099} 3199}
3100#endif
3101 3200
3102int restore_altstack(const stack_t __user *uss) 3201int restore_altstack(const stack_t __user *uss)
3103{ 3202{
@@ -3115,9 +3214,9 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
3115} 3214}
3116 3215
3117#ifdef CONFIG_COMPAT 3216#ifdef CONFIG_COMPAT
3118#ifdef CONFIG_GENERIC_SIGALTSTACK 3217COMPAT_SYSCALL_DEFINE2(sigaltstack,
3119asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, 3218 const compat_stack_t __user *, uss_ptr,
3120 compat_stack_t __user *uoss_ptr) 3219 compat_stack_t __user *, uoss_ptr)
3121{ 3220{
3122 stack_t uss, uoss; 3221 stack_t uss, uoss;
3123 int ret; 3222 int ret;
@@ -3164,7 +3263,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3164 __put_user(t->sas_ss_size, &uss->ss_size); 3263 __put_user(t->sas_ss_size, &uss->ss_size);
3165} 3264}
3166#endif 3265#endif
3167#endif
3168 3266
3169#ifdef __ARCH_WANT_SYS_SIGPENDING 3267#ifdef __ARCH_WANT_SYS_SIGPENDING
3170 3268
@@ -3174,7 +3272,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3174 */ 3272 */
3175SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 3273SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
3176{ 3274{
3177 return do_sigpending(set, sizeof(*set)); 3275 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
3178} 3276}
3179 3277
3180#endif 3278#endif
@@ -3230,7 +3328,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3230} 3328}
3231#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 3329#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
3232 3330
3233#ifdef __ARCH_WANT_SYS_RT_SIGACTION 3331#ifndef CONFIG_ODD_RT_SIGACTION
3234/** 3332/**
3235 * sys_rt_sigaction - alter an action taken by a process 3333 * sys_rt_sigaction - alter an action taken by a process
3236 * @sig: signal to be sent 3334 * @sig: signal to be sent
@@ -3264,7 +3362,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
3264out: 3362out:
3265 return ret; 3363 return ret;
3266} 3364}
3267#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ 3365#ifdef CONFIG_COMPAT
3366COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3367 const struct compat_sigaction __user *, act,
3368 struct compat_sigaction __user *, oact,
3369 compat_size_t, sigsetsize)
3370{
3371 struct k_sigaction new_ka, old_ka;
3372 compat_sigset_t mask;
3373#ifdef __ARCH_HAS_SA_RESTORER
3374 compat_uptr_t restorer;
3375#endif
3376 int ret;
3377
3378 /* XXX: Don't preclude handling different sized sigset_t's. */
3379 if (sigsetsize != sizeof(compat_sigset_t))
3380 return -EINVAL;
3381
3382 if (act) {
3383 compat_uptr_t handler;
3384 ret = get_user(handler, &act->sa_handler);
3385 new_ka.sa.sa_handler = compat_ptr(handler);
3386#ifdef __ARCH_HAS_SA_RESTORER
3387 ret |= get_user(restorer, &act->sa_restorer);
3388 new_ka.sa.sa_restorer = compat_ptr(restorer);
3389#endif
3390 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3391 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
3392 if (ret)
3393 return -EFAULT;
3394 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
3395 }
3396
3397 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3398 if (!ret && oact) {
3399 sigset_to_compat(&mask, &old_ka.sa.sa_mask);
3400 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3401 &oact->sa_handler);
3402 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3403 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3404#ifdef __ARCH_HAS_SA_RESTORER
3405 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3406 &oact->sa_restorer);
3407#endif
3408 }
3409 return ret;
3410}
3411#endif
3412#endif /* !CONFIG_ODD_RT_SIGACTION */
3413
3414#ifdef CONFIG_OLD_SIGACTION
3415SYSCALL_DEFINE3(sigaction, int, sig,
3416 const struct old_sigaction __user *, act,
3417 struct old_sigaction __user *, oact)
3418{
3419 struct k_sigaction new_ka, old_ka;
3420 int ret;
3421
3422 if (act) {
3423 old_sigset_t mask;
3424 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3425 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
3426 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
3427 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3428 __get_user(mask, &act->sa_mask))
3429 return -EFAULT;
3430#ifdef __ARCH_HAS_KA_RESTORER
3431 new_ka.ka_restorer = NULL;
3432#endif
3433 siginitset(&new_ka.sa.sa_mask, mask);
3434 }
3435
3436 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3437
3438 if (!ret && oact) {
3439 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3440 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
3441 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
3442 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3443 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3444 return -EFAULT;
3445 }
3446
3447 return ret;
3448}
3449#endif
3450#ifdef CONFIG_COMPAT_OLD_SIGACTION
3451COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3452 const struct compat_old_sigaction __user *, act,
3453 struct compat_old_sigaction __user *, oact)
3454{
3455 struct k_sigaction new_ka, old_ka;
3456 int ret;
3457 compat_old_sigset_t mask;
3458 compat_uptr_t handler, restorer;
3459
3460 if (act) {
3461 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3462 __get_user(handler, &act->sa_handler) ||
3463 __get_user(restorer, &act->sa_restorer) ||
3464 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3465 __get_user(mask, &act->sa_mask))
3466 return -EFAULT;
3467
3468#ifdef __ARCH_HAS_KA_RESTORER
3469 new_ka.ka_restorer = NULL;
3470#endif
3471 new_ka.sa.sa_handler = compat_ptr(handler);
3472 new_ka.sa.sa_restorer = compat_ptr(restorer);
3473 siginitset(&new_ka.sa.sa_mask, mask);
3474 }
3475
3476 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3477
3478 if (!ret && oact) {
3479 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3480 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
3481 &oact->sa_handler) ||
3482 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3483 &oact->sa_restorer) ||
3484 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3485 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3486 return -EFAULT;
3487 }
3488 return ret;
3489}
3490#endif
3268 3491
3269#ifdef __ARCH_WANT_SYS_SGETMASK 3492#ifdef __ARCH_WANT_SYS_SGETMASK
3270 3493
@@ -3332,7 +3555,6 @@ int sigsuspend(sigset_t *set)
3332 return -ERESTARTNOHAND; 3555 return -ERESTARTNOHAND;
3333} 3556}
3334 3557
3335#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3336/** 3558/**
3337 * sys_rt_sigsuspend - replace the signal mask for a value with the 3559 * sys_rt_sigsuspend - replace the signal mask for a value with the
3338 * @unewset value until a signal is received 3560 * @unewset value until a signal is received
@@ -3351,7 +3573,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3351 return -EFAULT; 3573 return -EFAULT;
3352 return sigsuspend(&newset); 3574 return sigsuspend(&newset);
3353} 3575}
3354#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3576
3577#ifdef CONFIG_COMPAT
3578COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
3579{
3580#ifdef __BIG_ENDIAN
3581 sigset_t newset;
3582 compat_sigset_t newset32;
3583
3584 /* XXX: Don't preclude handling different sized sigset_t's. */
3585 if (sigsetsize != sizeof(sigset_t))
3586 return -EINVAL;
3587
3588 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
3589 return -EFAULT;
3590 sigset_from_compat(&newset, &newset32);
3591 return sigsuspend(&newset);
3592#else
3593 /* on little-endian bitmaps don't care about granularity */
3594 return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
3595#endif
3596}
3597#endif
3598
3599#ifdef CONFIG_OLD_SIGSUSPEND
3600SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
3601{
3602 sigset_t blocked;
3603 siginitset(&blocked, mask);
3604 return sigsuspend(&blocked);
3605}
3606#endif
3607#ifdef CONFIG_OLD_SIGSUSPEND3
3608SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
3609{
3610 sigset_t blocked;
3611 siginitset(&blocked, mask);
3612 return sigsuspend(&blocked);
3613}
3614#endif
3355 3615
3356__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) 3616__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
3357{ 3617{
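The rt_sigsuspend, old sigsuspend and three-argument sigsuspend entry points above all end up in sigsuspend(&newset), which atomically installs the temporary mask and sleeps until a signal it does not block arrives. A runnable user-space illustration of that contract:

/* Classic sigsuspend() pattern: wait for SIGUSR1 without a race. */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_usr1;

static void on_usr1(int sig) { (void)sig; got_usr1 = 1; }

int main(void)
{
	sigset_t block, wait_mask;
	struct sigaction sa;

	sa.sa_handler = on_usr1;
	sa.sa_flags = 0;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, &wait_mask);  /* wait_mask = old mask */

	printf("pid %d: kill -USR1 me\n", getpid());
	while (!got_usr1)
		sigsuspend(&wait_mask);              /* unblocks SIGUSR1 while sleeping */
	printf("got SIGUSR1\n");
	return 0;
}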
diff --git a/kernel/smp.c b/kernel/smp.c
index 29dd40a9f2f4..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,23 +16,14 @@
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19static struct {
20 struct list_head queue;
21 raw_spinlock_t lock;
22} call_function __cacheline_aligned_in_smp =
23 {
24 .queue = LIST_HEAD_INIT(call_function.queue),
25 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
26 };
27
28enum { 19enum {
29 CSD_FLAG_LOCK = 0x01, 20 CSD_FLAG_LOCK = 0x01,
30}; 21};
31 22
32struct call_function_data { 23struct call_function_data {
33 struct call_single_data csd; 24 struct call_single_data __percpu *csd;
34 atomic_t refs;
35 cpumask_var_t cpumask; 25 cpumask_var_t cpumask;
26 cpumask_var_t cpumask_ipi;
36}; 27};
37 28
38static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); 29static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -56,6 +47,14 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
56 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 47 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
57 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
58 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu)))
52 return notifier_from_errno(-ENOMEM);
53 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) {
55 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM);
57 }
59 break; 58 break;
60 59
61#ifdef CONFIG_HOTPLUG_CPU 60#ifdef CONFIG_HOTPLUG_CPU
@@ -65,6 +64,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
65 case CPU_DEAD: 64 case CPU_DEAD:
66 case CPU_DEAD_FROZEN: 65 case CPU_DEAD_FROZEN:
67 free_cpumask_var(cfd->cpumask); 66 free_cpumask_var(cfd->cpumask);
67 free_cpumask_var(cfd->cpumask_ipi);
68 free_percpu(cfd->csd);
68 break; 69 break;
69#endif 70#endif
70 }; 71 };
@@ -166,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
166} 167}
167 168
168/* 169/*
169 * Invoked by arch to handle an IPI for call function. Must be called with
170 * interrupts disabled.
171 */
172void generic_smp_call_function_interrupt(void)
173{
174 struct call_function_data *data;
175 int cpu = smp_processor_id();
176
177 /*
178 * Shouldn't receive this interrupt on a cpu that is not yet online.
179 */
180 WARN_ON_ONCE(!cpu_online(cpu));
181
182 /*
183 * Ensure entry is visible on call_function_queue after we have
184 * entered the IPI. See comment in smp_call_function_many.
185 * If we don't have this, then we may miss an entry on the list
186 * and never get another IPI to process it.
187 */
188 smp_mb();
189
190 /*
191 * It's ok to use list_for_each_rcu() here even though we may
192 * delete 'pos', since list_del_rcu() doesn't clear ->next
193 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs;
196 smp_call_func_t func;
197
198 /*
199 * Since we walk the list without any locks, we might
200 * see an entry that was completed, removed from the
201 * list and is in the process of being reused.
202 *
203 * We must check that the cpu is in the cpumask before
204 * checking the refs, and both must be set before
205 * executing the callback on this cpu.
206 */
207
208 if (!cpumask_test_cpu(cpu, data->cpumask))
209 continue;
210
211 smp_rmb();
212
213 if (atomic_read(&data->refs) == 0)
214 continue;
215
216 func = data->csd.func; /* save for later warn */
217 func(data->csd.info);
218
219 /*
220 * If the cpu mask is not still set then func enabled
221 * interrupts (BUG), and this cpu took another smp call
222 * function interrupt and executed func(info) twice
223 * on this cpu. That nested execution decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 continue;
228 }
229
230 refs = atomic_dec_return(&data->refs);
231 WARN_ON(refs < 0);
232
233 if (refs)
234 continue;
235
236 WARN_ON(!cpumask_empty(data->cpumask));
237
238 raw_spin_lock(&call_function.lock);
239 list_del_rcu(&data->csd.list);
240 raw_spin_unlock(&call_function.lock);
241
242 csd_unlock(&data->csd);
243 }
244
245}
246
247/*
248 * Invoked by arch to handle an IPI for call function single. Must be 170 * Invoked by arch to handle an IPI for call function single. Must be
249 * called from the arch with interrupts disabled. 171 * called from the arch with interrupts disabled.
250 */ 172 */
@@ -448,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
448 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
449{ 371{
450 struct call_function_data *data; 372 struct call_function_data *data;
451 unsigned long flags; 373 int cpu, next_cpu, this_cpu = smp_processor_id();
452 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
453 374
454 /* 375 /*
455 * Can deadlock when called with interrupts disabled. 376 * Can deadlock when called with interrupts disabled.
@@ -481,79 +402,46 @@ void smp_call_function_many(const struct cpumask *mask,
481 } 402 }
482 403
483 data = &__get_cpu_var(cfd_data); 404 data = &__get_cpu_var(cfd_data);
484 csd_lock(&data->csd);
485
486 /* This BUG_ON verifies our reuse assertions and can be removed */
487 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
488 405
489 /*
490 * The global call function queue list add and delete are protected
491 * by a lock, but the list is traversed without any lock, relying
492 * on the rcu list add and delete to allow safe concurrent traversal.
493 * We reuse the call function data without waiting for any grace
494 * period after some other cpu removes it from the global queue.
495 * This means a cpu might find our data block as it is being
496 * filled out.
497 *
498 * We hold off the interrupt handler on the other cpu by
499 * ordering our writes to the cpu mask vs our setting of the
500 * refs counter. We assert only the cpu owning the data block
501 * will set a bit in cpumask, and each bit will only be cleared
502 * by the subject cpu. Each cpu must first find its bit is
503 * set and then check that refs is set indicating the element is
504 * ready to be processed, otherwise it must skip the entry.
505 *
506 * On the previous iteration refs was set to 0 by another cpu.
507 * To avoid the use of transitivity, set the counter to 0 here
508 * so the wmb will pair with the rmb in the interrupt handler.
509 */
510 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
511
512 data->csd.func = func;
513 data->csd.info = info;
514
515 /* Ensure 0 refs is visible before mask. Also orders func and info */
516 smp_wmb();
517
518 /* We rely on the "and" being processed before the store */
519 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(data->cpumask, mask, cpu_online_mask);
520 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, data->cpumask);
521 refs = cpumask_weight(data->cpumask);
522 408
523 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
524 if (unlikely(!refs)) { 410 if (unlikely(!cpumask_weight(data->cpumask)))
525 csd_unlock(&data->csd);
526 return; 411 return;
527 }
528 412
529 raw_spin_lock_irqsave(&call_function.lock, flags);
530 /* 413 /*
531 * Place entry at the _HEAD_ of the list, so that any cpu still 414 * After we put an entry into the list, data->cpumask
532 * observing the entry in generic_smp_call_function_interrupt() 415 * may be cleared again when another CPU sends another IPI for
533 * will not miss any other list entries: 416 * an SMP function call, so data->cpumask will be zero.
534 */ 417 */
535 list_add_rcu(&data->csd.list, &call_function.queue); 418 cpumask_copy(data->cpumask_ipi, data->cpumask);
536 /*
537 * We rely on the wmb() in list_add_rcu to complete our writes
538 * to the cpumask before this write to refs, which indicates
539 * data is on the list and is ready to be processed.
540 */
541 atomic_set(&data->refs, refs);
542 raw_spin_unlock_irqrestore(&call_function.lock, flags);
543 419
544 /* 420 for_each_cpu(cpu, data->cpumask) {
545 * Make the list addition visible before sending the ipi. 421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
546 * (IPIs must obey or appear to obey normal Linux cache 422 struct call_single_queue *dst =
547 * coherency rules -- see comment in generic_exec_single). 423 &per_cpu(call_single_queue, cpu);
548 */ 424 unsigned long flags;
549 smp_mb(); 425
426 csd_lock(csd);
427 csd->func = func;
428 csd->info = info;
429
430 raw_spin_lock_irqsave(&dst->lock, flags);
431 list_add_tail(&csd->list, &dst->list);
432 raw_spin_unlock_irqrestore(&dst->lock, flags);
433 }
550 434
551 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
552 arch_send_call_function_ipi_mask(data->cpumask); 436 arch_send_call_function_ipi_mask(data->cpumask_ipi);
553 437
554 /* Optionally wait for the CPUs to complete */ 438 if (wait) {
555 if (wait) 439 for_each_cpu(cpu, data->cpumask) {
556 csd_lock_wait(&data->csd); 440 struct call_single_data *csd =
441 per_cpu_ptr(data->csd, cpu);
442 csd_lock_wait(csd);
443 }
444 }
557} 445}
558EXPORT_SYMBOL(smp_call_function_many); 446EXPORT_SYMBOL(smp_call_function_many);
559 447
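The reworked smp_call_function_many() above replaces the single shared descriptor plus refcount with one call_single_data per destination cpu, queued on each destination's own list and waited on individually. The following is a hypothetical pthreads analogue of that shape only; the names and the polling loop are invented for illustration and are not how the kernel signals cpus.

/* Editor's pthreads analogue: one descriptor per destination, no refcount. */
#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>

#define NCPU 4

struct csd {                          /* stand-in for call_single_data */
	void (*func)(void *);
	void *info;
	volatile bool locked;         /* cleared by the target when done */
};

static struct csd csds[NCPU];         /* "per-cpu" descriptors */
static pthread_t workers[NCPU];
static volatile bool stop;

static void say(void *info) { printf("cpu-thread got '%s'\n", (char *)info); }

static void *worker(void *arg)
{
	struct csd *c = &csds[(long)arg];

	while (!stop) {
		if (c->locked && c->func) {   /* the "IPI" arrived */
			c->func(c->info);
			c->func = NULL;
			__sync_synchronize();
			c->locked = false;    /* like csd_unlock() */
		}
		usleep(1000);
	}
	return NULL;
}

static void call_function_many(void (*func)(void *), void *info)
{
	for (long i = 0; i < NCPU; i++) {     /* fill per-target descriptors */
		csds[i].func = func;
		csds[i].info = info;
		__sync_synchronize();
		csds[i].locked = true;        /* like csd_lock() + queueing */
	}
	for (long i = 0; i < NCPU; i++)       /* the csd_lock_wait() loop */
		while (csds[i].locked)
			usleep(1000);
}

int main(void)
{
	for (long i = 0; i < NCPU; i++)
		pthread_create(&workers[i], NULL, worker, (void *)i);

	call_function_many(say, "hello");

	stop = true;
	for (long i = 0; i < NCPU; i++)
		pthread_join(workers[i], NULL);
	return 0;
}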
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc054242..b9bde5727829 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data)
131 continue; 131 continue;
132 } 132 }
133 133
134 BUG_ON(td->cpu != smp_processor_id()); 134 //BUG_ON(td->cpu != smp_processor_id());
135 135
136 /* Check for state change setup */ 136 /* Check for state change setup */
137 switch (td->status) { 137 switch (td->status) {
@@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
183 kfree(td); 183 kfree(td);
184 return PTR_ERR(tsk); 184 return PTR_ERR(tsk);
185 } 185 }
186
187 get_task_struct(tsk); 186 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create)
189 ht->create(cpu);
189 return 0; 190 return 0;
190} 191}
191 192
@@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{ 226{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 227 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227 228
228 if (tsk) 229 if (tsk && !ht->selfparking)
229 kthread_park(tsk); 230 kthread_park(tsk);
230} 231}
231 232
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567babe789..b4d252fd195b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 198 * We restart softirq processing for at most 2 ms,
199 * and we fall back to softirqd after that. 199 * and only while need_resched() is not set.
200 * 200 *
201 * This number has been established via experimentation. 201 * These limits have been established via experimentation.
202 * The two things to balance are latency and fairness - 202 * The two things to balance are latency and fairness -
203 * we want to handle softirqs as soon as possible, but they 203 * we want to handle softirqs as soon as possible, but they
204 * should not be able to lock up the box. 204 * should not be able to lock up the box.
205 */ 205 */
206#define MAX_SOFTIRQ_RESTART 10 206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
207 207
208asmlinkage void __do_softirq(void) 208asmlinkage void __do_softirq(void)
209{ 209{
210 struct softirq_action *h; 210 struct softirq_action *h;
211 __u32 pending; 211 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 213 int cpu;
214 unsigned long old_flags = current->flags; 214 unsigned long old_flags = current->flags;
215 215
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 vtime_account_irq_enter(current); 224 account_irq_enter_time(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -264,15 +264,16 @@ restart:
264 local_irq_disable(); 264 local_irq_disable();
265 265
266 pending = local_softirq_pending(); 266 pending = local_softirq_pending();
267 if (pending && --max_restart) 267 if (pending) {
268 goto restart; 268 if (time_before(jiffies, end) && !need_resched())
269 goto restart;
269 270
270 if (pending)
271 wakeup_softirqd(); 271 wakeup_softirqd();
272 }
272 273
273 lockdep_softirq_exit(); 274 lockdep_softirq_exit();
274 275
275 vtime_account_irq_exit(current); 276 account_irq_exit_time(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 277 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 278 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 279}
@@ -341,7 +342,7 @@ static inline void invoke_softirq(void)
341 */ 342 */
342void irq_exit(void) 343void irq_exit(void)
343{ 344{
344 vtime_account_irq_exit(current); 345 account_irq_exit_time(current);
345 trace_hardirq_exit(); 346 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 347 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 348 if (!in_interrupt() && local_softirq_pending())
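The new exit condition above relies on time_before(jiffies, end), which stays correct across jiffies wraparound because it compares a signed difference rather than the raw counter values. A standalone demonstration of that idiom:

/* Why the 2 ms deadline test survives counter wraparound. */
#include <stdio.h>

/* same idiom as the kernel's time_before() macro */
static int time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

int main(void)
{
	unsigned long now = (unsigned long)-1;   /* jiffies just before wraparound */
	unsigned long end = now + 2;             /* deadline wraps around to 1 */

	printf("naive now < end    : %d\n", now < end);             /* 0: wrong */
	printf("signed time_before : %d\n", time_before(now, end)); /* 1: correct */
	return 0;
}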
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
282 */ 282 */
283void cleanup_srcu_struct(struct srcu_struct *sp) 283void cleanup_srcu_struct(struct srcu_struct *sp)
284{ 284{
285 int sum; 285 if (WARN_ON(srcu_readers_active(sp)))
286 286 return; /* Leakage unless caller handles error. */
287 sum = srcu_readers_active(sp);
288 WARN_ON(sum); /* Leakage unless caller handles error. */
289 if (sum != 0)
290 return;
291 free_percpu(sp->per_cpu_ref); 287 free_percpu(sp->per_cpu_ref);
292 sp->per_cpu_ref = NULL; 288 sp->per_cpu_ref = NULL;
293} 289}
@@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
302{ 298{
303 int idx; 299 int idx;
304 300
301 idx = ACCESS_ONCE(sp->completed) & 0x1;
305 preempt_disable(); 302 preempt_disable();
306 idx = rcu_dereference_index_check(sp->completed,
307 rcu_read_lock_sched_held()) & 0x1;
308 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
309 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 304 smp_mb(); /* B */ /* Avoid leaking the critical section. */
310 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
321 */ 316 */
322void __srcu_read_unlock(struct srcu_struct *sp, int idx) 317void __srcu_read_unlock(struct srcu_struct *sp, int idx)
323{ 318{
324 preempt_disable();
325 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 319 smp_mb(); /* C */ /* Avoid leaking the critical section. */
326 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; 320 this_cpu_dec(sp->per_cpu_ref->c[idx]);
327 preempt_enable();
328} 321}
329EXPORT_SYMBOL_GPL(__srcu_read_unlock); 322EXPORT_SYMBOL_GPL(__srcu_read_unlock);
330 323
@@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
423 !lock_is_held(&rcu_sched_lock_map), 416 !lock_is_held(&rcu_sched_lock_map),
424 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 417 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
425 418
419 might_sleep();
426 init_completion(&rcu.completion); 420 init_completion(&rcu.completion);
427 421
428 head->next = NULL; 422 head->next = NULL;
@@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
455 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 449 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
456 * @sp: srcu_struct with which to synchronize. 450 * @sp: srcu_struct with which to synchronize.
457 * 451 *
458 * Flip the completed counter, and wait for the old count to drain to zero. 452 * Wait for the counts of both indexes to drain to zero. To avoid the
459 * As with classic RCU, the updater must use some separate means of 453 * possible starvation of synchronize_srcu(), it waits for the count of
460 * synchronizing concurrent updates. Can block; must be called from 454 * the index=((->completed & 1) ^ 1) to drain to zero at first,
461 * process context. 455 * and then flips ->completed and waits for the count of the other index.
456 *
457 * Can block; must be called from process context.
462 * 458 *
463 * Note that it is illegal to call synchronize_srcu() from the corresponding 459 * Note that it is illegal to call synchronize_srcu() from the corresponding
464 * SRCU read-side critical section; doing so will result in deadlock. 460 * SRCU read-side critical section; doing so will result in deadlock.
@@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
480 * Wait for an SRCU grace period to elapse, but be more aggressive about 476 * Wait for an SRCU grace period to elapse, but be more aggressive about
481 * spinning rather than blocking when waiting. 477 * spinning rather than blocking when waiting.
482 * 478 *
483 * Note that it is illegal to call this function while holding any lock 479 * Note that it is also illegal to call synchronize_srcu_expedited()
484 * that is acquired by a CPU-hotplug notifier. It is also illegal to call 480 * from the corresponding SRCU read-side critical section;
485 * synchronize_srcu_expedited() from the corresponding SRCU read-side 481 * doing so will result in deadlock. However, it is perfectly legal
486 * critical section; doing so will result in deadlock. However, it is 482 * to call synchronize_srcu_expedited() on one srcu_struct from some
487 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 483 * other srcu_struct's read-side critical section, as long as
488 * from some other srcu_struct's read-side critical section, as long as
489 * the resulting graph of srcu_structs is acyclic. 484 * the resulting graph of srcu_structs is acyclic.
490 */ 485 */
491void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
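The comments above describe SRCU's two per-index reader counts and the drain-then-flip order used by synchronize_srcu(). A toy single-threaded model of just that bookkeeping follows; there is no real waiting or memory ordering here, and the names are only illustrative.

/* Editor's toy model of the SRCU counter pair (not the kernel code). */
#include <stdio.h>

static unsigned long completed;
static long c[2];

static int srcu_read_lock(void)       { int idx = completed & 1; c[idx]++; return idx; }
static void srcu_read_unlock(int idx) { c[idx]--; }

static void synchronize_srcu(void)
{
	int old = completed & 1;

	completed++;                      /* new readers now use the other index */
	printf("waiting for c[%d] to drain (currently %ld)\n", old, c[old]);
	/* a real implementation blocks here until c[old] reaches zero */
}

int main(void)
{
	int idx = srcu_read_lock();

	synchronize_srcu();               /* sees one reader on the old index */
	srcu_read_unlock(idx);
	printf("after unlock: c[0]=%ld c[1]=%ld completed=%lu\n",
	       c[0], c[1], completed);
	return 0;
}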
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e965715..95d178c62d5a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
@@ -37,10 +37,10 @@ struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */ 38 bool enabled; /* is this stopper enabled? */
39 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
40 struct task_struct *thread; /* stopper thread */
41}; 40};
42 41
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 42static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 44static bool stop_machine_initialized = false;
45 45
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
62} 62}
63 63
64/* queue @work to @stopper. if offline, @work is completed immediately */ 64/* queue @work to @stopper. if offline, @work is completed immediately */
65static void cpu_stop_queue_work(struct cpu_stopper *stopper, 65static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
66 struct cpu_stop_work *work)
67{ 66{
67 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
68 struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
69
68 unsigned long flags; 70 unsigned long flags;
69 71
70 spin_lock_irqsave(&stopper->lock, flags); 72 spin_lock_irqsave(&stopper->lock, flags);
71 73
72 if (stopper->enabled) { 74 if (stopper->enabled) {
73 list_add_tail(&work->list, &stopper->works); 75 list_add_tail(&work->list, &stopper->works);
74 wake_up_process(stopper->thread); 76 wake_up_process(p);
75 } else 77 } else
76 cpu_stop_signal_done(work->done, false); 78 cpu_stop_signal_done(work->done, false);
77 79
@@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
108 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 110 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
109 111
110 cpu_stop_init_done(&done, 1); 112 cpu_stop_init_done(&done, 1);
111 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); 113 cpu_stop_queue_work(cpu, &work);
112 wait_for_completion(&done.completion); 114 wait_for_completion(&done.completion);
113 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
114} 116}
@@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
130 struct cpu_stop_work *work_buf) 132 struct cpu_stop_work *work_buf)
131{ 133{
132 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 134 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
133 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); 135 cpu_stop_queue_work(cpu, work_buf);
134} 136}
135 137
136/* static data for stop_cpus */ 138/* static data for stop_cpus */
@@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 */ 161 */
160 preempt_disable(); 162 preempt_disable();
161 for_each_cpu(cpu, cpumask) 163 for_each_cpu(cpu, cpumask)
162 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
163 &per_cpu(stop_cpus_work, cpu));
164 preempt_enable(); 165 preempt_enable();
165} 166}
166 167
@@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
244 return ret; 245 return ret;
245} 246}
246 247
247static int cpu_stopper_thread(void *data) 248static int cpu_stop_should_run(unsigned int cpu)
249{
250 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
251 unsigned long flags;
252 int run;
253
254 spin_lock_irqsave(&stopper->lock, flags);
255 run = !list_empty(&stopper->works);
256 spin_unlock_irqrestore(&stopper->lock, flags);
257 return run;
258}
259
260static void cpu_stopper_thread(unsigned int cpu)
248{ 261{
249 struct cpu_stopper *stopper = data; 262 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
250 struct cpu_stop_work *work; 263 struct cpu_stop_work *work;
251 int ret; 264 int ret;
252 265
253repeat: 266repeat:
254 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
255
256 if (kthread_should_stop()) {
257 __set_current_state(TASK_RUNNING);
258 return 0;
259 }
260
261 work = NULL; 267 work = NULL;
262 spin_lock_irq(&stopper->lock); 268 spin_lock_irq(&stopper->lock);
263 if (!list_empty(&stopper->works)) { 269 if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@ repeat:
273 struct cpu_stop_done *done = work->done; 279 struct cpu_stop_done *done = work->done;
274 char ksym_buf[KSYM_NAME_LEN] __maybe_unused; 280 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
275 281
276 __set_current_state(TASK_RUNNING);
277
278 /* cpu stop callbacks are not allowed to sleep */ 282 /* cpu stop callbacks are not allowed to sleep */
279 preempt_disable(); 283 preempt_disable();
280 284
@@ -290,88 +294,55 @@ repeat:
290 ksym_buf), arg); 294 ksym_buf), arg);
291 295
292 cpu_stop_signal_done(done, true); 296 cpu_stop_signal_done(done, true);
293 } else 297 goto repeat;
294 schedule(); 298 }
295
296 goto repeat;
297} 299}
298 300
299extern void sched_set_stop_task(int cpu, struct task_struct *stop); 301extern void sched_set_stop_task(int cpu, struct task_struct *stop);
300 302
301/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 303static void cpu_stop_create(unsigned int cpu)
302static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 304{
303 unsigned long action, void *hcpu) 305 sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
306}
307
308static void cpu_stop_park(unsigned int cpu)
304{ 309{
305 unsigned int cpu = (unsigned long)hcpu;
306 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 310 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
307 struct task_struct *p; 311 struct cpu_stop_work *work;
308 312 unsigned long flags;
309 switch (action & ~CPU_TASKS_FROZEN) {
310 case CPU_UP_PREPARE:
311 BUG_ON(stopper->thread || stopper->enabled ||
312 !list_empty(&stopper->works));
313 p = kthread_create_on_node(cpu_stopper_thread,
314 stopper,
315 cpu_to_node(cpu),
316 "migration/%d", cpu);
317 if (IS_ERR(p))
318 return notifier_from_errno(PTR_ERR(p));
319 get_task_struct(p);
320 kthread_bind(p, cpu);
321 sched_set_stop_task(cpu, p);
322 stopper->thread = p;
323 break;
324
325 case CPU_ONLINE:
326 /* strictly unnecessary, as first user will wake it */
327 wake_up_process(stopper->thread);
328 /* mark enabled */
329 spin_lock_irq(&stopper->lock);
330 stopper->enabled = true;
331 spin_unlock_irq(&stopper->lock);
332 break;
333
334#ifdef CONFIG_HOTPLUG_CPU
335 case CPU_UP_CANCELED:
336 case CPU_POST_DEAD:
337 {
338 struct cpu_stop_work *work;
339
340 sched_set_stop_task(cpu, NULL);
341 /* kill the stopper */
342 kthread_stop(stopper->thread);
343 /* drain remaining works */
344 spin_lock_irq(&stopper->lock);
345 list_for_each_entry(work, &stopper->works, list)
346 cpu_stop_signal_done(work->done, false);
347 stopper->enabled = false;
348 spin_unlock_irq(&stopper->lock);
349 /* release the stopper */
350 put_task_struct(stopper->thread);
351 stopper->thread = NULL;
352 break;
353 }
354#endif
355 }
356 313
357 return NOTIFY_OK; 314 /* drain remaining works */
315 spin_lock_irqsave(&stopper->lock, flags);
316 list_for_each_entry(work, &stopper->works, list)
317 cpu_stop_signal_done(work->done, false);
318 stopper->enabled = false;
319 spin_unlock_irqrestore(&stopper->lock, flags);
358} 320}
359 321
360/* 322static void cpu_stop_unpark(unsigned int cpu)
361 * Give it a higher priority so that cpu stopper is available to other 323{
362 * cpu notifiers. It currently shares the same priority as sched 324 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
363 * migration_notifier. 325
364 */ 326 spin_lock_irq(&stopper->lock);
365static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { 327 stopper->enabled = true;
366 .notifier_call = cpu_stop_cpu_callback, 328 spin_unlock_irq(&stopper->lock);
367 .priority = 10, 329}
330
331static struct smp_hotplug_thread cpu_stop_threads = {
332 .store = &cpu_stopper_task,
333 .thread_should_run = cpu_stop_should_run,
334 .thread_fn = cpu_stopper_thread,
335 .thread_comm = "migration/%u",
336 .create = cpu_stop_create,
337 .setup = cpu_stop_unpark,
338 .park = cpu_stop_park,
339 .unpark = cpu_stop_unpark,
340 .selfparking = true,
368}; 341};
369 342
370static int __init cpu_stop_init(void) 343static int __init cpu_stop_init(void)
371{ 344{
372 void *bcpu = (void *)(long)smp_processor_id();
373 unsigned int cpu; 345 unsigned int cpu;
374 int err;
375 346
376 for_each_possible_cpu(cpu) { 347 for_each_possible_cpu(cpu) {
377 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 348 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)
380 INIT_LIST_HEAD(&stopper->works); 351 INIT_LIST_HEAD(&stopper->works);
381 } 352 }
382 353
383 /* start one for the boot cpu */ 354 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
384 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
385 bcpu);
386 BUG_ON(err != NOTIFY_OK);
387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
388 register_cpu_notifier(&cpu_stop_cpu_notifier);
389
390 stop_machine_initialized = true; 355 stop_machine_initialized = true;
391
392 return 0; 356 return 0;
393} 357}
394early_initcall(cpu_stop_init); 358early_initcall(cpu_stop_init);
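
The stop_machine.c hunks above replace the hand-rolled CPU-hotplug notifier and kthread lifecycle with the smpboot per-CPU thread framework. A hedged sketch of that registration pattern in isolation, with illustrative demo_* names rather than anything from the patch:

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);

static int demo_should_run(unsigned int cpu)
{
	/* return nonzero when this CPU has work queued for its thread */
	return 0;
}

static void demo_thread_fn(unsigned int cpu)
{
	/* runs bound to @cpu whenever demo_should_run() reported work */
}

static struct smp_hotplug_thread demo_threads = {
	.store			= &demo_task,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_thread_fn,
	.thread_comm		= "demo/%u",
};

static int __init demo_init(void)
{
	/* creates, binds and parks/unparks one thread per possible CPU */
	return smpboot_register_percpu_thread(&demo_threads);
}
early_initcall(demo_init);

With this framework the hotplug transitions (create, setup, park, unpark) are driven centrally, which is why the CPU_UP_PREPARE/CPU_ONLINE/CPU_POST_DEAD notifier cases disappear from the stopper code above.
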
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b37690421..81f56445fba9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
47#include <linux/syscalls.h> 47#include <linux/syscalls.h>
48#include <linux/kprobes.h> 48#include <linux/kprobes.h>
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h>
50 51
51#include <linux/kmsg_dump.h> 52#include <linux/kmsg_dump.h>
52/* Move somewhere else to avoid recompiling? */ 53/* Move somewhere else to avoid recompiling? */
@@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex);
433SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, 434SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
434 void __user *, arg) 435 void __user *, arg)
435{ 436{
437 struct pid_namespace *pid_ns = task_active_pid_ns(current);
436 char buffer[256]; 438 char buffer[256];
437 int ret = 0; 439 int ret = 0;
438 440
439 /* We only trust the superuser with rebooting the system. */ 441 /* We only trust the superuser with rebooting the system. */
440 if (!capable(CAP_SYS_BOOT)) 442 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
441 return -EPERM; 443 return -EPERM;
442 444
443 /* For safety, we require "magic" arguments. */ 445 /* For safety, we require "magic" arguments. */
@@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
453 * pid_namespace, the command is handled by reboot_pid_ns() which will 455 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit(). 456 * call do_exit().
455 */ 457 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd); 458 ret = reboot_pid_ns(pid_ns, cmd);
457 if (ret) 459 if (ret)
458 return ret; 460 return ret;
459 461
@@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1794static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793{ 1795{
1794 struct fd exe; 1796 struct fd exe;
1795 struct dentry *dentry; 1797 struct inode *inode;
1796 int err; 1798 int err;
1797 1799
1798 exe = fdget(fd); 1800 exe = fdget(fd);
1799 if (!exe.file) 1801 if (!exe.file)
1800 return -EBADF; 1802 return -EBADF;
1801 1803
1802 dentry = exe.file->f_path.dentry; 1804 inode = file_inode(exe.file);
1803 1805
1804 /* 1806 /*
1805 * Because the original mm->exe_file points to executable file, make 1807 * Because the original mm->exe_file points to executable file, make
@@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1807 * overall picture. 1809 * overall picture.
1808 */ 1810 */
1809 err = -EACCES; 1811 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) || 1812 if (!S_ISREG(inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1813 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit; 1814 goto exit;
1813 1815
1814 err = inode_permission(dentry->d_inode, MAY_EXEC); 1816 err = inode_permission(inode, MAY_EXEC);
1815 if (err) 1817 if (err)
1816 goto exit; 1818 goto exit;
1817 1819
@@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2012 2014
2013 error = 0; 2015 error = 0;
2014 switch (option) { 2016 switch (option) {
2015 case PR_SET_PDEATHSIG: 2017 case PR_SET_PDEATHSIG:
2016 if (!valid_signal(arg2)) { 2018 if (!valid_signal(arg2)) {
2017 error = -EINVAL; 2019 error = -EINVAL;
2018 break;
2019 }
2020 me->pdeath_signal = arg2;
2021 break;
2022 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2);
2024 break;
2025 case PR_GET_DUMPABLE:
2026 error = get_dumpable(me->mm);
2027 break; 2020 break;
2028 case PR_SET_DUMPABLE: 2021 }
2029 if (arg2 < 0 || arg2 > 1) { 2022 me->pdeath_signal = arg2;
2030 error = -EINVAL; 2023 break;
2031 break; 2024 case PR_GET_PDEATHSIG:
2032 } 2025 error = put_user(me->pdeath_signal, (int __user *)arg2);
2033 set_dumpable(me->mm, arg2); 2026 break;
2027 case PR_GET_DUMPABLE:
2028 error = get_dumpable(me->mm);
2029 break;
2030 case PR_SET_DUMPABLE:
2031 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
2032 error = -EINVAL;
2034 break; 2033 break;
2034 }
2035 set_dumpable(me->mm, arg2);
2036 break;
2035 2037
2036 case PR_SET_UNALIGN: 2038 case PR_SET_UNALIGN:
2037 error = SET_UNALIGN_CTL(me, arg2); 2039 error = SET_UNALIGN_CTL(me, arg2);
2038 break; 2040 break;
2039 case PR_GET_UNALIGN: 2041 case PR_GET_UNALIGN:
2040 error = GET_UNALIGN_CTL(me, arg2); 2042 error = GET_UNALIGN_CTL(me, arg2);
2041 break; 2043 break;
2042 case PR_SET_FPEMU: 2044 case PR_SET_FPEMU:
2043 error = SET_FPEMU_CTL(me, arg2); 2045 error = SET_FPEMU_CTL(me, arg2);
2044 break; 2046 break;
2045 case PR_GET_FPEMU: 2047 case PR_GET_FPEMU:
2046 error = GET_FPEMU_CTL(me, arg2); 2048 error = GET_FPEMU_CTL(me, arg2);
2047 break; 2049 break;
2048 case PR_SET_FPEXC: 2050 case PR_SET_FPEXC:
2049 error = SET_FPEXC_CTL(me, arg2); 2051 error = SET_FPEXC_CTL(me, arg2);
2050 break; 2052 break;
2051 case PR_GET_FPEXC: 2053 case PR_GET_FPEXC:
2052 error = GET_FPEXC_CTL(me, arg2); 2054 error = GET_FPEXC_CTL(me, arg2);
2053 break; 2055 break;
2054 case PR_GET_TIMING: 2056 case PR_GET_TIMING:
2055 error = PR_TIMING_STATISTICAL; 2057 error = PR_TIMING_STATISTICAL;
2056 break; 2058 break;
2057 case PR_SET_TIMING: 2059 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 2060 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 2061 error = -EINVAL;
2060 break; 2062 break;
2061 case PR_SET_NAME: 2063 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 2064 comm[sizeof(me->comm) - 1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 2065 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 2066 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 2067 return -EFAULT;
2066 set_task_comm(me, comm); 2068 set_task_comm(me, comm);
2067 proc_comm_connector(me); 2069 proc_comm_connector(me);
2068 break; 2070 break;
2069 case PR_GET_NAME: 2071 case PR_GET_NAME:
2070 get_task_comm(comm, me); 2072 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 2073 if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
2072 sizeof(comm))) 2074 return -EFAULT;
2073 return -EFAULT; 2075 break;
2074 break; 2076 case PR_GET_ENDIAN:
2075 case PR_GET_ENDIAN: 2077 error = GET_ENDIAN(me, arg2);
2076 error = GET_ENDIAN(me, arg2); 2078 break;
2077 break; 2079 case PR_SET_ENDIAN:
2078 case PR_SET_ENDIAN: 2080 error = SET_ENDIAN(me, arg2);
2079 error = SET_ENDIAN(me, arg2); 2081 break;
2080 break; 2082 case PR_GET_SECCOMP:
2081 case PR_GET_SECCOMP: 2083 error = prctl_get_seccomp();
2082 error = prctl_get_seccomp(); 2084 break;
2083 break; 2085 case PR_SET_SECCOMP:
2084 case PR_SET_SECCOMP: 2086 error = prctl_set_seccomp(arg2, (char __user *)arg3);
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2087 break;
2086 break; 2088 case PR_GET_TSC:
2087 case PR_GET_TSC: 2089 error = GET_TSC_CTL(arg2);
2088 error = GET_TSC_CTL(arg2); 2090 break;
2089 break; 2091 case PR_SET_TSC:
2090 case PR_SET_TSC: 2092 error = SET_TSC_CTL(arg2);
2091 error = SET_TSC_CTL(arg2); 2093 break;
2092 break; 2094 case PR_TASK_PERF_EVENTS_DISABLE:
2093 case PR_TASK_PERF_EVENTS_DISABLE: 2095 error = perf_event_task_disable();
2094 error = perf_event_task_disable(); 2096 break;
2095 break; 2097 case PR_TASK_PERF_EVENTS_ENABLE:
2096 case PR_TASK_PERF_EVENTS_ENABLE: 2098 error = perf_event_task_enable();
2097 error = perf_event_task_enable(); 2099 break;
2098 break; 2100 case PR_GET_TIMERSLACK:
2099 case PR_GET_TIMERSLACK: 2101 error = current->timer_slack_ns;
2100 error = current->timer_slack_ns; 2102 break;
2101 break; 2103 case PR_SET_TIMERSLACK:
2102 case PR_SET_TIMERSLACK: 2104 if (arg2 <= 0)
2103 if (arg2 <= 0) 2105 current->timer_slack_ns =
2104 current->timer_slack_ns =
2105 current->default_timer_slack_ns; 2106 current->default_timer_slack_ns;
2106 else 2107 else
2107 current->timer_slack_ns = arg2; 2108 current->timer_slack_ns = arg2;
2108 break; 2109 break;
2109 case PR_MCE_KILL: 2110 case PR_MCE_KILL:
2110 if (arg4 | arg5) 2111 if (arg4 | arg5)
2111 return -EINVAL; 2112 return -EINVAL;
2112 switch (arg2) { 2113 switch (arg2) {
2113 case PR_MCE_KILL_CLEAR: 2114 case PR_MCE_KILL_CLEAR:
2114 if (arg3 != 0) 2115 if (arg3 != 0)
2115 return -EINVAL;
2116 current->flags &= ~PF_MCE_PROCESS;
2117 break;
2118 case PR_MCE_KILL_SET:
2119 current->flags |= PF_MCE_PROCESS;
2120 if (arg3 == PR_MCE_KILL_EARLY)
2121 current->flags |= PF_MCE_EARLY;
2122 else if (arg3 == PR_MCE_KILL_LATE)
2123 current->flags &= ~PF_MCE_EARLY;
2124 else if (arg3 == PR_MCE_KILL_DEFAULT)
2125 current->flags &=
2126 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2127 else
2128 return -EINVAL;
2129 break;
2130 default:
2131 return -EINVAL; 2116 return -EINVAL;
2132 } 2117 current->flags &= ~PF_MCE_PROCESS;
2133 break; 2118 break;
2134 case PR_MCE_KILL_GET: 2119 case PR_MCE_KILL_SET:
2135 if (arg2 | arg3 | arg4 | arg5) 2120 current->flags |= PF_MCE_PROCESS;
2136 return -EINVAL; 2121 if (arg3 == PR_MCE_KILL_EARLY)
2137 if (current->flags & PF_MCE_PROCESS) 2122 current->flags |= PF_MCE_EARLY;
2138 error = (current->flags & PF_MCE_EARLY) ? 2123 else if (arg3 == PR_MCE_KILL_LATE)
2139 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2124 current->flags &= ~PF_MCE_EARLY;
2125 else if (arg3 == PR_MCE_KILL_DEFAULT)
2126 current->flags &=
2127 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2140 else 2128 else
2141 error = PR_MCE_KILL_DEFAULT;
2142 break;
2143 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break;
2146 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break;
2149 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2;
2151 break;
2152 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2);
2155 break;
2156 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL; 2129 return -EINVAL;
2159
2160 current->no_new_privs = 1;
2161 break; 2130 break;
2162 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0;
2166 default: 2131 default:
2167 error = -EINVAL; 2132 return -EINVAL;
2168 break; 2133 }
2134 break;
2135 case PR_MCE_KILL_GET:
2136 if (arg2 | arg3 | arg4 | arg5)
2137 return -EINVAL;
2138 if (current->flags & PF_MCE_PROCESS)
2139 error = (current->flags & PF_MCE_EARLY) ?
2140 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2141 else
2142 error = PR_MCE_KILL_DEFAULT;
2143 break;
2144 case PR_SET_MM:
2145 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2146 break;
2147 case PR_GET_TID_ADDRESS:
2148 error = prctl_get_tid_address(me, (int __user **)arg2);
2149 break;
2150 case PR_SET_CHILD_SUBREAPER:
2151 me->signal->is_child_subreaper = !!arg2;
2152 break;
2153 case PR_GET_CHILD_SUBREAPER:
2154 error = put_user(me->signal->is_child_subreaper,
2155 (int __user *)arg2);
2156 break;
2157 case PR_SET_NO_NEW_PRIVS:
2158 if (arg2 != 1 || arg3 || arg4 || arg5)
2159 return -EINVAL;
2160
2161 current->no_new_privs = 1;
2162 break;
2163 case PR_GET_NO_NEW_PRIVS:
2164 if (arg2 || arg3 || arg4 || arg5)
2165 return -EINVAL;
2166 return current->no_new_privs ? 1 : 0;
2167 default:
2168 error = -EINVAL;
2169 break;
2169 } 2170 }
2170 return error; 2171 return error;
2171} 2172}
@@ -2184,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2184 2185
2185char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2186char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2186 2187
2187static void argv_cleanup(struct subprocess_info *info)
2188{
2189 argv_free(info->argv);
2190}
2191
2192static int __orderly_poweroff(void) 2188static int __orderly_poweroff(void)
2193{ 2189{
2194 int argc; 2190 int argc;
@@ -2208,9 +2204,8 @@ static int __orderly_poweroff(void)
2208 } 2204 }
2209 2205
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2206 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
2211 NULL, argv_cleanup, NULL); 2207 NULL, NULL, NULL);
2212 if (ret == -ENOMEM) 2208 argv_free(argv);
2213 argv_free(argv);
2214 2209
2215 return ret; 2210 return ret;
2216} 2211}
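
Besides the reindentation, the prctl() hunk above switches PR_SET_DUMPABLE to the symbolic SUID_DUMP_DISABLE (0) and SUID_DUMP_USER (1) values; SUID_DUMP_ROOT (2) stays reachable only through the suid_dumpable sysctl and is rejected here with EINVAL. A small userspace sketch of that behaviour (illustrative, not from the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>

int main(void)
{
	/* SUID_DUMP_USER: accepted */
	if (prctl(PR_SET_DUMPABLE, 1) != 0)
		printf("PR_SET_DUMPABLE 1 failed: %s\n", strerror(errno));

	/* SUID_DUMP_ROOT is only reachable via the suid_dumpable sysctl */
	if (prctl(PR_SET_DUMPABLE, 2) != 0)
		printf("PR_SET_DUMPABLE 2 rejected: %s\n", strerror(errno));

	printf("dumpable is now %d\n", prctl(PR_GET_DUMPABLE));
	return 0;
}
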
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c88878db491e..d1b4ee67d2df 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
61#include <linux/kmod.h> 61#include <linux/kmod.h>
62#include <linux/capability.h> 62#include <linux/capability.h>
63#include <linux/binfmts.h> 63#include <linux/binfmts.h>
64#include <linux/sched/sysctl.h>
64 65
65#include <asm/uaccess.h> 66#include <asm/uaccess.h>
66#include <asm/processor.h> 67#include <asm/processor.h>
@@ -104,7 +105,6 @@ extern char core_pattern[];
104extern unsigned int core_pipe_limit; 105extern unsigned int core_pipe_limit;
105#endif 106#endif
106extern int pid_max; 107extern int pid_max;
107extern int min_free_kbytes;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches; 109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 110extern int percpu_pagelist_fraction;
@@ -161,10 +161,13 @@ extern int unaligned_enabled;
161#endif 161#endif
162 162
163#ifdef CONFIG_IA64 163#ifdef CONFIG_IA64
164extern int no_unaligned_warning;
165extern int unaligned_dump_stack; 164extern int unaligned_dump_stack;
166#endif 165#endif
167 166
167#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
168extern int no_unaligned_warning;
169#endif
170
168#ifdef CONFIG_PROC_SYSCTL 171#ifdef CONFIG_PROC_SYSCTL
169static int proc_do_cad_pid(struct ctl_table *table, int write, 172static int proc_do_cad_pid(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 173 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -403,6 +406,13 @@ static struct ctl_table kern_table[] = {
403 .mode = 0644, 406 .mode = 0644,
404 .proc_handler = sched_rt_handler, 407 .proc_handler = sched_rt_handler,
405 }, 408 },
409 {
410 .procname = "sched_rr_timeslice_ms",
411 .data = &sched_rr_timeslice,
412 .maxlen = sizeof(int),
413 .mode = 0644,
414 .proc_handler = sched_rr_handler,
415 },
406#ifdef CONFIG_SCHED_AUTOGROUP 416#ifdef CONFIG_SCHED_AUTOGROUP
407 { 417 {
408 .procname = "sched_autogroup_enabled", 418 .procname = "sched_autogroup_enabled",
@@ -911,7 +921,7 @@ static struct ctl_table kern_table[] = {
911 .proc_handler = proc_doulongvec_minmax, 921 .proc_handler = proc_doulongvec_minmax,
912 }, 922 },
913#endif 923#endif
914#ifdef CONFIG_IA64 924#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
915 { 925 {
916 .procname = "ignore-unaligned-usertrap", 926 .procname = "ignore-unaligned-usertrap",
917 .data = &no_unaligned_warning, 927 .data = &no_unaligned_warning,
@@ -919,6 +929,8 @@ static struct ctl_table kern_table[] = {
919 .mode = 0644, 929 .mode = 0644,
920 .proc_handler = proc_dointvec, 930 .proc_handler = proc_dointvec,
921 }, 931 },
932#endif
933#ifdef CONFIG_IA64
922 { 934 {
923 .procname = "unaligned-dump-stack", 935 .procname = "unaligned-dump-stack",
924 .data = &unaligned_dump_stack, 936 .data = &unaligned_dump_stack,
@@ -2006,7 +2018,7 @@ static int proc_taint(struct ctl_table *table, int write,
2006 int i; 2018 int i;
2007 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { 2019 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2008 if ((tmptaint >> i) & 1) 2020 if ((tmptaint >> i) & 1)
2009 add_taint(i); 2021 add_taint(i, LOCKDEP_STILL_OK);
2010 } 2022 }
2011 } 2023 }
2012 2024
@@ -2083,7 +2095,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2083static void validate_coredump_safety(void) 2095static void validate_coredump_safety(void)
2084{ 2096{
2085#ifdef CONFIG_COREDUMP 2097#ifdef CONFIG_COREDUMP
2086 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2098 if (suid_dumpable == SUID_DUMP_ROOT &&
2087 core_pattern[0] != '/' && core_pattern[0] != '|') { 2099 core_pattern[0] != '/' && core_pattern[0] != '|') {
2088 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2100 printk(KERN_WARNING "Unsafe core_pattern used with "\
2089 "suid_dumpable=2. Pipe handler or fully qualified "\ 2101 "suid_dumpable=2. Pipe handler or fully qualified "\
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a6384450501..ebf72358e86a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, 387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, 388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, 389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
971static ssize_t bin_intvec(struct file *file, 970static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 971 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{ 972{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0; 973 ssize_t copied = 0;
976 char *buffer; 974 char *buffer;
977 ssize_t result; 975 ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
984 if (oldval && oldlen) { 982 if (oldval && oldlen) {
985 unsigned __user *vec = oldval; 983 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec); 984 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end; 985 char *str, *end;
989 int i; 986 int i;
990 987
991 set_fs(KERNEL_DS); 988 result = kernel_read(file, 0, buffer, BUFSZ - 1);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0) 989 if (result < 0)
995 goto out_kfree; 990 goto out_kfree;
996 991
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
1017 if (newval && newlen) { 1012 if (newval && newlen) {
1018 unsigned __user *vec = newval; 1013 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec); 1014 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end; 1015 char *str, *end;
1022 int i; 1016 int i;
1023 1017
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file,
1033 str += snprintf(str, end - str, "%lu\t", value); 1027 str += snprintf(str, end - str, "%lu\t", value);
1034 } 1028 }
1035 1029
1036 set_fs(KERNEL_DS); 1030 result = kernel_write(file, buffer, str - buffer, 0);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0) 1031 if (result < 0)
1040 goto out_kfree; 1032 goto out_kfree;
1041 } 1033 }
@@ -1049,7 +1041,6 @@ out:
1049static ssize_t bin_ulongvec(struct file *file, 1041static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1042 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{ 1043{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0; 1044 ssize_t copied = 0;
1054 char *buffer; 1045 char *buffer;
1055 ssize_t result; 1046 ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
1062 if (oldval && oldlen) { 1053 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval; 1054 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec); 1055 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end; 1056 char *str, *end;
1067 int i; 1057 int i;
1068 1058
1069 set_fs(KERNEL_DS); 1059 result = kernel_read(file, 0, buffer, BUFSZ - 1);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0) 1060 if (result < 0)
1073 goto out_kfree; 1061 goto out_kfree;
1074 1062
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (newval && newlen) { 1083 if (newval && newlen) {
1096 unsigned long __user *vec = newval; 1084 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec); 1085 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end; 1086 char *str, *end;
1100 int i; 1087 int i;
1101 1088
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file,
1111 str += snprintf(str, end - str, "%lu\t", value); 1098 str += snprintf(str, end - str, "%lu\t", value);
1112 } 1099 }
1113 1100
1114 set_fs(KERNEL_DS); 1101 result = kernel_write(file, buffer, str - buffer, 0);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0) 1102 if (result < 0)
1118 goto out_kfree; 1103 goto out_kfree;
1119 } 1104 }
@@ -1127,19 +1112,15 @@ out:
1127static ssize_t bin_uuid(struct file *file, 1112static ssize_t bin_uuid(struct file *file,
1128 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1113 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1129{ 1114{
1130 mm_segment_t old_fs = get_fs();
1131 ssize_t result, copied = 0; 1115 ssize_t result, copied = 0;
1132 1116
1133 /* Only supports reads */ 1117 /* Only supports reads */
1134 if (oldval && oldlen) { 1118 if (oldval && oldlen) {
1135 loff_t pos = 0;
1136 char buf[40], *str = buf; 1119 char buf[40], *str = buf;
1137 unsigned char uuid[16]; 1120 unsigned char uuid[16];
1138 int i; 1121 int i;
1139 1122
1140 set_fs(KERNEL_DS); 1123 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1141 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1142 set_fs(old_fs);
1143 if (result < 0) 1124 if (result < 0)
1144 goto out; 1125 goto out;
1145 1126
@@ -1175,18 +1156,14 @@ out:
1175static ssize_t bin_dn_node_address(struct file *file, 1156static ssize_t bin_dn_node_address(struct file *file,
1176 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1157 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1177{ 1158{
1178 mm_segment_t old_fs = get_fs();
1179 ssize_t result, copied = 0; 1159 ssize_t result, copied = 0;
1180 1160
1181 if (oldval && oldlen) { 1161 if (oldval && oldlen) {
1182 loff_t pos = 0;
1183 char buf[15], *nodep; 1162 char buf[15], *nodep;
1184 unsigned long area, node; 1163 unsigned long area, node;
1185 __le16 dnaddr; 1164 __le16 dnaddr;
1186 1165
1187 set_fs(KERNEL_DS); 1166 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1188 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1189 set_fs(old_fs);
1190 if (result < 0) 1167 if (result < 0)
1191 goto out; 1168 goto out;
1192 1169
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
1194 1171
1195 /* Convert the decnet address to binary */ 1172 /* Convert the decnet address to binary */
1196 result = -EIO; 1173 result = -EIO;
1197 nodep = strchr(buf, '.') + 1; 1174 nodep = strchr(buf, '.');
1198 if (!nodep) 1175 if (!nodep)
1199 goto out; 1176 goto out;
1177 ++nodep;
1200 1178
1201 area = simple_strtoul(buf, NULL, 10); 1179 area = simple_strtoul(buf, NULL, 10);
1202 node = simple_strtoul(nodep, NULL, 10); 1180 node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
1215 } 1193 }
1216 1194
1217 if (newval && newlen) { 1195 if (newval && newlen) {
1218 loff_t pos = 0;
1219 __le16 dnaddr; 1196 __le16 dnaddr;
1220 char buf[15]; 1197 char buf[15];
1221 int len; 1198 int len;
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1232 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1233 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1234 1211
1235 set_fs(KERNEL_DS); 1212 result = kernel_write(file, buf, len, 0);
1236 result = vfs_write(file, buf, len, &pos);
1237 set_fs(old_fs);
1238 if (result < 0) 1213 if (result < 0)
1239 goto out; 1214 goto out;
1240 } 1215 }
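
All of the sysctl_binary.c hunks above drop the set_fs(KERNEL_DS) / vfs_read() / set_fs(old_fs) dance in favour of kernel_read()/kernel_write(), which take kernel buffers directly and carry the offset as a plain argument. A hedged sketch of the resulting idiom; read_first_line is an illustrative helper, not part of the patch:

#include <linux/fs.h>

/* illustrative helper: read the start of @file into a kernel buffer */
static ssize_t read_first_line(struct file *file, char *buf, size_t len)
{
	ssize_t n = kernel_read(file, 0, buf, len - 1);	/* offset 0 */

	if (n >= 0)
		buf[n] = '\0';
	return n;
}
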
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd28..f8342a41efa6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
115} 115}
116 116
117/* 117/*
118 * Indicates if there is an offset between the system clock and the hardware
119 * clock/persistent clock/rtc.
120 */
121int persistent_clock_is_local;
122
123/*
118 * Adjust the time obtained from the CMOS to be UTC time instead of 124 * Adjust the time obtained from the CMOS to be UTC time instead of
119 * local time. 125 * local time.
120 * 126 *
@@ -135,6 +141,8 @@ static inline void warp_clock(void)
135 struct timespec adjust; 141 struct timespec adjust;
136 142
137 adjust = current_kernel_time(); 143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1;
138 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 146 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 do_settimeofday(&adjust); 147 do_settimeofday(&adjust);
140} 148}
@@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);
232 * Avoid unnecessary multiplications/divisions in the 240 * Avoid unnecessary multiplications/divisions in the
233 * two most common HZ cases: 241 * two most common HZ cases:
234 */ 242 */
235inline unsigned int jiffies_to_msecs(const unsigned long j) 243unsigned int jiffies_to_msecs(const unsigned long j)
236{ 244{
237#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 245#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
238 return (MSEC_PER_SEC / HZ) * j; 246 return (MSEC_PER_SEC / HZ) * j;
@@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
248} 256}
249EXPORT_SYMBOL(jiffies_to_msecs); 257EXPORT_SYMBOL(jiffies_to_msecs);
250 258
251inline unsigned int jiffies_to_usecs(const unsigned long j) 259unsigned int jiffies_to_usecs(const unsigned long j)
252{ 260{
253#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 261#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
254 return (USEC_PER_SEC / HZ) * j; 262 return (USEC_PER_SEC / HZ) * j;
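
As a worked example of the fast paths in jiffies_to_msecs() above (common HZ values assumed, nothing added by this patch):

  HZ = 1000:  MSEC_PER_SEC % HZ == 0, so msecs = (1000 / 1000) * j = j
  HZ = 250:   MSEC_PER_SEC % HZ == 0, so msecs = (1000 / 250) * j = 4 * j
  HZ = 300:   1000 % 300 != 0, so the function falls through to the
              HZ_TO_MSEC_MUL32/SHR32 (32-bit) or HZ_TO_MSEC_NUM/DEN (64-bit)
              constants generated by kernel/timeconst.* further down.
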
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db1261..24510d84efd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
 15# Platforms have a persistent clock
16config ALWAYS_USE_PERSISTENT_CLOCK
17 bool
18 default n
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD
38 default y 43 default y
39 depends on GENERIC_CLOCKEVENTS 44 depends on GENERIC_CLOCKEVENTS
40 45
46# Architecture can handle broadcast in a driver-agnostic way
47config ARCH_HAS_TICK_BROADCAST
48 bool
49
41# Clockevents broadcasting infrastructure 50# Clockevents broadcasting infrastructure
42config GENERIC_CLOCKEVENTS_BROADCAST 51config GENERIC_CLOCKEVENTS_BROADCAST
43 bool 52 bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977c..c6d6400ee137 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,
339 clockevents_config(dev, freq); 339 clockevents_config(dev, freq);
340 clockevents_register_device(dev); 340 clockevents_register_device(dev);
341} 341}
342EXPORT_SYMBOL_GPL(clockevents_config_and_register);
342 343
343/** 344/**
344 * clockevents_update_freq - Update frequency and reprogram a clock event device. 345 * clockevents_update_freq - Update frequency and reprogram a clock event device.
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669b..072bb066bb7d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rtc.h>
18 19
19#include "tick-internal.h" 20#include "tick-internal.h"
20 21
@@ -22,7 +23,7 @@
22 * NTP timekeeping variables: 23 * NTP timekeeping variables:
23 */ 24 */
24 25
25DEFINE_SPINLOCK(ntp_lock); 26DEFINE_RAW_SPINLOCK(ntp_lock);
26 27
27 28
28/* USER_HZ period (usecs): */ 29/* USER_HZ period (usecs): */
@@ -347,7 +348,7 @@ void ntp_clear(void)
347{ 348{
348 unsigned long flags; 349 unsigned long flags;
349 350
350 spin_lock_irqsave(&ntp_lock, flags); 351 raw_spin_lock_irqsave(&ntp_lock, flags);
351 352
352 time_adjust = 0; /* stop active adjtime() */ 353 time_adjust = 0; /* stop active adjtime() */
353 time_status |= STA_UNSYNC; 354 time_status |= STA_UNSYNC;
@@ -361,7 +362,7 @@ void ntp_clear(void)
361 362
362 /* Clear PPS state variables */ 363 /* Clear PPS state variables */
363 pps_clear(); 364 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags); 365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
365 366
366} 367}
367 368
@@ -371,9 +372,9 @@ u64 ntp_tick_length(void)
371 unsigned long flags; 372 unsigned long flags;
372 s64 ret; 373 s64 ret;
373 374
374 spin_lock_irqsave(&ntp_lock, flags); 375 raw_spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length; 376 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags); 377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret; 378 return ret;
378} 379}
379 380
@@ -394,7 +395,7 @@ int second_overflow(unsigned long secs)
394 int leap = 0; 395 int leap = 0;
395 unsigned long flags; 396 unsigned long flags;
396 397
397 spin_lock_irqsave(&ntp_lock, flags); 398 raw_spin_lock_irqsave(&ntp_lock, flags);
398 399
399 /* 400 /*
400 * Leap second processing. If in leap-insert state at the end of the 401 * Leap second processing. If in leap-insert state at the end of the
@@ -478,13 +479,12 @@ int second_overflow(unsigned long secs)
478 time_adjust = 0; 479 time_adjust = 0;
479 480
480out: 481out:
481 spin_unlock_irqrestore(&ntp_lock, flags); 482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
482 483
483 return leap; 484 return leap;
484} 485}
485 486
486#ifdef CONFIG_GENERIC_CMOS_UPDATE 487#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
487
488static void sync_cmos_clock(struct work_struct *work); 488static void sync_cmos_clock(struct work_struct *work);
489 489
490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)
510 } 510 }
511 511
512 getnstimeofday(&now); 512 getnstimeofday(&now);
513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
514 fail = update_persistent_clock(now); 514 struct timespec adjust = now;
515
516 fail = -ENODEV;
517 if (persistent_clock_is_local)
518 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
519#ifdef CONFIG_GENERIC_CMOS_UPDATE
520 fail = update_persistent_clock(adjust);
521#endif
522#ifdef CONFIG_RTC_SYSTOHC
523 if (fail == -ENODEV)
524 fail = rtc_set_ntp_time(adjust);
525#endif
526 }
515 527
516 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); 528 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
517 if (next.tv_nsec <= 0) 529 if (next.tv_nsec <= 0)
518 next.tv_nsec += NSEC_PER_SEC; 530 next.tv_nsec += NSEC_PER_SEC;
519 531
520 if (!fail) 532 if (!fail || fail == -ENODEV)
521 next.tv_sec = 659; 533 next.tv_sec = 659;
522 else 534 else
523 next.tv_sec = 0; 535 next.tv_sec = 0;
@@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc)
660 672
661 getnstimeofday(&ts); 673 getnstimeofday(&ts);
662 674
663 spin_lock_irq(&ntp_lock); 675 raw_spin_lock_irq(&ntp_lock);
664 676
665 if (txc->modes & ADJ_ADJTIME) { 677 if (txc->modes & ADJ_ADJTIME) {
666 long save_adjust = time_adjust; 678 long save_adjust = time_adjust;
@@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc)
702 /* fill PPS status fields */ 714 /* fill PPS status fields */
703 pps_fill_timex(txc); 715 pps_fill_timex(txc);
704 716
705 spin_unlock_irq(&ntp_lock); 717 raw_spin_unlock_irq(&ntp_lock);
706 718
707 txc->time.tv_sec = ts.tv_sec; 719 txc->time.tv_sec = ts.tv_sec;
708 txc->time.tv_usec = ts.tv_nsec; 720 txc->time.tv_usec = ts.tv_nsec;
@@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900 912
901 pts_norm = pps_normalize_ts(*phase_ts); 913 pts_norm = pps_normalize_ts(*phase_ts);
902 914
903 spin_lock_irqsave(&ntp_lock, flags); 915 raw_spin_lock_irqsave(&ntp_lock, flags);
904 916
905 /* clear the error bits, they will be set again if needed */ 917 /* clear the error bits, they will be set again if needed */
906 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
913 * just start the frequency interval */ 925 * just start the frequency interval */
914 if (unlikely(pps_fbase.tv_sec == 0)) { 926 if (unlikely(pps_fbase.tv_sec == 0)) {
915 pps_fbase = *raw_ts; 927 pps_fbase = *raw_ts;
916 spin_unlock_irqrestore(&ntp_lock, flags); 928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
917 return; 929 return;
918 } 930 }
919 931
@@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
928 time_status |= STA_PPSJITTER; 940 time_status |= STA_PPSJITTER;
929 /* restart the frequency calibration interval */ 941 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts; 942 pps_fbase = *raw_ts;
931 spin_unlock_irqrestore(&ntp_lock, flags); 943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
932 pr_err("hardpps: PPSJITTER: bad pulse\n"); 944 pr_err("hardpps: PPSJITTER: bad pulse\n");
933 return; 945 return;
934 } 946 }
@@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
945 957
946 hardpps_update_phase(pts_norm.nsec); 958 hardpps_update_phase(pts_norm.nsec);
947 959
948 spin_unlock_irqrestore(&ntp_lock, flags); 960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
949} 961}
950EXPORT_SYMBOL(hardpps); 962EXPORT_SYMBOL(hardpps);
951 963
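
Most of the ntp.c churn above converts ntp_lock to a raw spinlock, which keeps spinning (rather than turning into a sleeping lock) on PREEMPT_RT and is therefore safe in the timer-tick path. A minimal sketch of the idiom, with illustrative demo_* names:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);	/* illustrative, not from the patch */
static int demo_state;

static void demo_update(int v)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_lock, flags);
	demo_state = v;
	raw_spin_unlock_irqrestore(&demo_lock, flags);
}

The remaining hunk lets sync_cmos_clock() fall back to rtc_set_ntp_time() when no CMOS update routine claims the write, after converting the timestamp back to local time when persistent_clock_is_local is set.
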
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..2fb8cb88df8d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h>
21 22
22#include "tick-internal.h" 23#include "tick-internal.h"
23 24
@@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
86 return (dev && tick_broadcast_device.evtdev == dev); 87 return (dev && tick_broadcast_device.evtdev == dev);
87} 88}
88 89
90static void err_broadcast(const struct cpumask *mask)
91{
92 pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
93}
94
95static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
96{
97 if (!dev->broadcast)
98 dev->broadcast = tick_broadcast;
99 if (!dev->broadcast) {
100 pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
101 dev->name);
102 dev->broadcast = err_broadcast;
103 }
104}
105
89/* 106/*
90 * Check, if the device is disfunctional and a place holder, which 107 * Check, if the device is disfunctional and a place holder, which
91 * needs to be handled by the broadcast device. 108 * needs to be handled by the broadcast device.
@@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
105 */ 122 */
106 if (!tick_device_is_functional(dev)) { 123 if (!tick_device_is_functional(dev)) {
107 dev->event_handler = tick_handle_periodic; 124 dev->event_handler = tick_handle_periodic;
125 tick_device_setup_broadcast_func(dev);
108 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 126 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
109 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 127 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
110 ret = 1; 128 ret = 1;
@@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 */ 134 */
117 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 135 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
118 int cpu = smp_processor_id(); 136 int cpu = smp_processor_id();
119
120 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 137 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
121 tick_broadcast_clear_oneshot(cpu); 138 tick_broadcast_clear_oneshot(cpu);
139 } else {
140 tick_device_setup_broadcast_func(dev);
122 } 141 }
123 } 142 }
124 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 143 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
125 return ret; 144 return ret;
126} 145}
127 146
147#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
148int tick_receive_broadcast(void)
149{
150 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
151 struct clock_event_device *evt = td->evtdev;
152
153 if (!evt)
154 return -ENODEV;
155
156 if (!evt->event_handler)
157 return -EINVAL;
158
159 evt->event_handler(evt);
160 return 0;
161}
162#endif
163
128/* 164/*
129 * Broadcast the event to the cpus, which are set in the mask (mangled). 165 * Broadcast the event to the cpus, which are set in the mask (mangled).
130 */ 166 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552d9fd1..314b9ee07edf 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -28,7 +29,7 @@
28/* 29/*
29 * Per cpu nohz control structure 30 * Per cpu nohz control structure
30 */ 31 */
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 32DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 33
33/* 34/*
34 * The time, when the last jiffy update happened. Protected by jiffies_lock. 35 * The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
331 time_delta = timekeeping_max_deferment(); 332 time_delta = timekeeping_max_deferment();
332 } while (read_seqretry(&jiffies_lock, seq)); 333 } while (read_seqretry(&jiffies_lock, seq));
333 334
334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 335 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
335 arch_needs_cpu(cpu)) { 336 arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
336 next_jiffies = last_jiffies + 1; 337 next_jiffies = last_jiffies + 1;
337 delta_jiffies = 1; 338 delta_jiffies = 1;
338 } else { 339 } else {
@@ -631,8 +632,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
631 632
632static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 633static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
633{ 634{
634#ifndef CONFIG_VIRT_CPU_ACCOUNTING 635#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
635 unsigned long ticks; 636 unsigned long ticks;
637
638 if (vtime_accounting_enabled())
639 return;
636 /* 640 /*
637 * We stopped the tick in idle. Update process times would miss the 641 * We stopped the tick in idle. Update process times would miss the
638 * time we slept as update_process_times does only a 1 tick 642 * time we slept as update_process_times does only a 1 tick
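
The irq_work_needs_cpu() test added above keeps the tick running while this CPU still has pending irq_work to process. A small sketch of what creates that pending state (demo_* names are illustrative, and the one-time init is shown inline for brevity):

#include <linux/irq_work.h>

static void demo_fn(struct irq_work *work)
{
	/* executed later from the irq_work path, e.g. on the next tick */
}

static struct irq_work demo_work;

static void demo_queue(void)
{
	init_irq_work(&demo_work, demo_fn);
	/* once queued, irq_work_needs_cpu() is true and the tick stays on */
	irq_work_queue(&demo_work);
}
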
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb0db3f..9a0bc98fbe1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -29,6 +29,9 @@ static struct timekeeper timekeeper;
29/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
31 31
32/* Flag for if there is a persistent clock on this platform */
33bool __read_mostly persistent_clock_exist = false;
34
32static inline void tk_normalize_xtime(struct timekeeper *tk) 35static inline void tk_normalize_xtime(struct timekeeper *tk)
33{ 36{
34 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 37 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -135,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
135} 138}
136 139
137/* Timekeeper helper functions. */ 140/* Timekeeper helper functions. */
141
142#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
143u32 (*arch_gettimeoffset)(void);
144
145u32 get_arch_timeoffset(void)
146{
147 if (likely(arch_gettimeoffset))
148 return arch_gettimeoffset();
149 return 0;
150}
151#else
152static inline u32 get_arch_timeoffset(void) { return 0; }
153#endif
154
138static inline s64 timekeeping_get_ns(struct timekeeper *tk) 155static inline s64 timekeeping_get_ns(struct timekeeper *tk)
139{ 156{
140 cycle_t cycle_now, cycle_delta; 157 cycle_t cycle_now, cycle_delta;
@@ -151,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
151 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 168 nsec = cycle_delta * tk->mult + tk->xtime_nsec;
152 nsec >>= tk->shift; 169 nsec >>= tk->shift;
153 170
154 /* If arch requires, add in gettimeoffset() */ 171 /* If arch requires, add in get_arch_timeoffset() */
155 return nsec + arch_gettimeoffset(); 172 return nsec + get_arch_timeoffset();
156} 173}
157 174
158static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 175static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
@@ -171,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
171 /* convert delta to nanoseconds. */ 188 /* convert delta to nanoseconds. */
172 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 189 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173 190
174 /* If arch requires, add in gettimeoffset() */ 191 /* If arch requires, add in get_arch_timeoffset() */
175 return nsec + arch_gettimeoffset(); 192 return nsec + get_arch_timeoffset();
176} 193}
177 194
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 195static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
@@ -254,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
254 271
255 tk->xtime_nsec += cycle_delta * tk->mult; 272 tk->xtime_nsec += cycle_delta * tk->mult;
256 273
257 /* If arch requires, add in gettimeoffset() */ 274 /* If arch requires, add in get_arch_timeoffset() */
258 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; 275 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
259 276
260 tk_normalize_xtime(tk); 277 tk_normalize_xtime(tk);
261 278
@@ -264,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)
264} 281}
265 282
266/** 283/**
267 * getnstimeofday - Returns the time of day in a timespec 284 * __getnstimeofday - Returns the time of day in a timespec.
268 * @ts: pointer to the timespec to be set 285 * @ts: pointer to the timespec to be set
269 * 286 *
270 * Returns the time of day in a timespec. 287 * Updates the time of day in the timespec.
288 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
271 */ 289 */
272void getnstimeofday(struct timespec *ts) 290int __getnstimeofday(struct timespec *ts)
273{ 291{
274 struct timekeeper *tk = &timekeeper; 292 struct timekeeper *tk = &timekeeper;
275 unsigned long seq; 293 unsigned long seq;
276 s64 nsecs = 0; 294 s64 nsecs = 0;
277 295
278 WARN_ON(timekeeping_suspended);
279
280 do { 296 do {
281 seq = read_seqbegin(&tk->lock); 297 seq = read_seqbegin(&tk->lock);
282 298
@@ -287,6 +303,26 @@ void getnstimeofday(struct timespec *ts)
287 303
288 ts->tv_nsec = 0; 304 ts->tv_nsec = 0;
289 timespec_add_ns(ts, nsecs); 305 timespec_add_ns(ts, nsecs);
306
307 /*
308 * Do not bail out early, in case there were callers still using
309 * the value, even in the face of the WARN_ON.
310 */
311 if (unlikely(timekeeping_suspended))
312 return -EAGAIN;
313 return 0;
314}
315EXPORT_SYMBOL(__getnstimeofday);
316
317/**
318 * getnstimeofday - Returns the time of day in a timespec.
319 * @ts: pointer to the timespec to be set
320 *
321 * Returns the time of day in a timespec (WARN if suspended).
322 */
323void getnstimeofday(struct timespec *ts)
324{
325 WARN_ON(__getnstimeofday(ts));
290} 326}
291EXPORT_SYMBOL(getnstimeofday); 327EXPORT_SYMBOL(getnstimeofday);
292 328
@@ -640,12 +676,14 @@ void __init timekeeping_init(void)
640 struct timespec now, boot, tmp; 676 struct timespec now, boot, tmp;
641 677
642 read_persistent_clock(&now); 678 read_persistent_clock(&now);
679
643 if (!timespec_valid_strict(&now)) { 680 if (!timespec_valid_strict(&now)) {
644 pr_warn("WARNING: Persistent clock returned invalid value!\n" 681 pr_warn("WARNING: Persistent clock returned invalid value!\n"
645 " Check your CMOS/BIOS settings.\n"); 682 " Check your CMOS/BIOS settings.\n");
646 now.tv_sec = 0; 683 now.tv_sec = 0;
647 now.tv_nsec = 0; 684 now.tv_nsec = 0;
648 } 685 } else if (now.tv_sec || now.tv_nsec)
686 persistent_clock_exist = true;
649 687
650 read_boot_clock(&boot); 688 read_boot_clock(&boot);
651 if (!timespec_valid_strict(&boot)) { 689 if (!timespec_valid_strict(&boot)) {
@@ -718,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
718{ 756{
719 struct timekeeper *tk = &timekeeper; 757 struct timekeeper *tk = &timekeeper;
720 unsigned long flags; 758 unsigned long flags;
721 struct timespec ts;
722 759
723 /* Make sure we don't set the clock twice */ 760 /*
724 read_persistent_clock(&ts); 761 * Make sure we don't set the clock twice, as timekeeping_resume()
725 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 762 * already did it
763 */
764 if (has_persistent_clock())
726 return; 765 return;
727 766
728 write_seqlock_irqsave(&tk->lock, flags); 767 write_seqlock_irqsave(&tk->lock, flags);
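
The new __getnstimeofday() above lets callers that may legitimately run while timekeeping is suspended check for -EAGAIN instead of tripping the WARN_ON in getnstimeofday(). A hedged sketch of such a caller; demo_stamp is illustrative, and the declaration is assumed to be exported via linux/time.h as in this series:

#include <linux/time.h>

static void demo_stamp(struct timespec *ts)
{
	if (__getnstimeofday(ts) < 0) {	/* -EAGAIN: timekeeping suspended */
		ts->tv_sec = 0;		/* caller-defined fallback */
		ts->tv_nsec = 0;
	}
}
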
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
1scale=0
2
3define gcd(a,b) {
4 auto t;
5 while (b) {
6 t = b;
7 b = a % b;
8 a = t;
9 }
10 return a;
11}
12
13/* Division by reciprocal multiplication. */
14define fmul(b,n,d) {
15 return (2^b*n+d-1)/d;
16}
17
18/* Adjustment factor when a ceiling value is used. Use as:
 19 (imul * n) + ((fmulxx * n + fadjxx) >> xx) */
20define fadj(b,n,d) {
21 auto v;
22 d = d/gcd(n,d);
23 v = 2^b*(d-1)/d;
24 return v;
25}
26
27/* Compute the appropriate mul/adj values as well as a shift count,
28 which brings the mul value into the range 2^b-1 <= x < 2^b. Such
29 a shift value will be correct in the signed integer range and off
30 by at most one in the upper half of the unsigned range. */
31define fmuls(b,n,d) {
32 auto s, m;
33 for (s = 0; 1; s++) {
34 m = fmul(s,n,d);
35 if (m >= 2^(b-1))
36 return s;
37 }
38 return 0;
39}
40
41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n"
45
46 print "#ifndef KERNEL_TIMECONST_H\n"
47 print "#define KERNEL_TIMECONST_H\n\n"
48
49 print "#include <linux/param.h>\n"
50 print "#include <linux/types.h>\n\n"
51
52 print "#if HZ != ", hz, "\n"
53 print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
54 print "#endif\n\n"
55
56 if (hz < 2) {
57 print "#error Totally bogus HZ value!\n"
58 } else {
59 s=fmuls(32,1000,hz)
60 obase=16
61 print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
62 print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
63 obase=10
64 print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
65
66 s=fmuls(32,hz,1000)
67 obase=16
68 print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
69 print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
70 obase=10
71 print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
72
73 obase=10
74 cd=gcd(hz,1000)
75 print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
76 print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
77 print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
78 print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
79 print "\n"
80
81 s=fmuls(32,1000000,hz)
82 obase=16
83 print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
84 print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
85 obase=10
86 print "#define HZ_TO_USEC_SHR32\t", s, "\n"
87
88 s=fmuls(32,hz,1000000)
89 obase=16
90 print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
91 print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
92 obase=10
93 print "#define USEC_TO_HZ_SHR32\t", s, "\n"
94
95 obase=10
96 cd=gcd(hz,1000000)
97 print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
98 print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
99 print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
100 print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
101 print "\n"
102
103 print "#endif /* KERNEL_TIMECONST_H */\n"
104 }
105 halt
106}
107
108timeconst(hz)
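
For readers new to reciprocal multiplication, the constants emitted above are consumed as a multiply-add-shift in place of a division by HZ. The user-space sketch below is illustrative only and is not part of this patch: the MUL/ADJ/SHR values are what timeconst(100) produces from the fmul()/fadj()/fmuls() definitions above (they also match the HZ == 100 entry in the canned table of the removed timeconst.pl further down), and jiffies_to_msecs_sketch() is an invented name.

/*
 * Sketch: consuming a generated kernel/timeconst.h-style header (HZ == 100).
 */
#include <stdint.h>
#include <stdio.h>

#define HZ_TO_MSEC_MUL32  UINT64_C(0xa0000000)	/* fmul(28, 1000, 100) */
#define HZ_TO_MSEC_ADJ32  UINT64_C(0x0)		/* fadj(28, 1000, 100) */
#define HZ_TO_MSEC_SHR32  28			/* shift found by fmuls() */

/* Hypothetical helper: divide by HZ without a hardware divide. */
static uint64_t jiffies_to_msecs_sketch(uint64_t j)
{
	return (HZ_TO_MSEC_MUL32 * j + HZ_TO_MSEC_ADJ32) >> HZ_TO_MSEC_SHR32;
}

int main(void)
{
	/* With HZ == 100 a jiffy is 10 ms, so 250 jiffies -> 2500 ms. */
	printf("%llu\n", (unsigned long long)jiffies_to_msecs_sketch(250));
	return 0;
}

Multiplying by 0xa0000000 and shifting right by 28 is exactly multiplying by 10, which is why 250 jiffies prints 2500.
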
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index eb51d76e058a..000000000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,378 +0,0 @@
1#!/usr/bin/perl
2# -----------------------------------------------------------------------
3#
4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5#
6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your
8# option) any later version; incorporated herein by reference.
9#
10# -----------------------------------------------------------------------
11#
12
13#
14# Usage: timeconst.pl HZ > timeconst.h
15#
16
17# Precomputed values for systems without Math::BigInt
18# Generated by:
19# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
20%canned_values = (
21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26,
23 125,3,
24 '0xc49ba5e4','0x1fbe76c8b4',37,
25 3,125,
26 '0xa2c2aaab','0xaaaa',16,
27 125000,3,
28 '0xc9539b89','0x7fffbce4217d',47,
29 3,125000,
30 ], 32 => [
31 '0xfa000000','0x6000000',27,
32 125,4,
33 '0x83126e98','0xfdf3b645a',36,
34 4,125,
35 '0xf4240000','0x0',17,
36 31250,1,
37 '0x8637bd06','0x3fff79c842fa',46,
38 1,31250,
39 ], 48 => [
40 '0xa6aaaaab','0x6aaaaaa',27,
41 125,6,
42 '0xc49ba5e4','0xfdf3b645a',36,
43 6,125,
44 '0xa2c2aaab','0x15555',17,
45 62500,3,
46 '0xc9539b89','0x3fffbce4217d',46,
47 3,62500,
48 ], 64 => [
49 '0xfa000000','0xe000000',28,
50 125,8,
51 '0x83126e98','0x7ef9db22d',35,
52 8,125,
53 '0xf4240000','0x0',18,
54 15625,1,
55 '0x8637bd06','0x1fff79c842fa',45,
56 1,15625,
57 ], 100 => [
58 '0xa0000000','0x0',28,
59 10,1,
60 '0xcccccccd','0x733333333',35,
61 1,10,
62 '0x9c400000','0x0',18,
63 10000,1,
64 '0xd1b71759','0x1fff2e48e8a7',45,
65 1,10000,
66 ], 122 => [
67 '0x8325c53f','0xfbcda3a',28,
68 500,61,
69 '0xf9db22d1','0x7fbe76c8b',35,
70 61,500,
71 '0x8012e2a0','0x3ef36',18,
72 500000,61,
73 '0xffda4053','0x1ffffbce4217',45,
74 61,500000,
75 ], 128 => [
76 '0xfa000000','0x1e000000',29,
77 125,16,
78 '0x83126e98','0x3f7ced916',34,
79 16,125,
80 '0xf4240000','0x40000',19,
81 15625,2,
82 '0x8637bd06','0xfffbce4217d',44,
83 2,15625,
84 ], 200 => [
85 '0xa0000000','0x0',29,
86 5,1,
87 '0xcccccccd','0x333333333',34,
88 1,5,
89 '0x9c400000','0x0',19,
90 5000,1,
91 '0xd1b71759','0xfff2e48e8a7',44,
92 1,5000,
93 ], 250 => [
94 '0x80000000','0x0',29,
95 4,1,
96 '0x80000000','0x180000000',33,
97 1,4,
98 '0xfa000000','0x0',20,
99 4000,1,
100 '0x83126e98','0x7ff7ced9168',43,
101 1,4000,
102 ], 256 => [
103 '0xfa000000','0x3e000000',30,
104 125,32,
105 '0x83126e98','0x1fbe76c8b',33,
106 32,125,
107 '0xf4240000','0xc0000',20,
108 15625,4,
109 '0x8637bd06','0x7ffde7210be',43,
110 4,15625,
111 ], 300 => [
112 '0xd5555556','0x2aaaaaaa',30,
113 10,3,
114 '0x9999999a','0x1cccccccc',33,
115 3,10,
116 '0xd0555556','0xaaaaa',20,
117 10000,3,
118 '0x9d495183','0x7ffcb923a29',43,
119 3,10000,
120 ], 512 => [
121 '0xfa000000','0x7e000000',31,
122 125,64,
123 '0x83126e98','0xfdf3b645',32,
124 64,125,
125 '0xf4240000','0x1c0000',21,
126 15625,8,
127 '0x8637bd06','0x3ffef39085f',42,
128 8,15625,
129 ], 1000 => [
130 '0x80000000','0x0',31,
131 1,1,
132 '0x80000000','0x0',31,
133 1,1,
134 '0xfa000000','0x0',22,
135 1000,1,
136 '0x83126e98','0x1ff7ced9168',41,
137 1,1000,
138 ], 1024 => [
139 '0xfa000000','0xfe000000',32,
140 125,128,
141 '0x83126e98','0x7ef9db22',31,
142 128,125,
143 '0xf4240000','0x3c0000',22,
144 15625,16,
145 '0x8637bd06','0x1fff79c842f',41,
146 16,15625,
147 ], 1200 => [
148 '0xd5555556','0xd5555555',32,
149 5,6,
150 '0x9999999a','0x66666666',31,
151 6,5,
152 '0xd0555556','0x2aaaaa',22,
153 2500,3,
154 '0x9d495183','0x1ffcb923a29',41,
155 3,2500,
156 ]
157);
158
159$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
160
161sub bint($)
162{
163 my($x) = @_;
164 return Math::BigInt->new($x);
165}
166
167#
168# Constants for division by reciprocal multiplication.
169# (bits, numerator, denominator)
170#
171sub fmul($$$)
172{
173 my ($b,$n,$d) = @_;
174
175 $n = bint($n);
176 $d = bint($d);
177
178 return scalar (($n << $b)+$d-bint(1))/$d;
179}
180
181sub fadj($$$)
182{
183 my($b,$n,$d) = @_;
184
185 $n = bint($n);
186 $d = bint($d);
187
188 $d = $d/bgcd($n, $d);
189 return scalar (($d-bint(1)) << $b)/$d;
190}
191
192sub fmuls($$$) {
193 my($b,$n,$d) = @_;
194 my($s,$m);
195 my($thres) = bint(1) << ($b-1);
196
197 $n = bint($n);
198 $d = bint($d);
199
200 for ($s = 0; 1; $s++) {
201 $m = fmul($s,$n,$d);
202 return $s if ($m >= $thres);
203 }
204 return 0;
205}
206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
216# Provides mul, adj, and shr factors for a specific
217# (bit, time, hz) combination
218sub muladj($$$) {
219 my($b, $t, $hz) = @_;
220 my $s = fmuls($b, $t, $hz);
221 my $m = fmul($s, $t, $hz);
222 my $a = fadj($s, $t, $hz);
223 return (bignum_hex($m), bignum_hex($a), $s);
224}
225
226# Provides numerator, denominator values
227sub numden($$) {
228 my($n, $d) = @_;
229 my $g = bgcd($n, $d);
230 return ($n/$g, $d/$g);
231}
232
233# All values for a specific (time, hz) combo
234sub conversions($$) {
235 my ($t, $hz) = @_;
236 my @val = ();
237
238 # HZ_TO_xx
239 push(@val, muladj(32, $t, $hz));
240 push(@val, numden($t, $hz));
241
242 # xx_TO_HZ
243 push(@val, muladj(32, $hz, $t));
244 push(@val, numden($hz, $t));
245
246 return @val;
247}
248
249sub compute_values($) {
250 my($hz) = @_;
251 my @val = ();
252 my $s, $m, $a, $g;
253
254 if (!$has_bigint) {
255 die "$0: HZ == $hz not canned and ".
256 "Math::BigInt not available\n";
257 }
258
259 # MSEC conversions
260 push(@val, conversions(1000, $hz));
261
262 # USEC conversions
263 push(@val, conversions(1000000, $hz));
264
265 return @val;
266}
267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
281sub output($@)
282{
283 my($hz, @val) = @_;
284 my $pfx, $bit, $suf, $s, $m, $a;
285
286 print "/* Automatically generated by kernel/timeconst.pl */\n";
287 print "/* Conversion constants for HZ == $hz */\n";
288 print "\n";
289 print "#ifndef KERNEL_TIMECONST_H\n";
290 print "#define KERNEL_TIMECONST_H\n";
291 print "\n";
292
293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
295
296 print "\n";
297 print "#if HZ != $hz\n";
298 print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
299 print "#endif\n";
300 print "\n";
301
302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
303 'HZ_TO_USEC','USEC_TO_HZ') {
304 foreach $bit (32) {
305 foreach $suf ('MUL', 'ADJ', 'SHR') {
306 outputval("${pfx}_$suf$bit", shift(@val));
307 }
308 }
309 foreach $suf ('NUM', 'DEN') {
310 outputval("${pfx}_$suf", shift(@val));
311 }
312 }
313
314 print "\n";
315 print "#endif /* KERNEL_TIMECONST_H */\n";
316}
317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
335($hz) = @ARGV;
336
337# Use this to generate the %canned_values structure
338if ($hz eq '--can') {
339 shift(@ARGV);
340 @hzlist = sort {$a <=> $b} (@ARGV);
341
342 print "# Precomputed values for systems without Math::BigInt\n";
343 print "# Generated by:\n";
344 print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
345 print "\%canned_values = (\n";
346 my $pf = "\t";
347 foreach $hz (@hzlist) {
348 my @values = compute_values($hz);
349 print "$pf$hz => [\n";
350 while (scalar(@values)) {
351 my $bit;
352 foreach $bit (32) {
353 my $m = shift(@values);
354 my $a = shift(@values);
355 my $s = shift(@values);
356 print "\t\t", perlvals($m,$a,$s), ",\n";
357 }
358 my $n = shift(@values);
359 my $d = shift(@values);
360 print "\t\t", perlvals($n,$d), ",\n";
361 }
362 print "\t]";
363 $pf = ', ';
364 }
365 print "\n);\n";
366} else {
367 $hz += 0; # Force to number
368 if ($hz < 1) {
369 die "Usage: $0 HZ\n";
370 }
371
372 @val = @{$canned_values{$hz}};
373 if (!defined(@val)) {
374 @val = compute_values($hz);
375 }
376 output($hz, @val);
377}
378exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d00858482..dbf7a78a1ef1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/irq_work.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43 44
44#include <asm/uaccess.h> 45#include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)
1351 account_process_tick(p, user_tick); 1352 account_process_tick(p, user_tick);
1352 run_local_timers(); 1353 run_local_timers();
1353 rcu_check_callbacks(cpu, user_tick); 1354 rcu_check_callbacks(cpu, user_tick);
1354 printk_tick();
1355#ifdef CONFIG_IRQ_WORK 1355#ifdef CONFIG_IRQ_WORK
1356 if (in_irq()) 1356 if (in_irq())
1357 irq_work_run(); 1357 irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485f..192473b22799 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
39 help 39 help
40 See Documentation/trace/ftrace-design.txt 40 See Documentation/trace/ftrace-design.txt
41 41
42config HAVE_DYNAMIC_FTRACE_WITH_REGS
43 bool
44
42config HAVE_FTRACE_MCOUNT_RECORD 45config HAVE_FTRACE_MCOUNT_RECORD
43 bool 46 bool
44 help 47 help
@@ -78,21 +81,6 @@ config EVENT_TRACING
78 select CONTEXT_SWITCH_TRACER 81 select CONTEXT_SWITCH_TRACER
79 bool 82 bool
80 83
81config EVENT_POWER_TRACING_DEPRECATED
82 depends on EVENT_TRACING
83 bool "Deprecated power event trace API, to be removed"
84 default y
85 help
86 Provides old power event types:
87 C-state/idle accounting events:
88 power:power_start
89 power:power_end
90 and old cpufreq accounting event:
91 power:power_frequency
92 This is for userspace compatibility
93 and will vanish after 5 kernel iterations,
94 namely 3.1.
95
96config CONTEXT_SWITCH_TRACER 84config CONTEXT_SWITCH_TRACER
97 bool 85 bool
98 86
@@ -250,6 +238,16 @@ config FTRACE_SYSCALLS
250 help 238 help
251 Basic tracer to catch the syscall entry and exit events. 239 Basic tracer to catch the syscall entry and exit events.
252 240
241config TRACER_SNAPSHOT
242 bool "Create a snapshot trace buffer"
243 select TRACER_MAX_TRACE
244 help
245 Allow tracing users to take a snapshot of the current buffer using the
246 ftrace interface, e.g.:
247
248 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot
250
253config TRACE_BRANCH_PROFILING 251config TRACE_BRANCH_PROFILING
254 bool 252 bool
255 select GENERIC_TRACER 253 select GENERIC_TRACER
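
The help text above gives the shell workflow; an equivalent minimal C program (illustrative only, assuming debugfs is mounted at /sys/kernel/debug and CONFIG_TRACER_SNAPSHOT is enabled) would be:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* "echo 1 > snapshot": allocate the spare buffer if needed and swap */
	fd = open("/sys/kernel/debug/tracing/snapshot", O_WRONLY);
	if (fd < 0) {
		perror("open snapshot");
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)
		perror("write snapshot");
	close(fd);

	/* "cat snapshot": dump the frozen copy of the trace */
	fd = open("/sys/kernel/debug/tracing/snapshot", O_RDONLY);
	if (fd < 0) {
		perror("open snapshot");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}

Writing "1" exercises the allocate-and-swap path added to tracing_snapshot_write() in trace.c later in this patch; writing "0" frees the spare buffer again.
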
@@ -434,6 +432,11 @@ config DYNAMIC_FTRACE
434 were made. If so, it runs stop_machine (stops all CPUS) 432 were made. If so, it runs stop_machine (stops all CPUS)
435 and modifies the code to jump over the call to ftrace. 433 and modifies the code to jump over the call to ftrace.
436 434
435config DYNAMIC_FTRACE_WITH_REGS
436 def_bool y
437 depends on DYNAMIC_FTRACE
438 depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
439
437config FUNCTION_PROFILER 440config FUNCTION_PROFILER
438 bool "Kernel function profiler" 441 bool "Kernel function profiler"
439 depends on FUNCTION_TRACER 442 depends on FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb593f6a687e..9e5b8c272eec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
147 return; 147 return;
148 148
149 local_irq_save(flags); 149 local_irq_save(flags);
150 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 150 buf = this_cpu_ptr(bt->msg_data);
151 va_start(args, fmt); 151 va_start(args, fmt);
152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); 152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
153 va_end(args); 153 va_end(args);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3ffe4c5ad3f3..ab25b88aae56 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif 112#endif
113 113
114/*
115 * Traverse the ftrace_global_list, invoking all entries. The reason that we
116 * can use rcu_dereference_raw() is that elements removed from this list
117 * are simply leaked, so there is no need to interact with a grace-period
118 * mechanism. The rcu_dereference_raw() calls are needed to handle
119 * concurrent insertions into the ftrace_global_list.
120 *
121 * Silly Alpha and silly pointer-speculation compiler optimizations!
122 */
123#define do_for_each_ftrace_op(op, list) \
124 op = rcu_dereference_raw(list); \
125 do
126
127/*
128 * Optimized for just a single item in the list (as that is the normal case).
129 */
130#define while_for_each_ftrace_op(op) \
131 while (likely(op = rcu_dereference_raw((op)->next)) && \
132 unlikely((op) != &ftrace_list_end))
133
114/** 134/**
115 * ftrace_nr_registered_ops - return number of ops registered 135 * ftrace_nr_registered_ops - return number of ops registered
116 * 136 *
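
The comment explains why plain rcu_dereference_raw() is safe here; the macro pair itself is just a do/while walk over a sentinel-terminated singly linked list. A stand-alone model (not kernel code: the RCU accessors are reduced to ordinary loads and all names are invented) looks like:

#include <stdio.h>

struct op {
	struct op *next;
	const char *name;
};

static struct op list_end = { .next = &list_end, .name = "end" };

#define do_for_each_op(op, list)	\
	op = (list);			\
	do

#define while_for_each_op(op)		\
	while ((op = (op)->next) != &list_end)

int main(void)
{
	struct op b = { &list_end, "b" };
	struct op a = { &b, "a" };
	struct op *op;

	do_for_each_op(op, &a) {
		printf("%s\n", op->name);	/* prints "a" then "b" */
	} while_for_each_op(op);

	return 0;
}

The do/while shape lets the common single-op case run the body once and load ->next only once, which is exactly the optimization the second comment calls out.
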
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
132 return cnt; 152 return cnt;
133} 153}
134 154
135/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list.
141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */
144static void 155static void
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 156ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 157 struct ftrace_ops *op, struct pt_regs *regs)
147{ 158{
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 159 int bit;
160
161 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
162 if (bit < 0)
149 return; 163 return;
150 164
151 trace_recursion_set(TRACE_GLOBAL_BIT); 165 do_for_each_ftrace_op(op, ftrace_global_list) {
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 166 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 167 } while_for_each_ftrace_op(op);
156 }; 168
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 169 trace_clear_recursion(bit);
158} 170}
159 171
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 172static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
221 * registered callers. 233 * registered callers.
222 */ 234 */
223 if (ftrace_global_list == &ftrace_list_end || 235 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 236 ftrace_global_list->next == &ftrace_list_end) {
225 func = ftrace_global_list->func; 237 func = ftrace_global_list->func;
226 else 238 /*
239 * As we are calling the function directly.
240 * If it does not have recursion protection,
241 * the function_trace_op needs to be updated
242 * accordingly.
243 */
244 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
245 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
246 else
247 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
248 } else {
227 func = ftrace_global_list_func; 249 func = ftrace_global_list_func;
250 /* The list has its own recursion protection. */
251 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
252 }
253
228 254
229 /* If we filter on pids, update to use the pid function */ 255 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 256 if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 363 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 364 return -EINVAL;
339 365
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 366#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
341 /* 367 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 368 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 369 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -736,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
736{ 762{
737 struct ftrace_profile *rec; 763 struct ftrace_profile *rec;
738 struct hlist_head *hhd; 764 struct hlist_head *hhd;
739 struct hlist_node *n;
740 unsigned long key; 765 unsigned long key;
741 766
742 key = hash_long(ip, ftrace_profile_bits); 767 key = hash_long(ip, ftrace_profile_bits);
@@ -745,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
745 if (hlist_empty(hhd)) 770 if (hlist_empty(hhd))
746 return NULL; 771 return NULL;
747 772
748 hlist_for_each_entry_rcu(rec, n, hhd, node) { 773 hlist_for_each_entry_rcu(rec, hhd, node) {
749 if (rec->ip == ip) 774 if (rec->ip == ip)
750 return rec; 775 return rec;
751 } 776 }
@@ -1107,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1107 unsigned long key; 1132 unsigned long key;
1108 struct ftrace_func_entry *entry; 1133 struct ftrace_func_entry *entry;
1109 struct hlist_head *hhd; 1134 struct hlist_head *hhd;
1110 struct hlist_node *n;
1111 1135
1112 if (ftrace_hash_empty(hash)) 1136 if (ftrace_hash_empty(hash))
1113 return NULL; 1137 return NULL;
@@ -1119,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1119 1143
1120 hhd = &hash->buckets[key]; 1144 hhd = &hash->buckets[key];
1121 1145
1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) { 1146 hlist_for_each_entry_rcu(entry, hhd, hlist) {
1123 if (entry->ip == ip) 1147 if (entry->ip == ip)
1124 return entry; 1148 return entry;
1125 } 1149 }
@@ -1176,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash,
1176static void ftrace_hash_clear(struct ftrace_hash *hash) 1200static void ftrace_hash_clear(struct ftrace_hash *hash)
1177{ 1201{
1178 struct hlist_head *hhd; 1202 struct hlist_head *hhd;
1179 struct hlist_node *tp, *tn; 1203 struct hlist_node *tn;
1180 struct ftrace_func_entry *entry; 1204 struct ftrace_func_entry *entry;
1181 int size = 1 << hash->size_bits; 1205 int size = 1 << hash->size_bits;
1182 int i; 1206 int i;
@@ -1186,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
1186 1210
1187 for (i = 0; i < size; i++) { 1211 for (i = 0; i < size; i++) {
1188 hhd = &hash->buckets[i]; 1212 hhd = &hash->buckets[i];
1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) 1213 hlist_for_each_entry_safe(entry, tn, hhd, hlist)
1190 free_hash_entry(hash, entry); 1214 free_hash_entry(hash, entry);
1191 } 1215 }
1192 FTRACE_WARN_ON(hash->count); 1216 FTRACE_WARN_ON(hash->count);
@@ -1249,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1249{ 1273{
1250 struct ftrace_func_entry *entry; 1274 struct ftrace_func_entry *entry;
1251 struct ftrace_hash *new_hash; 1275 struct ftrace_hash *new_hash;
1252 struct hlist_node *tp;
1253 int size; 1276 int size;
1254 int ret; 1277 int ret;
1255 int i; 1278 int i;
@@ -1264,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1264 1287
1265 size = 1 << hash->size_bits; 1288 size = 1 << hash->size_bits;
1266 for (i = 0; i < size; i++) { 1289 for (i = 0; i < size; i++) {
1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { 1290 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
1268 ret = add_hash_entry(new_hash, entry->ip); 1291 ret = add_hash_entry(new_hash, entry->ip);
1269 if (ret < 0) 1292 if (ret < 0)
1270 goto free_hash; 1293 goto free_hash;
@@ -1290,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1290 struct ftrace_hash **dst, struct ftrace_hash *src) 1313 struct ftrace_hash **dst, struct ftrace_hash *src)
1291{ 1314{
1292 struct ftrace_func_entry *entry; 1315 struct ftrace_func_entry *entry;
1293 struct hlist_node *tp, *tn; 1316 struct hlist_node *tn;
1294 struct hlist_head *hhd; 1317 struct hlist_head *hhd;
1295 struct ftrace_hash *old_hash; 1318 struct ftrace_hash *old_hash;
1296 struct ftrace_hash *new_hash; 1319 struct ftrace_hash *new_hash;
@@ -1336,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1336 size = 1 << src->size_bits; 1359 size = 1 << src->size_bits;
1337 for (i = 0; i < size; i++) { 1360 for (i = 0; i < size; i++) {
1338 hhd = &src->buckets[i]; 1361 hhd = &src->buckets[i];
1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { 1362 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1340 if (bits > 0) 1363 if (bits > 0)
1341 key = hash_long(entry->ip, bits); 1364 key = hash_long(entry->ip, bits);
1342 else 1365 else
@@ -2875,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2875{ 2898{
2876 struct ftrace_func_probe *entry; 2899 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2900 struct hlist_head *hhd;
2878 struct hlist_node *n;
2879 unsigned long key; 2901 unsigned long key;
2880 2902
2881 key = hash_long(ip, FTRACE_HASH_BITS); 2903 key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2891,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2891 * on the hash. rcu_read_lock is too dangerous here. 2913 * on the hash. rcu_read_lock is too dangerous here.
2892 */ 2914 */
2893 preempt_disable_notrace(); 2915 preempt_disable_notrace();
2894 hlist_for_each_entry_rcu(entry, n, hhd, node) { 2916 hlist_for_each_entry_rcu(entry, hhd, node) {
2895 if (entry->ip == ip) 2917 if (entry->ip == ip)
2896 entry->ops->func(ip, parent_ip, &entry->data); 2918 entry->ops->func(ip, parent_ip, &entry->data);
2897 } 2919 }
@@ -3042,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3042 void *data, int flags) 3064 void *data, int flags)
3043{ 3065{
3044 struct ftrace_func_probe *entry; 3066 struct ftrace_func_probe *entry;
3045 struct hlist_node *n, *tmp; 3067 struct hlist_node *tmp;
3046 char str[KSYM_SYMBOL_LEN]; 3068 char str[KSYM_SYMBOL_LEN];
3047 int type = MATCH_FULL; 3069 int type = MATCH_FULL;
3048 int i, len = 0; 3070 int i, len = 0;
@@ -3065,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3087 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3066 struct hlist_head *hhd = &ftrace_func_hash[i]; 3088 struct hlist_head *hhd = &ftrace_func_hash[i];
3067 3089
3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { 3090 hlist_for_each_entry_safe(entry, tmp, hhd, node) {
3069 3091
3070 /* break up if statements for readability */ 3092 /* break up if statements for readability */
3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) 3093 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
@@ -3970,35 +3992,49 @@ static void ftrace_init_module(struct module *mod,
3970 ftrace_process_locs(mod, start, end); 3992 ftrace_process_locs(mod, start, end);
3971} 3993}
3972 3994
3973static int ftrace_module_notify(struct notifier_block *self, 3995static int ftrace_module_notify_enter(struct notifier_block *self,
3974 unsigned long val, void *data) 3996 unsigned long val, void *data)
3975{ 3997{
3976 struct module *mod = data; 3998 struct module *mod = data;
3977 3999
3978 switch (val) { 4000 if (val == MODULE_STATE_COMING)
3979 case MODULE_STATE_COMING:
3980 ftrace_init_module(mod, mod->ftrace_callsites, 4001 ftrace_init_module(mod, mod->ftrace_callsites,
3981 mod->ftrace_callsites + 4002 mod->ftrace_callsites +
3982 mod->num_ftrace_callsites); 4003 mod->num_ftrace_callsites);
3983 break; 4004 return 0;
3984 case MODULE_STATE_GOING: 4005}
4006
4007static int ftrace_module_notify_exit(struct notifier_block *self,
4008 unsigned long val, void *data)
4009{
4010 struct module *mod = data;
4011
4012 if (val == MODULE_STATE_GOING)
3985 ftrace_release_mod(mod); 4013 ftrace_release_mod(mod);
3986 break;
3987 }
3988 4014
3989 return 0; 4015 return 0;
3990} 4016}
3991#else 4017#else
3992static int ftrace_module_notify(struct notifier_block *self, 4018static int ftrace_module_notify_enter(struct notifier_block *self,
3993 unsigned long val, void *data) 4019 unsigned long val, void *data)
4020{
4021 return 0;
4022}
4023static int ftrace_module_notify_exit(struct notifier_block *self,
4024 unsigned long val, void *data)
3994{ 4025{
3995 return 0; 4026 return 0;
3996} 4027}
3997#endif /* CONFIG_MODULES */ 4028#endif /* CONFIG_MODULES */
3998 4029
3999struct notifier_block ftrace_module_nb = { 4030struct notifier_block ftrace_module_enter_nb = {
4000 .notifier_call = ftrace_module_notify, 4031 .notifier_call = ftrace_module_notify_enter,
4001 .priority = 0, 4032 .priority = INT_MAX, /* Run before anything that can use kprobes */
4033};
4034
4035struct notifier_block ftrace_module_exit_nb = {
4036 .notifier_call = ftrace_module_notify_exit,
4037 .priority = INT_MIN, /* Run after anything that can remove kprobes */
4002}; 4038};
4003 4039
4004extern unsigned long __start_mcount_loc[]; 4040extern unsigned long __start_mcount_loc[];
@@ -4032,9 +4068,13 @@ void __init ftrace_init(void)
4032 __start_mcount_loc, 4068 __start_mcount_loc,
4033 __stop_mcount_loc); 4069 __stop_mcount_loc);
4034 4070
4035 ret = register_module_notifier(&ftrace_module_nb); 4071 ret = register_module_notifier(&ftrace_module_enter_nb);
4036 if (ret) 4072 if (ret)
4037 pr_warning("Failed to register trace ftrace module notifier\n"); 4073 pr_warning("Failed to register trace ftrace module enter notifier\n");
4074
4075 ret = register_module_notifier(&ftrace_module_exit_nb);
4076 if (ret)
4077 pr_warning("Failed to register trace ftrace module exit notifier\n");
4038 4078
4039 set_ftrace_early_filters(); 4079 set_ftrace_early_filters();
4040 4080
@@ -4090,14 +4130,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4090 */ 4130 */
4091 preempt_disable_notrace(); 4131 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4132 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4133 do_for_each_ftrace_op(op, ftrace_control_list) {
4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) && 4134 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4135 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4136 op->func(ip, parent_ip, op, regs);
4098 4137 } while_for_each_ftrace_op(op);
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4138 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4139 preempt_enable_notrace();
4103} 4140}
@@ -4112,27 +4149,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4149 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 4150{
4114 struct ftrace_ops *op; 4151 struct ftrace_ops *op;
4152 int bit;
4115 4153
4116 if (function_trace_stop) 4154 if (function_trace_stop)
4117 return; 4155 return;
4118 4156
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4157 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4158 if (bit < 0)
4120 return; 4159 return;
4121 4160
4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4161 /*
4124 * Some of the ops may be dynamically allocated, 4162 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4163 * they must be freed after a synchronize_sched().
4126 */ 4164 */
4127 preempt_disable_notrace(); 4165 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4166 do_for_each_ftrace_op(op, ftrace_ops_list) {
4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4167 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4168 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4169 } while_for_each_ftrace_op(op);
4133 };
4134 preempt_enable_notrace(); 4170 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4171 trace_clear_recursion(bit);
4136} 4172}
4137 4173
4138/* 4174/*
@@ -4143,8 +4179,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4179 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4180 * If they support ftrace_ops, it is assumed they support regs.
4145 * If call backs want to use regs, they must either check for regs 4181 * If call backs want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4182 * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
4147 * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. 4183 * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4184 * An architecture can pass partial regs with ftrace_ops and still
4149 * set the ARCH_SUPPORTS_FTRACE_OPS. 4185
4150 */ 4186 */
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
20 17
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedcd..7244acde77b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
3 * 3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ftrace_event.h>
6#include <linux/ring_buffer.h> 7#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/debugfs.h> 11#include <linux/debugfs.h>
10#include <linux/uaccess.h> 12#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
21#include <linux/fs.h> 23#include <linux/fs.h>
22 24
23#include <asm/local.h> 25#include <asm/local.h>
24#include "trace.h"
25 26
26static void update_pages_handler(struct work_struct *work); 27static void update_pages_handler(struct work_struct *work);
27 28
@@ -2432,41 +2433,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2432 2433
2433#ifdef CONFIG_TRACING 2434#ifdef CONFIG_TRACING
2434 2435
2435#define TRACE_RECURSIVE_DEPTH 16 2436/*
2437 * The lock and unlock are done within a preempt disable section.
2438 * The current_context per_cpu variable can only be modified
2439 * by the current task between lock and unlock. But it can
2440 * be modified more than once via an interrupt. To pass this
2441 * information from the lock to the unlock without having to
2442 * access the 'in_interrupt()' functions again (which do show
2443 * a bit of overhead in something as critical as function tracing),
2444 * we use a bitmask trick.
2445 *
2446 * bit 0 = NMI context
2447 * bit 1 = IRQ context
2448 * bit 2 = SoftIRQ context
2449 * bit 3 = normal context.
2450 *
2451 * This works because this is the order of contexts that can
2452 * preempt other contexts. A SoftIRQ never preempts an IRQ
2453 * context.
2454 *
2455 * When the context is determined, the corresponding bit is
2456 * checked and set (if it was set, then a recursion of that context
2457 * happened).
2458 *
2459 * On unlock, we need to clear this bit. To do so, just subtract
2460 * 1 from the current_context and AND it to itself.
2461 *
2462 * (binary)
2463 * 101 - 1 = 100
2464 * 101 & 100 = 100 (clearing bit zero)
2465 *
2466 * 1010 - 1 = 1001
2467 * 1010 & 1001 = 1000 (clearing bit 1)
2468 *
2469 * The least significant bit can be cleared this way, and it
2470 * just so happens that it is the same bit corresponding to
2471 * the current context.
2472 */
2473static DEFINE_PER_CPU(unsigned int, current_context);
2436 2474
2437/* Keep this code out of the fast path cache */ 2475static __always_inline int trace_recursive_lock(void)
2438static noinline void trace_recursive_fail(void)
2439{ 2476{
2440 /* Disable all tracing before we do anything else */ 2477 unsigned int val = this_cpu_read(current_context);
2441 tracing_off_permanent(); 2478 int bit;
2442
2443 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2444 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2445 trace_recursion_buffer(),
2446 hardirq_count() >> HARDIRQ_SHIFT,
2447 softirq_count() >> SOFTIRQ_SHIFT,
2448 in_nmi());
2449
2450 WARN_ON_ONCE(1);
2451}
2452 2479
2453static inline int trace_recursive_lock(void) 2480 if (in_interrupt()) {
2454{ 2481 if (in_nmi())
2455 trace_recursion_inc(); 2482 bit = 0;
2483 else if (in_irq())
2484 bit = 1;
2485 else
2486 bit = 2;
2487 } else
2488 bit = 3;
2456 2489
2457 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) 2490 if (unlikely(val & (1 << bit)))
2458 return 0; 2491 return 1;
2459 2492
2460 trace_recursive_fail(); 2493 val |= (1 << bit);
2494 this_cpu_write(current_context, val);
2461 2495
2462 return -1; 2496 return 0;
2463} 2497}
2464 2498
2465static inline void trace_recursive_unlock(void) 2499static __always_inline void trace_recursive_unlock(void)
2466{ 2500{
2467 WARN_ON_ONCE(!trace_recursion_buffer()); 2501 unsigned int val = this_cpu_read(current_context);
2468 2502
2469 trace_recursion_dec(); 2503 val--;
2504 val &= this_cpu_read(current_context);
2505 this_cpu_write(current_context, val);
2470} 2506}
2471 2507
2472#else 2508#else
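
The bit trick documented in the comment above is easy to model outside the kernel. The sketch below is illustrative only (per-CPU state collapsed to a single variable, names invented; no atomics are needed because each context only touches its own bit between lock and unlock):

#include <stdio.h>

/* One instance per CPU in the kernel; a single variable is enough here. */
static unsigned int current_context;

/* bit 0 = NMI, bit 1 = IRQ, bit 2 = SoftIRQ, bit 3 = normal context */
static int recursive_lock(int bit)
{
	if (current_context & (1u << bit))
		return 1;			/* recursion in this context */
	current_context |= 1u << bit;
	return 0;
}

static void recursive_unlock(void)
{
	/* (val - 1) & val clears the lowest set bit, i.e. the bit of the
	 * most deeply nested context currently holding the lock. */
	current_context = (current_context - 1) & current_context;
}

int main(void)
{
	recursive_lock(3);			/* task context takes the lock */
	recursive_lock(1);			/* an IRQ nests on top of it */
	printf("%#x\n", current_context);	/* 0xa: bits 3 and 1 set */

	recursive_unlock();			/* IRQ exits: 0xa -> 0x8 */
	recursive_unlock();			/* task exits: 0x8 -> 0 */
	printf("%#x\n", current_context);	/* 0 */
	return 0;
}

The unlock works because the most recently locked context always owns the lowest set bit, and (val - 1) & val clears exactly that bit.
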
@@ -3067,6 +3103,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); 3103EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068 3104
3069/** 3105/**
3106 * ring_buffer_read_events_cpu - get the number of events successfully read
3107 * @buffer: The ring buffer
3108 * @cpu: The per CPU buffer to get the number of events read
3109 */
3110unsigned long
3111ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
3112{
3113 struct ring_buffer_per_cpu *cpu_buffer;
3114
3115 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3116 return 0;
3117
3118 cpu_buffer = buffer->buffers[cpu];
3119 return cpu_buffer->read;
3120}
3121EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
3122
3123/**
3070 * ring_buffer_entries - get the number of entries in a buffer 3124 * ring_buffer_entries - get the number of entries in a buffer
3071 * @buffer: The ring buffer 3125 * @buffer: The ring buffer
3072 * 3126 *
@@ -3425,7 +3479,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3425 /* check for end of page padding */ 3479 /* check for end of page padding */
3426 if ((iter->head >= rb_page_size(iter->head_page)) && 3480 if ((iter->head >= rb_page_size(iter->head_page)) &&
3427 (iter->head_page != cpu_buffer->commit_page)) 3481 (iter->head_page != cpu_buffer->commit_page))
3428 rb_advance_iter(iter); 3482 rb_inc_iter(iter);
3429} 3483}
3430 3484
3431static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) 3485static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5125677efa0..c2e2c2310374 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
39#include <linux/poll.h> 39#include <linux/poll.h>
40#include <linux/nmi.h> 40#include <linux/nmi.h>
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/sched/rt.h>
42 43
43#include "trace.h" 44#include "trace.h"
44#include "trace_output.h" 45#include "trace_output.h"
@@ -249,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249static struct tracer *trace_types __read_mostly; 250static struct tracer *trace_types __read_mostly;
250 251
251/* current_trace points to the tracer that is currently active */ 252/* current_trace points to the tracer that is currently active */
252static struct tracer *current_trace __read_mostly; 253static struct tracer *current_trace __read_mostly = &nop_trace;
253 254
254/* 255/*
255 * trace_types_lock is used to protect the trace_types list. 256 * trace_types_lock is used to protect the trace_types list.
@@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
709 return; 710 return;
710 711
711 WARN_ON_ONCE(!irqs_disabled()); 712 WARN_ON_ONCE(!irqs_disabled());
712 if (!current_trace->use_max_tr) { 713
713 WARN_ON_ONCE(1); 714 if (!current_trace->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace);
714 return; 717 return;
715 } 718 }
719
716 arch_spin_lock(&ftrace_max_lock); 720 arch_spin_lock(&ftrace_max_lock);
717 721
718 tr->buffer = max_tr.buffer; 722 tr->buffer = max_tr.buffer;
@@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
739 return; 743 return;
740 744
741 WARN_ON_ONCE(!irqs_disabled()); 745 WARN_ON_ONCE(!irqs_disabled());
742 if (!current_trace->use_max_tr) { 746 if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
743 WARN_ON_ONCE(1);
744 return; 747 return;
745 }
746 748
747 arch_spin_lock(&ftrace_max_lock); 749 arch_spin_lock(&ftrace_max_lock);
748 750
@@ -862,10 +864,13 @@ int register_tracer(struct tracer *type)
862 864
863 current_trace = type; 865 current_trace = type;
864 866
865 /* If we expanded the buffers, make sure the max is expanded too */ 867 if (type->use_max_tr) {
866 if (ring_buffer_expanded && type->use_max_tr) 868 /* If we expanded the buffers, make sure the max is expanded too */
867 ring_buffer_resize(max_tr.buffer, trace_buf_size, 869 if (ring_buffer_expanded)
868 RING_BUFFER_ALL_CPUS); 870 ring_buffer_resize(max_tr.buffer, trace_buf_size,
871 RING_BUFFER_ALL_CPUS);
872 type->allocated_snapshot = true;
873 }
869 874
870 /* the test is responsible for initializing and enabling */ 875 /* the test is responsible for initializing and enabling */
871 pr_info("Testing tracer %s: ", type->name); 876 pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +886,14 @@ int register_tracer(struct tracer *type)
881 /* Only reset on passing, to avoid touching corrupted buffers */ 886 /* Only reset on passing, to avoid touching corrupted buffers */
882 tracing_reset_online_cpus(tr); 887 tracing_reset_online_cpus(tr);
883 888
884 /* Shrink the max buffer again */ 889 if (type->use_max_tr) {
885 if (ring_buffer_expanded && type->use_max_tr) 890 type->allocated_snapshot = false;
886 ring_buffer_resize(max_tr.buffer, 1, 891
887 RING_BUFFER_ALL_CPUS); 892 /* Shrink the max buffer again */
893 if (ring_buffer_expanded)
894 ring_buffer_resize(max_tr.buffer, 1,
895 RING_BUFFER_ALL_CPUS);
896 }
888 897
889 printk(KERN_CONT "PASSED\n"); 898 printk(KERN_CONT "PASSED\n");
890 } 899 }
@@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
922{ 931{
923 struct ring_buffer *buffer = tr->buffer; 932 struct ring_buffer *buffer = tr->buffer;
924 933
934 if (!buffer)
935 return;
936
925 ring_buffer_record_disable(buffer); 937 ring_buffer_record_disable(buffer);
926 938
927 /* Make sure all commits have finished */ 939 /* Make sure all commits have finished */
@@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
936 struct ring_buffer *buffer = tr->buffer; 948 struct ring_buffer *buffer = tr->buffer;
937 int cpu; 949 int cpu;
938 950
951 if (!buffer)
952 return;
953
939 ring_buffer_record_disable(buffer); 954 ring_buffer_record_disable(buffer);
940 955
941 /* Make sure all commits have finished */ 956 /* Make sure all commits have finished */
@@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1167 1182
1168 entry->preempt_count = pc & 0xff; 1183 entry->preempt_count = pc & 0xff;
1169 entry->pid = (tsk) ? tsk->pid : 0; 1184 entry->pid = (tsk) ? tsk->pid : 0;
1170 entry->padding = 0;
1171 entry->flags = 1185 entry->flags =
1172#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1186#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1173 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1187 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1335 */ 1349 */
1336 preempt_disable_notrace(); 1350 preempt_disable_notrace();
1337 1351
1338 use_stack = ++__get_cpu_var(ftrace_stack_reserve); 1352 use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
1339 /* 1353 /*
1340 * We don't need any atomic variables, just a barrier. 1354 * We don't need any atomic variables, just a barrier.
1341 * If an interrupt comes in, we don't care, because it would 1355 * If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1389 out: 1403 out:
1390 /* Again, don't let gcc optimize things here */ 1404 /* Again, don't let gcc optimize things here */
1391 barrier(); 1405 barrier();
1392 __get_cpu_var(ftrace_stack_reserve)--; 1406 __this_cpu_dec(ftrace_stack_reserve);
1393 preempt_enable_notrace(); 1407 preempt_enable_notrace();
1394 1408
1395} 1409}
@@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1517static char *get_trace_buf(void) 1531static char *get_trace_buf(void)
1518{ 1532{
1519 struct trace_buffer_struct *percpu_buffer; 1533 struct trace_buffer_struct *percpu_buffer;
1520 struct trace_buffer_struct *buffer;
1521 1534
1522 /* 1535 /*
1523 * If we have allocated per cpu buffers, then we do not 1536 * If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1548,7 @@ static char *get_trace_buf(void)
1535 if (!percpu_buffer) 1548 if (!percpu_buffer)
1536 return NULL; 1549 return NULL;
1537 1550
1538 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); 1551 return this_cpu_ptr(&percpu_buffer->buffer[0]);
1539
1540 return buffer->buffer;
1541} 1552}
1542 1553
1543static int alloc_percpu_trace_buffer(void) 1554static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1942static void *s_start(struct seq_file *m, loff_t *pos) 1953static void *s_start(struct seq_file *m, loff_t *pos)
1943{ 1954{
1944 struct trace_iterator *iter = m->private; 1955 struct trace_iterator *iter = m->private;
1945 static struct tracer *old_tracer;
1946 int cpu_file = iter->cpu_file; 1956 int cpu_file = iter->cpu_file;
1947 void *p = NULL; 1957 void *p = NULL;
1948 loff_t l = 0; 1958 loff_t l = 0;
1949 int cpu; 1959 int cpu;
1950 1960
1951 /* copy the tracer to avoid using a global lock all around */ 1961 /*
1962 * copy the tracer to avoid using a global lock all around.
1963 * iter->trace is a copy of current_trace; the pointer to the
1964 * name may be used instead of a strcmp(), as iter->trace->name
1965 * will point to the same string as current_trace->name.
1966 */
1952 mutex_lock(&trace_types_lock); 1967 mutex_lock(&trace_types_lock);
1953 if (unlikely(old_tracer != current_trace && current_trace)) { 1968 if (unlikely(current_trace && iter->trace->name != current_trace->name))
1954 old_tracer = current_trace;
1955 *iter->trace = *current_trace; 1969 *iter->trace = *current_trace;
1956 }
1957 mutex_unlock(&trace_types_lock); 1970 mutex_unlock(&trace_types_lock);
1958 1971
1959 atomic_inc(&trace_record_cmdline_disabled); 1972 if (iter->snapshot && iter->trace->use_max_tr)
1973 return ERR_PTR(-EBUSY);
1974
1975 if (!iter->snapshot)
1976 atomic_inc(&trace_record_cmdline_disabled);
1960 1977
1961 if (*pos != iter->pos) { 1978 if (*pos != iter->pos) {
1962 iter->ent = NULL; 1979 iter->ent = NULL;
@@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p)
1995{ 2012{
1996 struct trace_iterator *iter = m->private; 2013 struct trace_iterator *iter = m->private;
1997 2014
1998 atomic_dec(&trace_record_cmdline_disabled); 2015 if (iter->snapshot && iter->trace->use_max_tr)
2016 return;
2017
2018 if (!iter->snapshot)
2019 atomic_dec(&trace_record_cmdline_disabled);
1999 trace_access_unlock(iter->cpu_file); 2020 trace_access_unlock(iter->cpu_file);
2000 trace_event_read_unlock(); 2021 trace_event_read_unlock();
2001} 2022}
@@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2080 unsigned long total; 2101 unsigned long total;
2081 const char *name = "preemption"; 2102 const char *name = "preemption";
2082 2103
2083 if (type) 2104 name = type->name;
2084 name = type->name;
2085 2105
2086 get_total_entries(tr, &total, &entries); 2106 get_total_entries(tr, &total, &entries);
2087 2107
@@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = {
2430}; 2450};
2431 2451
2432static struct trace_iterator * 2452static struct trace_iterator *
2433__tracing_open(struct inode *inode, struct file *file) 2453__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2434{ 2454{
2435 long cpu_file = (long) inode->i_private; 2455 long cpu_file = (long) inode->i_private;
2436 struct trace_iterator *iter; 2456 struct trace_iterator *iter;
@@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file)
2457 if (!iter->trace) 2477 if (!iter->trace)
2458 goto fail; 2478 goto fail;
2459 2479
2460 if (current_trace) 2480 *iter->trace = *current_trace;
2461 *iter->trace = *current_trace;
2462 2481
2463 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2482 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2464 goto fail; 2483 goto fail;
2465 2484
2466 if (current_trace && current_trace->print_max) 2485 if (current_trace->print_max || snapshot)
2467 iter->tr = &max_tr; 2486 iter->tr = &max_tr;
2468 else 2487 else
2469 iter->tr = &global_trace; 2488 iter->tr = &global_trace;
2489 iter->snapshot = snapshot;
2470 iter->pos = -1; 2490 iter->pos = -1;
2471 mutex_init(&iter->mutex); 2491 mutex_init(&iter->mutex);
2472 iter->cpu_file = cpu_file; 2492 iter->cpu_file = cpu_file;
@@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file)
2483 if (trace_clocks[trace_clock_id].in_ns) 2503 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 2504 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485 2505
2486 /* stop the trace while dumping */ 2506 /* stop the trace while dumping if we are not opening "snapshot" */
2487 tracing_stop(); 2507 if (!iter->snapshot)
2508 tracing_stop();
2488 2509
2489 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2510 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2490 for_each_tracing_cpu(cpu) { 2511 for_each_tracing_cpu(cpu) {
@@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2547 if (iter->trace && iter->trace->close) 2568 if (iter->trace && iter->trace->close)
2548 iter->trace->close(iter); 2569 iter->trace->close(iter);
2549 2570
2550 /* reenable tracing if it was previously enabled */ 2571 if (!iter->snapshot)
2551 tracing_start(); 2572 /* reenable tracing if it was previously enabled */
2573 tracing_start();
2552 mutex_unlock(&trace_types_lock); 2574 mutex_unlock(&trace_types_lock);
2553 2575
2554 mutex_destroy(&iter->mutex); 2576 mutex_destroy(&iter->mutex);
@@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2576 } 2598 }
2577 2599
2578 if (file->f_mode & FMODE_READ) { 2600 if (file->f_mode & FMODE_READ) {
2579 iter = __tracing_open(inode, file); 2601 iter = __tracing_open(inode, file, false);
2580 if (IS_ERR(iter)) 2602 if (IS_ERR(iter))
2581 ret = PTR_ERR(iter); 2603 ret = PTR_ERR(iter);
2582 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 2604 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -2899,6 +2921,8 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2899 if (copy_from_user(&buf, ubuf, cnt)) 2921 if (copy_from_user(&buf, ubuf, cnt))
2900 return -EFAULT; 2922 return -EFAULT;
2901 2923
2924 buf[cnt] = 0;
2925
2902 trace_set_options(buf); 2926 trace_set_options(buf);
2903 2927
2904 *ppos += cnt; 2928 *ppos += cnt;
@@ -3012,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3012 int r; 3036 int r;
3013 3037
3014 mutex_lock(&trace_types_lock); 3038 mutex_lock(&trace_types_lock);
3015 if (current_trace) 3039 r = sprintf(buf, "%s\n", current_trace->name);
3016 r = sprintf(buf, "%s\n", current_trace->name);
3017 else
3018 r = sprintf(buf, "\n");
3019 mutex_unlock(&trace_types_lock); 3040 mutex_unlock(&trace_types_lock);
3020 3041
3021 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3042 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3181,6 +3202,7 @@ static int tracing_set_tracer(const char *buf)
3181 static struct trace_option_dentry *topts; 3202 static struct trace_option_dentry *topts;
3182 struct trace_array *tr = &global_trace; 3203 struct trace_array *tr = &global_trace;
3183 struct tracer *t; 3204 struct tracer *t;
3205 bool had_max_tr;
3184 int ret = 0; 3206 int ret = 0;
3185 3207
3186 mutex_lock(&trace_types_lock); 3208 mutex_lock(&trace_types_lock);
@@ -3205,9 +3227,21 @@ static int tracing_set_tracer(const char *buf)
3205 goto out; 3227 goto out;
3206 3228
3207 trace_branch_disable(); 3229 trace_branch_disable();
3208 if (current_trace && current_trace->reset) 3230 if (current_trace->reset)
3209 current_trace->reset(tr); 3231 current_trace->reset(tr);
3210 if (current_trace && current_trace->use_max_tr) { 3232
3233 had_max_tr = current_trace->allocated_snapshot;
3234 current_trace = &nop_trace;
3235
3236 if (had_max_tr && !t->use_max_tr) {
3237 /*
3238 * We need to make sure that the update_max_tr sees that
3239 * current_trace changed to nop_trace to keep it from
3240 * swapping the buffers after we resize it.
3241 * The update_max_tr is called with interrupts disabled,
3242 * so a synchronize_sched() is sufficient.
3243 */
3244 synchronize_sched();
3211 /* 3245 /*
3212 * We don't free the ring buffer. Instead, resize it because 3246
3213 * the max_tr ring buffer has some state (e.g. ring->clock) and 3247
@@ -3215,18 +3249,19 @@ static int tracing_set_tracer(const char *buf)
3215 */ 3249 */
3216 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3250 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3217 set_buffer_entries(&max_tr, 1); 3251 set_buffer_entries(&max_tr, 1);
3252 tracing_reset_online_cpus(&max_tr);
3253 current_trace->allocated_snapshot = false;
3218 } 3254 }
3219 destroy_trace_option_files(topts); 3255 destroy_trace_option_files(topts);
3220 3256
3221 current_trace = &nop_trace;
3222
3223 topts = create_trace_option_files(t); 3257 topts = create_trace_option_files(t);
3224 if (t->use_max_tr) { 3258 if (t->use_max_tr && !had_max_tr) {
3225 /* we need to make per cpu buffer sizes equivalent */ 3259 /* we need to make per cpu buffer sizes equivalent */
3226 ret = resize_buffer_duplicate_size(&max_tr, &global_trace, 3260 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3227 RING_BUFFER_ALL_CPUS); 3261 RING_BUFFER_ALL_CPUS);
3228 if (ret < 0) 3262 if (ret < 0)
3229 goto out; 3263 goto out;
3264 t->allocated_snapshot = true;
3230 } 3265 }
3231 3266
3232 if (t->init) { 3267 if (t->init) {
@@ -3334,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3334 ret = -ENOMEM; 3369 ret = -ENOMEM;
3335 goto fail; 3370 goto fail;
3336 } 3371 }
3337 if (current_trace) 3372 *iter->trace = *current_trace;
3338 *iter->trace = *current_trace;
3339 3373
3340 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3374 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3341 ret = -ENOMEM; 3375 ret = -ENOMEM;
@@ -3452,7 +3486,7 @@ static int tracing_wait_pipe(struct file *filp)
3452 return -EINTR; 3486 return -EINTR;
3453 3487
3454 /* 3488 /*
3455 * We block until we read something and tracing is enabled. 3489 * We block until we read something and tracing is disabled.
3456 * We still block if tracing is disabled, but we have never 3490 * We still block if tracing is disabled, but we have never
3457 * read anything. This allows a user to cat this file, and 3491 * read anything. This allows a user to cat this file, and
3458 * then enable tracing. But after we have read something, 3492 * then enable tracing. But after we have read something,
@@ -3460,7 +3494,7 @@ static int tracing_wait_pipe(struct file *filp)
3460 * 3494 *
3461 * iter->pos will be 0 if we haven't read anything. 3495 * iter->pos will be 0 if we haven't read anything.
3462 */ 3496 */
3463 if (tracing_is_enabled() && iter->pos) 3497 if (!tracing_is_enabled() && iter->pos)
3464 break; 3498 break;
3465 } 3499 }
3466 3500
@@ -3475,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3475 size_t cnt, loff_t *ppos) 3509 size_t cnt, loff_t *ppos)
3476{ 3510{
3477 struct trace_iterator *iter = filp->private_data; 3511 struct trace_iterator *iter = filp->private_data;
3478 static struct tracer *old_tracer;
3479 ssize_t sret; 3512 ssize_t sret;
3480 3513
3481 /* return any leftover data */ 3514 /* return any leftover data */
@@ -3487,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3487 3520
3488 /* copy the tracer to avoid using a global lock all around */ 3521 /* copy the tracer to avoid using a global lock all around */
3489 mutex_lock(&trace_types_lock); 3522 mutex_lock(&trace_types_lock);
3490 if (unlikely(old_tracer != current_trace && current_trace)) { 3523 if (unlikely(iter->trace->name != current_trace->name))
3491 old_tracer = current_trace;
3492 *iter->trace = *current_trace; 3524 *iter->trace = *current_trace;
3493 }
3494 mutex_unlock(&trace_types_lock); 3525 mutex_unlock(&trace_types_lock);
3495 3526
3496 /* 3527 /*
@@ -3646,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3646 .ops = &tracing_pipe_buf_ops, 3677 .ops = &tracing_pipe_buf_ops,
3647 .spd_release = tracing_spd_release_pipe, 3678 .spd_release = tracing_spd_release_pipe,
3648 }; 3679 };
3649 static struct tracer *old_tracer;
3650 ssize_t ret; 3680 ssize_t ret;
3651 size_t rem; 3681 size_t rem;
3652 unsigned int i; 3682 unsigned int i;
@@ -3656,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3656 3686
3657 /* copy the tracer to avoid using a global lock all around */ 3687 /* copy the tracer to avoid using a global lock all around */
3658 mutex_lock(&trace_types_lock); 3688 mutex_lock(&trace_types_lock);
3659 if (unlikely(old_tracer != current_trace && current_trace)) { 3689 if (unlikely(iter->trace->name != current_trace->name))
3660 old_tracer = current_trace;
3661 *iter->trace = *current_trace; 3690 *iter->trace = *current_trace;
3662 }
3663 mutex_unlock(&trace_types_lock); 3691 mutex_unlock(&trace_types_lock);
3664 3692
3665 mutex_lock(&iter->mutex); 3693 mutex_lock(&iter->mutex);
@@ -4035,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4035 * Reset the buffer so that it doesn't have incomparable timestamps. 4063 * Reset the buffer so that it doesn't have incomparable timestamps.
4036 */ 4064 */
4037 tracing_reset_online_cpus(&global_trace); 4065 tracing_reset_online_cpus(&global_trace);
4038 if (max_tr.buffer) 4066 tracing_reset_online_cpus(&max_tr);
4039 tracing_reset_online_cpus(&max_tr);
4040 4067
4041 mutex_unlock(&trace_types_lock); 4068 mutex_unlock(&trace_types_lock);
4042 4069
@@ -4052,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4052 return single_open(file, tracing_clock_show, NULL); 4079 return single_open(file, tracing_clock_show, NULL);
4053} 4080}
4054 4081
4082#ifdef CONFIG_TRACER_SNAPSHOT
4083static int tracing_snapshot_open(struct inode *inode, struct file *file)
4084{
4085 struct trace_iterator *iter;
4086 int ret = 0;
4087
4088 if (file->f_mode & FMODE_READ) {
4089 iter = __tracing_open(inode, file, true);
4090 if (IS_ERR(iter))
4091 ret = PTR_ERR(iter);
4092 }
4093 return ret;
4094}
4095
4096static ssize_t
4097tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4098 loff_t *ppos)
4099{
4100 unsigned long val;
4101 int ret;
4102
4103 ret = tracing_update_buffers();
4104 if (ret < 0)
4105 return ret;
4106
4107 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4108 if (ret)
4109 return ret;
4110
4111 mutex_lock(&trace_types_lock);
4112
4113 if (current_trace->use_max_tr) {
4114 ret = -EBUSY;
4115 goto out;
4116 }
4117
4118 switch (val) {
4119 case 0:
4120 if (current_trace->allocated_snapshot) {
4121 /* free spare buffer */
4122 ring_buffer_resize(max_tr.buffer, 1,
4123 RING_BUFFER_ALL_CPUS);
4124 set_buffer_entries(&max_tr, 1);
4125 tracing_reset_online_cpus(&max_tr);
4126 current_trace->allocated_snapshot = false;
4127 }
4128 break;
4129 case 1:
4130 if (!current_trace->allocated_snapshot) {
4131 /* allocate spare buffer */
4132 ret = resize_buffer_duplicate_size(&max_tr,
4133 &global_trace, RING_BUFFER_ALL_CPUS);
4134 if (ret < 0)
4135 break;
4136 current_trace->allocated_snapshot = true;
4137 }
4138
4139 local_irq_disable();
4140 /* Now, we're going to swap */
4141 update_max_tr(&global_trace, current, smp_processor_id());
4142 local_irq_enable();
4143 break;
4144 default:
4145 if (current_trace->allocated_snapshot)
4146 tracing_reset_online_cpus(&max_tr);
4147 else
4148 ret = -EINVAL;
4149 break;
4150 }
4151
4152 if (ret >= 0) {
4153 *ppos += cnt;
4154 ret = cnt;
4155 }
4156out:
4157 mutex_unlock(&trace_types_lock);
4158 return ret;
4159}
4160#endif /* CONFIG_TRACER_SNAPSHOT */
4161
4162
4055static const struct file_operations tracing_max_lat_fops = { 4163static const struct file_operations tracing_max_lat_fops = {
4056 .open = tracing_open_generic, 4164 .open = tracing_open_generic,
4057 .read = tracing_max_lat_read, 4165 .read = tracing_max_lat_read,
@@ -4108,6 +4216,16 @@ static const struct file_operations trace_clock_fops = {
4108 .write = tracing_clock_write, 4216 .write = tracing_clock_write,
4109}; 4217};
4110 4218
4219#ifdef CONFIG_TRACER_SNAPSHOT
4220static const struct file_operations snapshot_fops = {
4221 .open = tracing_snapshot_open,
4222 .read = seq_read,
4223 .write = tracing_snapshot_write,
4224 .llseek = tracing_seek,
4225 .release = tracing_release,
4226};
4227#endif /* CONFIG_TRACER_SNAPSHOT */
4228
4111struct ftrace_buffer_info { 4229struct ftrace_buffer_info {
4112 struct trace_array *tr; 4230 struct trace_array *tr;
4113 void *spare; 4231 void *spare;
@@ -4412,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4412 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 4530 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4413 trace_seq_printf(s, "dropped events: %ld\n", cnt); 4531 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4414 4532
4533 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
4534 trace_seq_printf(s, "read events: %ld\n", cnt);
4535
4415 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4536 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4416 4537
4417 kfree(s); 4538 kfree(s);
@@ -4488,7 +4609,7 @@ struct dentry *tracing_init_dentry(void)
4488 4609
4489static struct dentry *d_percpu; 4610static struct dentry *d_percpu;
4490 4611
4491struct dentry *tracing_dentry_percpu(void) 4612static struct dentry *tracing_dentry_percpu(void)
4492{ 4613{
4493 static int once; 4614 static int once;
4494 struct dentry *d_tracer; 4615 struct dentry *d_tracer;
@@ -4815,10 +4936,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4815 return ret; 4936 return ret;
4816 4937
4817 if (buffer) { 4938 if (buffer) {
4818 if (val) 4939 mutex_lock(&trace_types_lock);
4940 if (val) {
4819 ring_buffer_record_on(buffer); 4941 ring_buffer_record_on(buffer);
4820 else 4942 if (current_trace->start)
4943 current_trace->start(tr);
4944 } else {
4821 ring_buffer_record_off(buffer); 4945 ring_buffer_record_off(buffer);
4946 if (current_trace->stop)
4947 current_trace->stop(tr);
4948 }
4949 mutex_unlock(&trace_types_lock);
4822 } 4950 }
4823 4951
4824 (*ppos)++; 4952 (*ppos)++;
@@ -4897,6 +5025,11 @@ static __init int tracer_init_debugfs(void)
4897 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5025 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4898#endif 5026#endif
4899 5027
5028#ifdef CONFIG_TRACER_SNAPSHOT
5029 trace_create_file("snapshot", 0644, d_tracer,
5030 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5031#endif
5032
4900 create_trace_options_dir(); 5033 create_trace_options_dir();
4901 5034
4902 for_each_tracing_cpu(cpu) 5035 for_each_tracing_cpu(cpu)
@@ -5005,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5005 if (disable_tracing) 5138 if (disable_tracing)
5006 ftrace_kill(); 5139 ftrace_kill();
5007 5140
5141 /* Simulate the iterator */
5008 trace_init_global_iter(&iter); 5142 trace_init_global_iter(&iter);
5009 5143
5010 for_each_tracing_cpu(cpu) { 5144 for_each_tracing_cpu(cpu) {
@@ -5016,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5016 /* don't look at user memory in panic mode */ 5150 /* don't look at user memory in panic mode */
5017 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 5151 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
5018 5152
5019 /* Simulate the iterator */
5020 iter.tr = &global_trace;
5021 iter.trace = current_trace;
5022
5023 switch (oops_dump_mode) { 5153 switch (oops_dump_mode) {
5024 case DUMP_ALL: 5154 case DUMP_ALL:
5025 iter.cpu_file = TRACE_PIPE_ALL_CPU; 5155 iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5164,7 +5294,7 @@ __init static int tracer_alloc_buffers(void)
5164 init_irq_work(&trace_work_wakeup, trace_wake_up); 5294 init_irq_work(&trace_work_wakeup, trace_wake_up);
5165 5295
5166 register_tracer(&nop_trace); 5296 register_tracer(&nop_trace);
5167 current_trace = &nop_trace; 5297
5168 /* All seems OK, enable tracing */ 5298 /* All seems OK, enable tracing */
5169 tracing_disabled = 0; 5299 tracing_disabled = 0;
5170 5300
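A minimal userspace sketch of exercising the new snapshot control file added above, assuming debugfs is mounted at /sys/kernel/debug and CONFIG_TRACER_SNAPSHOT is enabled. Per tracing_snapshot_write(), writing 1 allocates the spare buffer (if needed) and swaps it in, writing 0 frees it again, and tracers that set use_max_tr refuse with -EBUSY; everything beyond the path and the 0/1 semantics is illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Path assumes debugfs is mounted at /sys/kernel/debug. */
        const char *path = "/sys/kernel/debug/tracing/snapshot";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open snapshot");
                return 1;
        }
        /* "1": allocate the spare buffer if needed, then swap it in. */
        if (write(fd, "1\n", 2) != 2)
                perror("write 1");
        /* "0": free the spare buffer again. */
        if (write(fd, "0\n", 2) != 2)
                perror("write 0");
        close(fd);
        return 0;
}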
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902c..57d7e5397d56 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@ struct tracer {
287 struct tracer_flags *flags; 287 struct tracer_flags *flags;
288 bool print_max; 288 bool print_max;
289 bool use_max_tr; 289 bool use_max_tr;
290 bool allocated_snapshot;
290}; 291};
291 292
292 293
293/* Only current can touch trace_recursion */ 294/* Only current can touch trace_recursion */
294#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
295#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
296 295
297/* Ring buffer has the 10 LSB bits to count */ 296/*
298#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) 297 * For function tracing recursion:
 299 298 * The order of these bits is important.
300/* for function tracing recursion */ 299 *
301#define TRACE_INTERNAL_BIT (1<<11) 300 * When function tracing occurs, the following steps are made:
302#define TRACE_GLOBAL_BIT (1<<12) 301 * If arch does not support a ftrace feature:
303#define TRACE_CONTROL_BIT (1<<13) 302 * call internal function (uses INTERNAL bits) which calls...
303 * If callback is registered to the "global" list, the list
304 * function is called and recursion checks the GLOBAL bits.
305 * then this function calls...
306 * The function callback, which can use the FTRACE bits to
307 * check for recursion.
308 *
 309 * Now if the arch does not support a feature, and it calls
310 * the global list function which calls the ftrace callback
311 * all three of these steps will do a recursion protection.
312 * There's no reason to do one if the previous caller already
313 * did. The recursion that we are protecting against will
314 * go through the same steps again.
315 *
316 * To prevent the multiple recursion checks, if a recursion
317 * bit is set that is higher than the MAX bit of the current
318 * check, then we know that the check was made by the previous
319 * caller, and we can skip the current check.
320 */
321enum {
322 TRACE_BUFFER_BIT,
323 TRACE_BUFFER_NMI_BIT,
324 TRACE_BUFFER_IRQ_BIT,
325 TRACE_BUFFER_SIRQ_BIT,
326
327 /* Start of function recursion bits */
328 TRACE_FTRACE_BIT,
329 TRACE_FTRACE_NMI_BIT,
330 TRACE_FTRACE_IRQ_BIT,
331 TRACE_FTRACE_SIRQ_BIT,
332
333 /* GLOBAL_BITs must be greater than FTRACE_BITs */
334 TRACE_GLOBAL_BIT,
335 TRACE_GLOBAL_NMI_BIT,
336 TRACE_GLOBAL_IRQ_BIT,
337 TRACE_GLOBAL_SIRQ_BIT,
338
339 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
340 TRACE_INTERNAL_BIT,
341 TRACE_INTERNAL_NMI_BIT,
342 TRACE_INTERNAL_IRQ_BIT,
343 TRACE_INTERNAL_SIRQ_BIT,
344
345 TRACE_CONTROL_BIT,
304 346
305/* 347/*
306 * Abuse of the trace_recursion. 348 * Abuse of the trace_recursion.
@@ -309,11 +351,77 @@ struct tracer {
309 * was called in irq context but we have irq tracing off. Since this 351 * was called in irq context but we have irq tracing off. Since this
310 * can only be modified by current, we can reuse trace_recursion. 352 * can only be modified by current, we can reuse trace_recursion.
311 */ 353 */
312#define TRACE_IRQ_BIT (1<<13) 354 TRACE_IRQ_BIT,
355};
356
357#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
358#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
359#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
360
361#define TRACE_CONTEXT_BITS 4
362
363#define TRACE_FTRACE_START TRACE_FTRACE_BIT
364#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
365
366#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
367#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
368
369#define TRACE_LIST_START TRACE_INTERNAL_BIT
370#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
371
372#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
373
374static __always_inline int trace_get_context_bit(void)
375{
376 int bit;
313 377
314#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) 378 if (in_interrupt()) {
315#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) 379 if (in_nmi())
316#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) 380 bit = 0;
381
382 else if (in_irq())
383 bit = 1;
384 else
385 bit = 2;
386 } else
387 bit = 3;
388
389 return bit;
390}
391
392static __always_inline int trace_test_and_set_recursion(int start, int max)
393{
394 unsigned int val = current->trace_recursion;
395 int bit;
396
397 /* A previous recursion check was made */
398 if ((val & TRACE_CONTEXT_MASK) > max)
399 return 0;
400
401 bit = trace_get_context_bit() + start;
402 if (unlikely(val & (1 << bit)))
403 return -1;
404
405 val |= 1 << bit;
406 current->trace_recursion = val;
407 barrier();
408
409 return bit;
410}
411
412static __always_inline void trace_clear_recursion(int bit)
413{
414 unsigned int val = current->trace_recursion;
415
416 if (!bit)
417 return;
418
419 bit = 1 << bit;
420 val &= ~bit;
421
422 barrier();
423 current->trace_recursion = val;
424}
317 425
318#define TRACE_PIPE_ALL_CPU -1 426#define TRACE_PIPE_ALL_CPU -1
319 427
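To make the per-context recursion bits concrete, below is a small standalone mock of the set/test/clear pattern from trace_test_and_set_recursion() and trace_clear_recursion(). It is an illustration only: current->trace_recursion becomes a plain variable, the in_nmi()/in_irq()/in_softirq() probing is replaced by a hard-coded "normal context", and the higher-bit short-circuit through TRACE_CONTEXT_MASK is omitted.

#include <assert.h>
#include <stdio.h>

static unsigned int recursion;          /* stands in for current->trace_recursion */

/* stands in for trace_get_context_bit(): 0=NMI, 1=IRQ, 2=softirq, 3=normal */
static int context_bit(void)
{
        return 3;                       /* pretend we are in normal (task) context */
}

static int test_and_set_recursion(int start)
{
        int bit = context_bit() + start;

        if (recursion & (1 << bit))
                return -1;              /* already tracing in this context: bail out */
        recursion |= 1 << bit;
        return bit;
}

static void clear_recursion(int bit)
{
        recursion &= ~(1 << bit);
}

int main(void)
{
        int outer  = test_and_set_recursion(4); /* 4: first bit of an illustrative group */
        int nested = test_and_set_recursion(4); /* a recursive call in the same context */

        assert(outer == 7 && nested == -1);     /* the nested call is refused */
        clear_recursion(outer);
        printf("outer=%d nested=%d\n", outer, nested);
        return 0;
}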
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
21#include <linux/ktime.h> 21#include <linux/ktime.h>
22#include <linux/trace_clock.h> 22#include <linux/trace_clock.h>
23 23
24#include "trace.h"
25
26/* 24/*
27 * trace_clock_local(): the simplest and least coherent tracing clock. 25 * trace_clock_local(): the simplest and least coherent tracing clock.
28 * 26 *
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
44 42
45 return clock; 43 return clock;
46} 44}
45EXPORT_SYMBOL_GPL(trace_clock_local);
47 46
48/* 47/*
49 * trace_clock(): 'between' trace clock. Not completely serialized, 48 * trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
86 local_irq_save(flags); 85 local_irq_save(flags);
87 86
88 this_cpu = raw_smp_processor_id(); 87 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 88 now = sched_clock_cpu(this_cpu);
90 /* 89 /*
91 * If in an NMI context then dont risk lockups and return the 90 * If in an NMI context then dont risk lockups and return the
92 * cpu_clock() time: 91 * cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b946..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
120 119
121 return ret; 120 return ret;
122} 121}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab7..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48} 48}
49 49
50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{
54 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 long disabled;
58 int cpu;
59 int pc;
60
61 if (unlikely(!ftrace_function_enabled))
62 return;
63
64 pc = preempt_count();
65 preempt_disable_notrace();
66 local_save_flags(flags);
67 cpu = raw_smp_processor_id();
68 data = tr->data[cpu];
69 disabled = atomic_inc_return(&data->disabled);
70
71 if (likely(disabled == 1))
72 trace_function(tr, ip, parent_ip, flags, pc);
73
74 atomic_dec(&data->disabled);
75 preempt_enable_notrace();
76}
77
78/* Our option */ 50/* Our option */
79enum { 51enum {
80 TRACE_FUNC_OPT_STACK = 0x1, 52 TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
85static void 57static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 58function_trace_call(unsigned long ip, unsigned long parent_ip,
87 struct ftrace_ops *op, struct pt_regs *pt_regs) 59 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 60{
90 struct trace_array *tr = func_trace; 61 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 62 struct trace_array_cpu *data;
92 unsigned long flags; 63 unsigned long flags;
93 long disabled; 64 int bit;
94 int cpu; 65 int cpu;
95 int pc; 66 int pc;
96 67
97 if (unlikely(!ftrace_function_enabled)) 68 if (unlikely(!ftrace_function_enabled))
98 return; 69 return;
99 70
100 /* 71 pc = preempt_count();
101 * Need to use raw, since this must be called before the 72 preempt_disable_notrace();
102 * recursive protection is performed.
103 */
104 local_irq_save(flags);
105 cpu = raw_smp_processor_id();
106 data = tr->data[cpu];
107 disabled = atomic_inc_return(&data->disabled);
108 73
109 if (likely(disabled == 1)) { 74 bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
110 pc = preempt_count(); 75 if (bit < 0)
76 goto out;
77
78 cpu = smp_processor_id();
79 data = tr->data[cpu];
80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags);
111 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
112 } 83 }
84 trace_clear_recursion(bit);
113 85
114 atomic_dec(&data->disabled); 86 out:
115 local_irq_restore(flags); 87 preempt_enable_notrace();
116} 88}
117 89
118static void 90static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
185{ 157{
186 ftrace_function_enabled = 0; 158 ftrace_function_enabled = 0;
187 159
188 if (trace_flags & TRACE_ITER_PREEMPTONLY)
189 trace_ops.func = function_trace_call_preempt_only;
190 else
191 trace_ops.func = function_trace_call;
192
193 if (func_flags.val & TRACE_FUNC_OPT_STACK) 160 if (func_flags.val & TRACE_FUNC_OPT_STACK)
194 register_ftrace_function(&trace_stack_ops); 161 register_ftrace_function(&trace_stack_ops);
195 else 162 else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40 48#define TRACE_GRAPH_PRINT_IRQS 0x40
49 49
50static unsigned int max_depth;
51
50static struct tracer_opt trace_opts[] = { 52static struct tracer_opt trace_opts[] = {
51 /* Display overruns? (for self-debug purpose) */ 53 /* Display overruns? (for self-debug purpose) */
52 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 54 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
189 191
190 ftrace_pop_return_trace(&trace, &ret, frame_pointer); 192 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
191 trace.rettime = trace_clock_local(); 193 trace.rettime = trace_clock_local();
192 ftrace_graph_return(&trace);
193 barrier(); 194 barrier();
194 current->curr_ret_stack--; 195 current->curr_ret_stack--;
195 196
197 /*
198 * The trace should run after decrementing the ret counter
 199 * in case an interrupt comes in. We don't want to
200 * lose the interrupt if max_depth is set.
201 */
202 ftrace_graph_return(&trace);
203
196 if (unlikely(!ret)) { 204 if (unlikely(!ret)) {
197 ftrace_graph_stop(); 205 ftrace_graph_stop();
198 WARN_ON(1); 206 WARN_ON(1);
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
250 return 0; 258 return 0;
251 259
252 /* trace it when it is-nested-in or is a function enabled. */ 260 /* trace it when it is-nested-in or is a function enabled. */
253 if (!(trace->depth || ftrace_graph_addr(trace->func)) || 261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
254 ftrace_graph_ignore_irqs()) 262 ftrace_graph_ignore_irqs()) ||
263 (max_depth && trace->depth >= max_depth))
255 return 0; 264 return 0;
256 265
257 local_irq_save(flags); 266 local_irq_save(flags);
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
1457#endif 1466#endif
1458}; 1467};
1459 1468
1469
1470static ssize_t
1471graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
1472 loff_t *ppos)
1473{
1474 unsigned long val;
1475 int ret;
1476
1477 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
1478 if (ret)
1479 return ret;
1480
1481 max_depth = val;
1482
1483 *ppos += cnt;
1484
1485 return cnt;
1486}
1487
1488static ssize_t
1489graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
1490 loff_t *ppos)
1491{
 1492 char buf[15]; /* More than enough to hold UINT_MAX + "\n" */
1493 int n;
1494
1495 n = sprintf(buf, "%d\n", max_depth);
1496
1497 return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
1498}
1499
1500static const struct file_operations graph_depth_fops = {
1501 .open = tracing_open_generic,
1502 .write = graph_depth_write,
1503 .read = graph_depth_read,
1504 .llseek = generic_file_llseek,
1505};
1506
1507static __init int init_graph_debugfs(void)
1508{
1509 struct dentry *d_tracer;
1510
1511 d_tracer = tracing_init_dentry();
1512 if (!d_tracer)
1513 return 0;
1514
1515 trace_create_file("max_graph_depth", 0644, d_tracer,
1516 NULL, &graph_depth_fops);
1517
1518 return 0;
1519}
1520fs_initcall(init_graph_debugfs);
1521
1460static __init int init_graph_trace(void) 1522static __init int init_graph_trace(void)
1461{ 1523{
1462 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1524 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
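A standalone check of the depth cut-off added to trace_graph_entry(): with the new max_graph_depth file (created by init_graph_debugfs()) set to 1, only depth-0 entries are recorded, while 0 keeps the old unlimited behaviour. Mock values, illustration only.

#include <stdio.h>

/* Mirrors the cut-off added to trace_graph_entry(). */
static int traced(unsigned int max_depth, int depth)
{
        if (max_depth && depth >= max_depth)
                return 0;
        return 1;
}

int main(void)
{
        int depth;

        for (depth = 0; depth < 3; depth++)
                printf("max_graph_depth=1, depth=%d -> %s\n",
                       depth, traced(1, depth) ? "traced" : "skipped");
        /* 0 means no limit. */
        printf("max_graph_depth=0, depth=5 -> %s\n",
               traced(0, 5) ? "traced" : "skipped");
        return 0;
}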
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc7..697e88d13907 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -739,12 +739,11 @@ static int task_state_char(unsigned long state)
739struct trace_event *ftrace_find_event(int type) 739struct trace_event *ftrace_find_event(int type)
740{ 740{
741 struct trace_event *event; 741 struct trace_event *event;
742 struct hlist_node *n;
743 unsigned key; 742 unsigned key;
744 743
745 key = type & (EVENT_HASHSIZE - 1); 744 key = type & (EVENT_HASHSIZE - 1);
746 745
747 hlist_for_each_entry(event, n, &event_hash[key], node) { 746 hlist_for_each_entry(event, &event_hash[key], node) {
748 if (event->type == type) 747 if (event->type == type)
749 return event; 748 return event;
750 } 749 }
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
66#define TP_FLAG_TRACE 1 66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2 67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4 68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70 69
71 70
72/* data_rloc: data relative location, compatible with u32 */ 71/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca0..75aa97fbe1a1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,8 +15,8 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h>
18#include <trace/events/sched.h> 19#include <trace/events/sched.h>
19
20#include "trace.h" 20#include "trace.h"
21 21
22static struct trace_array *wakeup_trace; 22static struct trace_array *wakeup_trace;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a815..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
415 * The ftrace infrastructure should provide the recursion 415 * The ftrace infrastructure should provide the recursion
416 * protection. If not, this will crash the kernel! 416 * protection. If not, this will crash the kernel!
417 */ 417 */
418 trace_selftest_recursion_cnt++; 418 if (trace_selftest_recursion_cnt++ > 10)
419 return;
419 DYN_FTRACE_TEST_NAME(); 420 DYN_FTRACE_TEST_NAME();
420} 421}
421 422
@@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)
452 char *func_name; 453 char *func_name;
453 int len; 454 int len;
454 int ret; 455 int ret;
455 int cnt;
456 456
457 /* The previous test PASSED */ 457 /* The previous test PASSED */
458 pr_cont("PASSED\n"); 458 pr_cont("PASSED\n");
@@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)
510 510
511 unregister_ftrace_function(&test_recsafe_probe); 511 unregister_ftrace_function(&test_recsafe_probe);
512 512
513 /*
514 * If arch supports all ftrace features, and no other task
515 * was on the list, we should be fine.
516 */
517 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
518 cnt = 2; /* Should have recursed */
519 else
520 cnt = 1;
521
522 ret = -1; 513 ret = -1;
523 if (trace_selftest_recursion_cnt != cnt) { 514 if (trace_selftest_recursion_cnt != 2) {
524 pr_cont("*callback not called expected %d times (%d)* ", 515 pr_cont("*callback not called expected 2 times (%d)* ",
525 cnt, trace_selftest_recursion_cnt); 516 trace_selftest_recursion_cnt);
526 goto out; 517 goto out;
527 } 518 }
528 519
@@ -568,7 +559,7 @@ trace_selftest_function_regs(void)
568 int ret; 559 int ret;
569 int supported = 0; 560 int supported = 0;
570 561
571#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 562#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
572 supported = 1; 563 supported = 1;
573#endif 564#endif
574 565
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c2..7a809e321058 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/syscalls.h>
3#include <linux/slab.h> 4#include <linux/slab.h>
4#include <linux/kernel.h> 5#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ 6#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
@@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
47} 48}
48#endif 49#endif
49 50
51#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
52/*
53 * Some architectures that allow for 32bit applications
 54 * to run on a 64bit kernel do not map the syscalls for
55 * the 32bit tasks the same as they do for 64bit tasks.
56 *
57 * *cough*x86*cough*
58 *
59 * In such a case, instead of reporting the wrong syscalls,
60 * simply ignore them.
61 *
62 * For an arch to ignore the compat syscalls it needs to
63 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
64 * define the function arch_trace_is_compat_syscall() to let
65 * the tracing system know that it should ignore it.
66 */
67static int
68trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
69{
70 if (unlikely(arch_trace_is_compat_syscall(regs)))
71 return -1;
72
73 return syscall_get_nr(task, regs);
74}
75#else
76static inline int
77trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
78{
79 return syscall_get_nr(task, regs);
80}
81#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82
50static __init struct syscall_metadata * 83static __init struct syscall_metadata *
51find_syscall_meta(unsigned long syscall) 84find_syscall_meta(unsigned long syscall)
52{ 85{
@@ -77,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
77 return syscalls_metadata[nr]; 110 return syscalls_metadata[nr];
78} 111}
79 112
80enum print_line_t 113static enum print_line_t
81print_syscall_enter(struct trace_iterator *iter, int flags, 114print_syscall_enter(struct trace_iterator *iter, int flags,
82 struct trace_event *event) 115 struct trace_event *event)
83{ 116{
@@ -130,7 +163,7 @@ end:
130 return TRACE_TYPE_HANDLED; 163 return TRACE_TYPE_HANDLED;
131} 164}
132 165
133enum print_line_t 166static enum print_line_t
134print_syscall_exit(struct trace_iterator *iter, int flags, 167print_syscall_exit(struct trace_iterator *iter, int flags,
135 struct trace_event *event) 168 struct trace_event *event)
136{ 169{
@@ -270,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
270 return ret; 303 return ret;
271} 304}
272 305
273void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
274{ 307{
275 struct syscall_trace_enter *entry; 308 struct syscall_trace_enter *entry;
276 struct syscall_metadata *sys_data; 309 struct syscall_metadata *sys_data;
277 struct ring_buffer_event *event; 310 struct ring_buffer_event *event;
278 struct ring_buffer *buffer; 311 struct ring_buffer *buffer;
279 int size;
280 int syscall_nr; 312 int syscall_nr;
313 int size;
281 314
282 syscall_nr = syscall_get_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
283 if (syscall_nr < 0) 316 if (syscall_nr < 0)
284 return; 317 return;
285 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 318 if (!test_bit(syscall_nr, enabled_enter_syscalls))
@@ -305,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
305 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 338 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
306} 339}
307 340
308void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
309{ 342{
310 struct syscall_trace_exit *entry; 343 struct syscall_trace_exit *entry;
311 struct syscall_metadata *sys_data; 344 struct syscall_metadata *sys_data;
@@ -313,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
313 struct ring_buffer *buffer; 346 struct ring_buffer *buffer;
314 int syscall_nr; 347 int syscall_nr;
315 348
316 syscall_nr = syscall_get_nr(current, regs); 349 syscall_nr = trace_get_syscall_nr(current, regs);
317 if (syscall_nr < 0) 350 if (syscall_nr < 0)
318 return; 351 return;
319 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 352 if (!test_bit(syscall_nr, enabled_exit_syscalls))
@@ -337,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
337 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
338} 371}
339 372
340int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_call *call)
341{ 374{
342 int ret = 0; 375 int ret = 0;
343 int num; 376 int num;
@@ -356,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
356 return ret; 389 return ret;
357} 390}
358 391
359void unreg_event_syscall_enter(struct ftrace_event_call *call) 392static void unreg_event_syscall_enter(struct ftrace_event_call *call)
360{ 393{
361 int num; 394 int num;
362 395
@@ -371,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
371 mutex_unlock(&syscall_trace_lock); 404 mutex_unlock(&syscall_trace_lock);
372} 405}
373 406
374int reg_event_syscall_exit(struct ftrace_event_call *call) 407static int reg_event_syscall_exit(struct ftrace_event_call *call)
375{ 408{
376 int ret = 0; 409 int ret = 0;
377 int num; 410 int num;
@@ -390,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
390 return ret; 423 return ret;
391} 424}
392 425
393void unreg_event_syscall_exit(struct ftrace_event_call *call) 426static void unreg_event_syscall_exit(struct ftrace_event_call *call)
394{ 427{
395 int num; 428 int num;
396 429
@@ -459,7 +492,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
459 return (unsigned long)sys_call_table[nr]; 492 return (unsigned long)sys_call_table[nr];
460} 493}
461 494
462int __init init_ftrace_syscalls(void) 495static int __init init_ftrace_syscalls(void)
463{ 496{
464 struct syscall_metadata *meta; 497 struct syscall_metadata *meta;
465 unsigned long addr; 498 unsigned long addr;
@@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
502 int rctx; 535 int rctx;
503 int size; 536 int size;
504 537
505 syscall_nr = syscall_get_nr(current, regs); 538 syscall_nr = trace_get_syscall_nr(current, regs);
506 if (syscall_nr < 0) 539 if (syscall_nr < 0)
507 return; 540 return;
508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 541 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
@@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
578 int rctx; 611 int rctx;
579 int size; 612 int size;
580 613
581 syscall_nr = syscall_get_nr(current, regs); 614 syscall_nr = trace_get_syscall_nr(current, regs);
582 if (syscall_nr < 0) 615 if (syscall_nr < 0)
583 return; 616 return;
584 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 617 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
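As the comment block above says, an architecture opts in by defining ARCH_TRACE_IGNORE_COMPAT_SYSCALLS and supplying arch_trace_is_compat_syscall(). A hypothetical header-style fragment is sketched below; in_32bit_syscall() is a made-up stand-in for whatever the architecture really checks, and the actual x86 implementation is not shown here.

/* Hypothetical arch header fragment -- not the real x86 code. */
#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS

static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
        /*
         * in_32bit_syscall() is a made-up helper standing in for the
         * arch-specific check (e.g. a compat flag derived from regs or
         * thread info).
         */
        return in_32bit_syscall(regs);
}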
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c86e6d4f67fb..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,20 +28,21 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
31struct trace_uprobe_filter {
32 rwlock_t rwlock;
33 int nr_systemwide;
34 struct list_head perf_events;
35};
36
31/* 37/*
32 * uprobe event core functions 38 * uprobe event core functions
33 */ 39 */
34struct trace_uprobe;
35struct uprobe_trace_consumer {
36 struct uprobe_consumer cons;
37 struct trace_uprobe *tu;
38};
39
40struct trace_uprobe { 40struct trace_uprobe {
41 struct list_head list; 41 struct list_head list;
42 struct ftrace_event_class class; 42 struct ftrace_event_class class;
43 struct ftrace_event_call call; 43 struct ftrace_event_call call;
44 struct uprobe_trace_consumer *consumer; 44 struct trace_uprobe_filter filter;
45 struct uprobe_consumer consumer;
45 struct inode *inode; 46 struct inode *inode;
46 char *filename; 47 char *filename;
47 unsigned long offset; 48 unsigned long offset;
@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
64 65
65static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
66 67
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{
70 rwlock_init(&filter->rwlock);
71 filter->nr_systemwide = 0;
72 INIT_LIST_HEAD(&filter->perf_events);
73}
74
75static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
76{
77 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78}
79
67/* 80/*
68 * Allocate new trace_uprobe and initialize it (including uprobes). 81 * Allocate new trace_uprobe and initialize it (including uprobes).
69 */ 82 */
@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
92 goto error; 105 goto error;
93 106
94 INIT_LIST_HEAD(&tu->list); 107 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter);
95 return tu; 110 return tu;
96 111
97error: 112error:
@@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
253 if (ret) 268 if (ret)
254 goto fail_address_parse; 269 goto fail_address_parse;
255 270
271 inode = igrab(path.dentry->d_inode);
272 path_put(&path);
273
274 if (!inode || !S_ISREG(inode->i_mode)) {
275 ret = -EINVAL;
276 goto fail_address_parse;
277 }
278
256 ret = kstrtoul(arg, 0, &offset); 279 ret = kstrtoul(arg, 0, &offset);
257 if (ret) 280 if (ret)
258 goto fail_address_parse; 281 goto fail_address_parse;
259 282
260 inode = igrab(path.dentry->d_inode);
261
262 argc -= 2; 283 argc -= 2;
263 argv += 2; 284 argv += 2;
264 285
@@ -356,7 +377,7 @@ fail_address_parse:
356 if (inode) 377 if (inode)
357 iput(inode); 378 iput(inode);
358 379
359 pr_info("Failed to parse address.\n"); 380 pr_info("Failed to parse address or file.\n");
360 381
361 return ret; 382 return ret;
362} 383}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
465}; 486};
466 487
467/* uprobe handler */ 488/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{ 490{
470 struct uprobe_trace_entry_head *entry; 491 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event; 492 struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
475 unsigned long irq_flags; 496 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call; 497 struct ftrace_event_call *call = &tu->call;
477 498
478 tu->nhit++;
479
480 local_save_flags(irq_flags); 499 local_save_flags(irq_flags);
481 pc = preempt_count(); 500 pc = preempt_count();
482 501
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc); 505 size, irq_flags, pc);
487 if (!event) 506 if (!event)
488 return; 507 return 0;
489 508
490 entry = ring_buffer_event_data(event); 509 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 510 entry->ip = instruction_pointer(task_pt_regs(current));
492 data = (u8 *)&entry[1]; 511 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++) 512 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495 514
496 if (!filter_current_check_discard(buffer, call, entry, event)) 515 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
517
518 return 0;
498} 519}
499 520
500/* Event entry printers */ 521/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
533 return TRACE_TYPE_PARTIAL_LINE; 554 return TRACE_TYPE_PARTIAL_LINE;
534} 555}
535 556
536static int probe_event_enable(struct trace_uprobe *tu, int flag) 557static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
537{ 558{
538 struct uprobe_trace_consumer *utc; 559 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
539 int ret = 0; 560}
540 561
541 if (!tu->inode || tu->consumer) 562typedef bool (*filter_func_t)(struct uprobe_consumer *self,
542 return -EINTR; 563 enum uprobe_filter_ctx ctx,
564 struct mm_struct *mm);
543 565
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); 566static int
545 if (!utc) 567probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
568{
569 int ret = 0;
570
571 if (is_trace_uprobe_enabled(tu))
546 return -EINTR; 572 return -EINTR;
547 573
548 utc->cons.handler = uprobe_dispatcher; 574 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555 575
556 tu->flags |= flag; 576 tu->flags |= flag;
557 utc->tu = tu; 577 tu->consumer.filter = filter;
558 tu->consumer = utc; 578 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
579 if (ret)
580 tu->flags &= ~flag;
559 581
560 return 0; 582 return ret;
561} 583}
562 584
563static void probe_event_disable(struct trace_uprobe *tu, int flag) 585static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{ 586{
565 if (!tu->inode || !tu->consumer) 587 if (!is_trace_uprobe_enabled(tu))
566 return; 588 return;
567 589
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); 590 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
591
592 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
569 tu->flags &= ~flag; 593 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572} 594}
573 595
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 596static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
642} 664}
643 665
644#ifdef CONFIG_PERF_EVENTS 666#ifdef CONFIG_PERF_EVENTS
667static bool
668__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
669{
670 struct perf_event *event;
671
672 if (filter->nr_systemwide)
673 return true;
674
675 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
676 if (event->hw.tp_target->mm == mm)
677 return true;
678 }
679
680 return false;
681}
682
683static inline bool
684uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
685{
686 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
687}
688
689static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
690{
691 bool done;
692
693 write_lock(&tu->filter.rwlock);
694 if (event->hw.tp_target) {
695 /*
 696 * event->parent != NULL means copy_process(), so we can avoid
697 * uprobe_apply(). current->mm must be probed and we can rely
698 * on dup_mmap() which preserves the already installed bp's.
699 *
700 * attr.enable_on_exec means that exec/mmap will install the
701 * breakpoints we need.
702 */
703 done = tu->filter.nr_systemwide ||
704 event->parent || event->attr.enable_on_exec ||
705 uprobe_filter_event(tu, event);
706 list_add(&event->hw.tp_list, &tu->filter.perf_events);
707 } else {
708 done = tu->filter.nr_systemwide;
709 tu->filter.nr_systemwide++;
710 }
711 write_unlock(&tu->filter.rwlock);
712
713 if (!done)
714 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
715
716 return 0;
717}
718
719static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
720{
721 bool done;
722
723 write_lock(&tu->filter.rwlock);
724 if (event->hw.tp_target) {
725 list_del(&event->hw.tp_list);
726 done = tu->filter.nr_systemwide ||
727 (event->hw.tp_target->flags & PF_EXITING) ||
728 uprobe_filter_event(tu, event);
729 } else {
730 tu->filter.nr_systemwide--;
731 done = tu->filter.nr_systemwide;
732 }
733 write_unlock(&tu->filter.rwlock);
734
735 if (!done)
736 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
737
738 return 0;
739}
740
741static bool uprobe_perf_filter(struct uprobe_consumer *uc,
742 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
743{
744 struct trace_uprobe *tu;
745 int ret;
746
747 tu = container_of(uc, struct trace_uprobe, consumer);
748 read_lock(&tu->filter.rwlock);
749 ret = __uprobe_perf_filter(&tu->filter, mm);
750 read_unlock(&tu->filter.rwlock);
751
752 return ret;
753}
754
645/* uprobe profile handler */ 755/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{ 757{
648 struct ftrace_event_call *call = &tu->call; 758 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry; 759 struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
652 int size, __size, i; 762 int size, __size, i;
653 int rctx; 763 int rctx;
654 764
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
766 return UPROBE_HANDLER_REMOVE;
767
655 __size = sizeof(*entry) + tu->size; 768 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32); 770 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return; 772 return 0;
660 773
661 preempt_disable(); 774 preempt_disable();
662 775
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
664 if (!entry) 777 if (!entry)
665 goto out; 778 goto out;
666 779
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 780 entry->ip = instruction_pointer(task_pt_regs(current));
668 data = (u8 *)&entry[1]; 781 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++) 782 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
674 787
675 out: 788 out:
676 preempt_enable(); 789 preempt_enable();
790 return 0;
677} 791}
678#endif /* CONFIG_PERF_EVENTS */ 792#endif /* CONFIG_PERF_EVENTS */
679 793
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
684 798
685 switch (type) { 799 switch (type) {
686 case TRACE_REG_REGISTER: 800 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE); 801 return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
688 802
689 case TRACE_REG_UNREGISTER: 803 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE); 804 probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
692 806
693#ifdef CONFIG_PERF_EVENTS 807#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER: 808 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE); 809 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
696 810
697 case TRACE_REG_PERF_UNREGISTER: 811 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE); 812 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0; 813 return 0;
814
815 case TRACE_REG_PERF_OPEN:
816 return uprobe_perf_open(tu, data);
817
818 case TRACE_REG_PERF_CLOSE:
819 return uprobe_perf_close(tu, data);
820
700#endif 821#endif
701 default: 822 default:
702 return 0; 823 return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
706 827
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 828static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{ 829{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu; 830 struct trace_uprobe *tu;
831 int ret = 0;
711 832
712 utc = container_of(con, struct uprobe_trace_consumer, cons); 833 tu = container_of(con, struct trace_uprobe, consumer);
713 tu = utc->tu; 834 tu->nhit++;
714 if (!tu || tu->consumer != utc)
715 return 0;
716 835
717 if (tu->flags & TP_FLAG_TRACE) 836 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs); 837 ret |= uprobe_trace_func(tu, regs);
719 838
720#ifdef CONFIG_PERF_EVENTS 839#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE) 840 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs); 841 ret |= uprobe_perf_func(tu, regs);
723#endif 842#endif
724 return 0; 843 return ret;
725} 844}
726 845
727static struct trace_event_functions uprobe_funcs = { 846static struct trace_event_functions uprobe_funcs = {
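The reworked dispatcher leans on the embedded-member idiom: the uprobe_consumer now lives inside trace_uprobe and container_of() recovers the outer object, which is why the separate uprobe_trace_consumer allocation could go away. A standalone illustration of the idiom with generic names (not the tracing structures themselves, and a simplified local container_of):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct consumer {                       /* plays the role of uprobe_consumer */
        int (*handler)(struct consumer *c);
};

struct probe {                          /* plays the role of trace_uprobe */
        const char *name;
        struct consumer consumer;       /* embedded, no separate allocation */
};

static int dispatcher(struct consumer *c)
{
        struct probe *p = container_of(c, struct probe, consumer);

        printf("hit probe %s\n", p->name);
        return 0;
}

int main(void)
{
        struct probe p = { .name = "demo", .consumer = { .handler = dispatcher } };

        /* The core would invoke the handler with only the consumer pointer. */
        return p.consumer.handler(&p.consumer);
}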
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabfa..0c05a4592047 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
192static struct tracepoint_entry *get_tracepoint(const char *name) 192static struct tracepoint_entry *get_tracepoint(const char *name)
193{ 193{
194 struct hlist_head *head; 194 struct hlist_head *head;
195 struct hlist_node *node;
196 struct tracepoint_entry *e; 195 struct tracepoint_entry *e;
197 u32 hash = jhash(name, strlen(name), 0); 196 u32 hash = jhash(name, strlen(name), 0);
198 197
199 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 198 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
200 hlist_for_each_entry(e, node, head, hlist) { 199 hlist_for_each_entry(e, head, hlist) {
201 if (!strcmp(name, e->name)) 200 if (!strcmp(name, e->name))
202 return e; 201 return e;
203 } 202 }
@@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name)
211static struct tracepoint_entry *add_tracepoint(const char *name) 210static struct tracepoint_entry *add_tracepoint(const char *name)
212{ 211{
213 struct hlist_head *head; 212 struct hlist_head *head;
214 struct hlist_node *node;
215 struct tracepoint_entry *e; 213 struct tracepoint_entry *e;
216 size_t name_len = strlen(name) + 1; 214 size_t name_len = strlen(name) + 1;
217 u32 hash = jhash(name, name_len-1, 0); 215 u32 hash = jhash(name, name_len-1, 0);
218 216
219 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 217 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
220 hlist_for_each_entry(e, node, head, hlist) { 218 hlist_for_each_entry(e, head, hlist) {
221 if (!strcmp(name, e->name)) { 219 if (!strcmp(name, e->name)) {
222 printk(KERN_NOTICE 220 printk(KERN_NOTICE
223 "tracepoint %s busy\n", name); 221 "tracepoint %s busy\n", name);
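The hunks here and in trace_output.c, user.c and user-return-notifier.c are mechanical fallout of the hlist iterator signature change: callers no longer pass a struct hlist_node cursor. A kernel-style before/after sketch of the call shape, using the get_tracepoint() loop above; fragment only, not standalone code.

/* before: explicit node cursor */
struct tracepoint_entry *e;
struct hlist_node *node;

hlist_for_each_entry(e, node, head, hlist)
        if (!strcmp(name, e->name))
                return e;

/* after: the macro keeps its own cursor */
hlist_for_each_entry(e, head, hlist)
        if (!strcmp(name, e->name))
                return e;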
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b44690..a1dd9a1b1327 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts; 34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled;
35 u64 ac_etime; 36 u64 ac_etime;
36 37
37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,
65 stats->ac_ppid = pid_alive(tsk) ? 66 stats->ac_ppid = pid_alive(tsk) ?
66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; 67 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
67 rcu_read_unlock(); 68 rcu_read_unlock();
68 stats->ac_utime = cputime_to_usecs(tsk->utime); 69
69 stats->ac_stime = cputime_to_usecs(tsk->stime); 70 task_cputime(tsk, &utime, &stime);
70 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); 71 stats->ac_utime = cputime_to_usecs(utime);
71 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); 72 stats->ac_stime = cputime_to_usecs(stime);
73
74 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
75 stats->ac_utimescaled = cputime_to_usecs(utimescaled);
76 stats->ac_stimescaled = cputime_to_usecs(stimescaled);
77
72 stats->ac_minflt = tsk->min_flt; 78 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 79 stats->ac_majflt = tsk->maj_flt;
74 80
@@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
115#undef KB 121#undef KB
116#undef MB 122#undef MB
117 123
118/** 124static void __acct_update_integrals(struct task_struct *tsk,
119 * acct_update_integrals - update mm integral fields in task_struct 125 cputime_t utime, cputime_t stime)
120 * @tsk: task_struct for accounting
121 */
122void acct_update_integrals(struct task_struct *tsk)
123{ 126{
124 if (likely(tsk->mm)) { 127 if (likely(tsk->mm)) {
125 cputime_t time, dtime; 128 cputime_t time, dtime;
@@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
128 u64 delta; 131 u64 delta;
129 132
130 local_irq_save(flags); 133 local_irq_save(flags);
131 time = tsk->stime + tsk->utime; 134 time = stime + utime;
132 dtime = time - tsk->acct_timexpd; 135 dtime = time - tsk->acct_timexpd;
133 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 136 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
134 delta = value.tv_sec; 137 delta = value.tv_sec;
@@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
145} 148}
146 149
147/** 150/**
151 * acct_update_integrals - update mm integral fields in task_struct
152 * @tsk: task_struct for accounting
153 */
154void acct_update_integrals(struct task_struct *tsk)
155{
156 cputime_t utime, stime;
157
158 task_cputime(tsk, &utime, &stime);
159 __acct_update_integrals(tsk, utime, stime);
160}
161
162/**
163 * acct_account_cputime - update mm integral after cputime update
164 * @tsk: task_struct for accounting
165 */
166void acct_account_cputime(struct task_struct *tsk)
167{
168 __acct_update_integrals(tsk, tsk->utime, tsk->stime);
169}
170
171/**
148 * acct_clear_integrals - clear the mm integral fields in task_struct 172 * acct_clear_integrals - clear the mm integral fields in task_struct
149 * @tsk: task_struct whose accounting fields are cleared 173 * @tsk: task_struct whose accounting fields are cleared
150 */ 174 */
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1fb..394f70b17162 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
34void fire_user_return_notifiers(void) 34void fire_user_return_notifiers(void)
35{ 35{
36 struct user_return_notifier *urn; 36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2; 37 struct hlist_node *tmp2;
38 struct hlist_head *head; 38 struct hlist_head *head;
39 39
40 head = &get_cpu_var(return_notifier_list); 40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) 41 hlist_for_each_entry_safe(urn, tmp2, head, link)
42 urn->on_user_return(urn); 42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list); 43 put_cpu_var(return_notifier_list);
44} 44}
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e53a5f..e81978e8c03b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -47,9 +47,7 @@ struct user_namespace init_user_ns = {
47 .count = 4294967295U, 47 .count = 4294967295U,
48 }, 48 },
49 }, 49 },
50 .kref = { 50 .count = ATOMIC_INIT(3),
51 .refcount = ATOMIC_INIT(3),
52 },
53 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
54 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
@@ -107,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up)
107static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) 105static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
108{ 106{
109 struct user_struct *user; 107 struct user_struct *user;
110 struct hlist_node *h;
111 108
112 hlist_for_each_entry(user, h, hashent, uidhash_node) { 109 hlist_for_each_entry(user, hashent, uidhash_node) {
113 if (uid_eq(user->uid, uid)) { 110 if (uid_eq(user->uid, uid)) {
114 atomic_inc(&user->__count); 111 atomic_inc(&user->__count);
115 return user; 112 return user;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2b042c42fbc4..8b650837083e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -78,7 +78,7 @@ int create_user_ns(struct cred *new)
78 return ret; 78 return ret;
79 } 79 }
80 80
81 kref_init(&ns->kref); 81 atomic_set(&ns->count, 1);
82 /* Leave the new->user_ns reference with the new user namespace. */ 82 /* Leave the new->user_ns reference with the new user namespace. */
83 ns->parent = parent_ns; 83 ns->parent = parent_ns;
84 ns->owner = owner; 84 ns->owner = owner;
@@ -104,15 +104,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
104 return create_user_ns(cred); 104 return create_user_ns(cred);
105} 105}
106 106
107void free_user_ns(struct kref *kref) 107void free_user_ns(struct user_namespace *ns)
108{ 108{
109 struct user_namespace *parent, *ns = 109 struct user_namespace *parent;
110 container_of(kref, struct user_namespace, kref);
111 110
112 parent = ns->parent; 111 do {
113 proc_free_inum(ns->proc_inum); 112 parent = ns->parent;
114 kmem_cache_free(user_ns_cachep, ns); 113 proc_free_inum(ns->proc_inum);
115 put_user_ns(parent); 114 kmem_cache_free(user_ns_cachep, ns);
115 ns = parent;
116 } while (atomic_dec_and_test(&parent->count));
116} 117}
117EXPORT_SYMBOL(free_user_ns); 118EXPORT_SYMBOL(free_user_ns);
118 119
@@ -519,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = {
519 .show = projid_m_show, 520 .show = projid_m_show,
520}; 521};
521 522
523static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
524{
525 u32 upper_first, lower_first, upper_last, lower_last;
526 unsigned idx;
527
528 upper_first = extent->first;
529 lower_first = extent->lower_first;
530 upper_last = upper_first + extent->count - 1;
531 lower_last = lower_first + extent->count - 1;
532
533 for (idx = 0; idx < new_map->nr_extents; idx++) {
534 u32 prev_upper_first, prev_lower_first;
535 u32 prev_upper_last, prev_lower_last;
536 struct uid_gid_extent *prev;
537
538 prev = &new_map->extent[idx];
539
540 prev_upper_first = prev->first;
541 prev_lower_first = prev->lower_first;
542 prev_upper_last = prev_upper_first + prev->count - 1;
543 prev_lower_last = prev_lower_first + prev->count - 1;
544
545 /* Does the upper range intersect a previous extent? */
546 if ((prev_upper_first <= upper_last) &&
547 (prev_upper_last >= upper_first))
548 return true;
549
550 /* Does the lower range intersect a previous extent? */
551 if ((prev_lower_first <= lower_last) &&
552 (prev_lower_last >= lower_first))
553 return true;
554 }
555 return false;
556}
557
558
522static DEFINE_MUTEX(id_map_mutex); 559static DEFINE_MUTEX(id_map_mutex);
523 560
524static ssize_t map_write(struct file *file, const char __user *buf, 561static ssize_t map_write(struct file *file, const char __user *buf,
@@ -531,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
531 struct user_namespace *ns = seq->private; 568 struct user_namespace *ns = seq->private;
532 struct uid_gid_map new_map; 569 struct uid_gid_map new_map;
533 unsigned idx; 570 unsigned idx;
534 struct uid_gid_extent *extent, *last = NULL; 571 struct uid_gid_extent *extent = NULL;
535 unsigned long page = 0; 572 unsigned long page = 0;
536 char *kbuf, *pos, *next_line; 573 char *kbuf, *pos, *next_line;
537 ssize_t ret = -EINVAL; 574 ssize_t ret = -EINVAL;
@@ -634,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
634 if ((extent->lower_first + extent->count) <= extent->lower_first) 671 if ((extent->lower_first + extent->count) <= extent->lower_first)
635 goto out; 672 goto out;
636 673
637 /* For now only accept extents that are strictly in order */ 674 /* Do the ranges in extent overlap any previous extents? */
638 if (last && 675 if (mappings_overlap(&new_map, extent))
639 (((last->first + last->count) > extent->first) ||
640 ((last->lower_first + last->count) > extent->lower_first)))
641 goto out; 676 goto out;
642 677
643 new_map.nr_extents++; 678 new_map.nr_extents++;
644 last = extent;
645 679
646 /* Fail if the file contains too many extents */ 680 /* Fail if the file contains too many extents */
647 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && 681 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 08b197e8c485..a47fc5de3113 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void)
30/* 30/*
31 * Clone a new ns copying an original utsname, setting refcount to 1 31 * Clone a new ns copying an original utsname, setting refcount to 1
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d820..4f69f9a5e221 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17 17
18#ifdef CONFIG_PROC_SYSCTL
19
18static void *get_uts(ctl_table *table, int write) 20static void *get_uts(ctl_table *table, int write)
19{ 21{
20 char *which = table->data; 22 char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
38 up_write(&uts_sem); 40 up_write(&uts_sem);
39} 41}
40 42
41#ifdef CONFIG_PROC_SYSCTL
42/* 43/*
43 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
44 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3d0b02..4a944676358e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h> 25#include <linux/smpboot.h>
26#include <linux/sched/rt.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
@@ -112,9 +113,9 @@ static int get_softlockup_thresh(void)
112 * resolution, and we don't need to waste time with a big divide when 113 * resolution, and we don't need to waste time with a big divide when
113 * 2^30ns == 1.074s. 114 * 2^30ns == 1.074s.
114 */ 115 */
115static unsigned long get_timestamp(int this_cpu) 116static unsigned long get_timestamp(void)
116{ 117{
117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 118 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
118} 119}
119 120
120static void set_sample_period(void) 121static void set_sample_period(void)
@@ -132,9 +133,7 @@ static void set_sample_period(void)
132/* Commands for resetting the watchdog */ 133/* Commands for resetting the watchdog */
133static void __touch_watchdog(void) 134static void __touch_watchdog(void)
134{ 135{
135 int this_cpu = smp_processor_id(); 136 __this_cpu_write(watchdog_touch_ts, get_timestamp());
136
137 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
138} 137}
139 138
140void touch_softlockup_watchdog(void) 139void touch_softlockup_watchdog(void)
@@ -195,7 +194,7 @@ static int is_hardlockup(void)
195 194
196static int is_softlockup(unsigned long touch_ts) 195static int is_softlockup(unsigned long touch_ts)
197{ 196{
198 unsigned long now = get_timestamp(smp_processor_id()); 197 unsigned long now = get_timestamp();
199 198
200 /* Warn about unreasonable delays: */ 199 /* Warn about unreasonable delays: */
201 if (time_after(now, touch_ts + get_softlockup_thresh())) 200 if (time_after(now, touch_ts + get_softlockup_thresh()))
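The watchdog hunks above drop the cpu argument because local_clock() already reads the local CPU's clock; the existing ">> 30" trick stays, treating 2^30 ns (about 1.074 s) as one "second" so the hot path avoids a 64-bit divide at the cost of units roughly 7% coarser than real seconds. A quick userspace check of that approximation:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ns = 600ULL * 1000 * 1000 * 1000;	/* 10 minutes in nanoseconds */

	printf("exact:  %llu s\n", (unsigned long long)(ns / 1000000000ULL));	/* 600 */
	printf("approx: %llu\n",   (unsigned long long)(ns >> 30));		/* 558, in units of ~1.074 s */
	return 0;
}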
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576a83c3..81f2457811eb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/hashtable.h>
44 45
45#include "workqueue_sched.h" 46#include "workqueue_internal.h"
46 47
47enum { 48enum {
48 /* 49 /*
49 * global_cwq flags 50 * worker_pool flags
50 * 51 *
51 * A bound gcwq is either associated or disassociated with its CPU. 52 * A bound pool is either associated or disassociated with its CPU.
52 * While associated (!DISASSOCIATED), all workers are bound to the 53 * While associated (!DISASSOCIATED), all workers are bound to the
53 * CPU and none has %WORKER_UNBOUND set and concurrency management 54 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect. 55 * is in effect.
55 * 56 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have 57 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may 58 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one. 59 * be executing on any CPU. The pool behaves as an unbound one.
59 * 60 *
60 * Note that DISASSOCIATED can be flipped only while holding 61 * Note that DISASSOCIATED can be flipped only while holding
61 * assoc_mutex of all pools on the gcwq to avoid changing binding 62 * assoc_mutex to avoid changing binding state while
62 * state while create_worker() is in progress. 63 * create_worker() is in progress.
63 */ 64 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ 66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */
70 69
71 /* worker flags */ 70 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
@@ -79,11 +78,9 @@ enum {
79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
80 WORKER_CPU_INTENSIVE, 79 WORKER_CPU_INTENSIVE,
81 80
82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
83 82
84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
86 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
87 84
88 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
89 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 86 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@ enum {
111 * P: Preemption protected. Disabling preemption is enough and should 108 * P: Preemption protected. Disabling preemption is enough and should
112 * only be modified and accessed from the local cpu. 109 * only be modified and accessed from the local cpu.
113 * 110 *
114 * L: gcwq->lock protected. Access with gcwq->lock held. 111 * L: pool->lock protected. Access with pool->lock held.
115 * 112 *
116 * X: During normal operation, modification requires gcwq->lock and 113 * X: During normal operation, modification requires pool->lock and should
117 * should be done only from local cpu. Either disabling preemption 114 * be done only from local cpu. Either disabling preemption on local
118 * on local cpu or grabbing gcwq->lock is enough for read access. 115 * cpu or grabbing pool->lock is enough for read access. If
119 * If GCWQ_DISASSOCIATED is set, it's identical to L. 116 * POOL_DISASSOCIATED is set, it's identical to L.
120 * 117 *
121 * F: wq->flush_mutex protected. 118 * F: wq->flush_mutex protected.
122 * 119 *
123 * W: workqueue_lock protected. 120 * W: workqueue_lock protected.
124 */ 121 */
125 122
126struct global_cwq; 123/* struct worker is defined in workqueue_internal.h */
127struct worker_pool;
128
129/*
130 * The poor guys doing the actual heavy lifting. All on-duty workers
131 * are either serving the manager role, on idle list or on busy hash.
132 */
133struct worker {
134 /* on idle list while idle, on busy hash table while busy */
135 union {
136 struct list_head entry; /* L: while idle */
137 struct hlist_node hentry; /* L: while busy */
138 };
139
140 struct work_struct *current_work; /* L: work being processed */
141 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142 struct list_head scheduled; /* L: scheduled works */
143 struct task_struct *task; /* I: worker task */
144 struct worker_pool *pool; /* I: the associated pool */
145 /* 64 bytes boundary on 64bit, 32 on 32bit */
146 unsigned long last_active; /* L: last active timestamp */
147 unsigned int flags; /* X: flags */
148 int id; /* I: worker id */
149
150 /* for rebinding worker to CPU */
151 struct work_struct rebind_work; /* L: for busy worker */
152};
153 124
154struct worker_pool { 125struct worker_pool {
155 struct global_cwq *gcwq; /* I: the owning gcwq */ 126 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */
128 int id; /* I: pool ID */
156 unsigned int flags; /* X: flags */ 129 unsigned int flags; /* X: flags */
157 130
158 struct list_head worklist; /* L: list of pending works */ 131 struct list_head worklist; /* L: list of pending works */
@@ -165,34 +138,28 @@ struct worker_pool {
165 struct timer_list idle_timer; /* L: worker idle timeout */ 138 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */ 139 struct timer_list mayday_timer; /* L: SOS timer for workers */
167 140
168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ 141 /* workers are chained either in busy_hash or idle_list */
169 struct ida worker_ida; /* L: for worker IDs */ 142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
170};
171
172/*
173 * Global per-cpu workqueue. There's one and only one for each cpu
174 * and all works are queued and processed here regardless of their
175 * target workqueues.
176 */
177struct global_cwq {
178 spinlock_t lock; /* the gcwq lock */
179 unsigned int cpu; /* I: the associated cpu */
180 unsigned int flags; /* L: GCWQ_* flags */
181
182 /* workers are chained either in busy_hash or pool idle_list */
183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184 /* L: hash of busy workers */ 143 /* L: hash of busy workers */
185 144
186 struct worker_pool pools[NR_WORKER_POOLS]; 145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
187 /* normal and highpri pools */ 146 struct ida worker_ida; /* L: for worker IDs */
147
148 /*
149 * The current concurrency level. As it's likely to be accessed
150 * from other CPUs during try_to_wake_up(), put it in a separate
151 * cacheline.
152 */
153 atomic_t nr_running ____cacheline_aligned_in_smp;
188} ____cacheline_aligned_in_smp; 154} ____cacheline_aligned_in_smp;
189 155
190/* 156/*
191 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of 157 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
192 * work_struct->data are used for flags and thus cwqs need to be 158 * of work_struct->data are used for flags and the remaining high bits
193 * aligned at two's power of the number of flag bits. 159 * point to the pwq; thus, pwqs need to be aligned at two's power of the
160 * number of flag bits.
194 */ 161 */
195struct cpu_workqueue_struct { 162struct pool_workqueue {
196 struct worker_pool *pool; /* I: the associated pool */ 163 struct worker_pool *pool; /* I: the associated pool */
197 struct workqueue_struct *wq; /* I: the owning workqueue */ 164 struct workqueue_struct *wq; /* I: the owning workqueue */
198 int work_color; /* L: current color */ 165 int work_color; /* L: current color */
@@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;
241struct workqueue_struct { 208struct workqueue_struct {
242 unsigned int flags; /* W: WQ_* flags */ 209 unsigned int flags; /* W: WQ_* flags */
243 union { 210 union {
244 struct cpu_workqueue_struct __percpu *pcpu; 211 struct pool_workqueue __percpu *pcpu;
245 struct cpu_workqueue_struct *single; 212 struct pool_workqueue *single;
246 unsigned long v; 213 unsigned long v;
247 } cpu_wq; /* I: cwq's */ 214 } pool_wq; /* I: pwq's */
248 struct list_head list; /* W: list of all workqueues */ 215 struct list_head list; /* W: list of all workqueues */
249 216
250 struct mutex flush_mutex; /* protects wq flushing */ 217 struct mutex flush_mutex; /* protects wq flushing */
251 int work_color; /* F: current work color */ 218 int work_color; /* F: current work color */
252 int flush_color; /* F: current flush color */ 219 int flush_color; /* F: current flush color */
253 atomic_t nr_cwqs_to_flush; /* flush in progress */ 220 atomic_t nr_pwqs_to_flush; /* flush in progress */
254 struct wq_flusher *first_flusher; /* F: first flusher */ 221 struct wq_flusher *first_flusher; /* F: first flusher */
255 struct list_head flusher_queue; /* F: flush waiters */ 222 struct list_head flusher_queue; /* F: flush waiters */
256 struct list_head flusher_overflow; /* F: flush overflow list */ 223 struct list_head flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@ struct workqueue_struct {
259 struct worker *rescuer; /* I: rescue worker */ 226 struct worker *rescuer; /* I: rescue worker */
260 227
261 int nr_drainers; /* W: drain in progress */ 228 int nr_drainers; /* W: drain in progress */
262 int saved_max_active; /* W: saved cwq max_active */ 229 int saved_max_active; /* W: saved pwq max_active */
263#ifdef CONFIG_LOCKDEP 230#ifdef CONFIG_LOCKDEP
264 struct lockdep_map lockdep_map; 231 struct lockdep_map lockdep_map;
265#endif 232#endif
@@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
280#define CREATE_TRACE_POINTS 247#define CREATE_TRACE_POINTS
281#include <trace/events/workqueue.h> 248#include <trace/events/workqueue.h>
282 249
283#define for_each_worker_pool(pool, gcwq) \ 250#define for_each_std_worker_pool(pool, cpu) \
284 for ((pool) = &(gcwq)->pools[0]; \ 251 for ((pool) = &std_worker_pools(cpu)[0]; \
285 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) 252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
286 253
287#define for_each_busy_worker(worker, i, pos, gcwq) \ 254#define for_each_busy_worker(worker, i, pool) \
288 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 255 hash_for_each(pool->busy_hash, i, worker, hentry)
289 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
290 256
291static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, 257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
292 unsigned int sw) 258 unsigned int sw)
293{ 259{
294 if (cpu < nr_cpu_ids) { 260 if (cpu < nr_cpu_ids) {
295 if (sw & 1) { 261 if (sw & 1) {
@@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
300 if (sw & 2) 266 if (sw & 2)
301 return WORK_CPU_UNBOUND; 267 return WORK_CPU_UNBOUND;
302 } 268 }
303 return WORK_CPU_NONE; 269 return WORK_CPU_END;
304} 270}
305 271
306static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
307 struct workqueue_struct *wq) 273 struct workqueue_struct *wq)
308{ 274{
309 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
310} 276}
311 277
312/* 278/*
313 * CPU iterators 279 * CPU iterators
314 * 280 *
315 * An extra gcwq is defined for an invalid cpu number 281 * An extra cpu number is defined using an invalid cpu number
316 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
317 * specific CPU. The following iterators are similar to 283 * specific CPU. The following iterators are similar to for_each_*_cpu()
318 * for_each_*_cpu() iterators but also considers the unbound gcwq. 284 * iterators but also considers the unbound CPU.
319 * 285 *
320 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND 286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
321 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND 287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
322 * for_each_cwq_cpu() : possible CPUs for bound workqueues, 288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
323 * WORK_CPU_UNBOUND for unbound workqueues 289 * WORK_CPU_UNBOUND for unbound workqueues
324 */ 290 */
325#define for_each_gcwq_cpu(cpu) \ 291#define for_each_wq_cpu(cpu) \
326 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ 292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
327 (cpu) < WORK_CPU_NONE; \ 293 (cpu) < WORK_CPU_END; \
328 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) 294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
329 295
330#define for_each_online_gcwq_cpu(cpu) \ 296#define for_each_online_wq_cpu(cpu) \
331 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ 297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
332 (cpu) < WORK_CPU_NONE; \ 298 (cpu) < WORK_CPU_END; \
333 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) 299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
334 300
335#define for_each_cwq_cpu(cpu, wq) \ 301#define for_each_pwq_cpu(cpu, wq) \
336 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ 302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
337 (cpu) < WORK_CPU_NONE; \ 303 (cpu) < WORK_CPU_END; \
338 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
339 305
340#ifdef CONFIG_DEBUG_OBJECTS_WORK 306#ifdef CONFIG_DEBUG_OBJECTS_WORK
341 307
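The renamed iterators keep the old shape: real CPUs first, then the pseudo slot WORK_CPU_UNBOUND that hosts unbound workqueues, with WORK_CPU_END (instead of WORK_CPU_NONE) as the terminator. A simplified userspace model of __next_wq_cpu() (fixed four-CPU "mask"; the sw bits mean "walk real CPUs" and "also yield the unbound slot"; all names are stand-ins for the kernel's):

#include <stdio.h>

enum { NR_CPUS = 4, WORK_CPU_UNBOUND = NR_CPUS, WORK_CPU_END = NR_CPUS + 1 };

static int next_wq_cpu(int cpu, unsigned int sw)
{
	if (cpu < NR_CPUS) {
		if (sw & 1) {
			cpu++;				/* cpumask_next() on a full mask */
			if (cpu < NR_CPUS)
				return cpu;
		}
		if (sw & 2)
			return WORK_CPU_UNBOUND;
	}
	return WORK_CPU_END;
}

int main(void)
{
	/* sw == 3 mirrors for_each_wq_cpu(): possible CPUs + the unbound slot */
	for (int cpu = next_wq_cpu(-1, 3); cpu < WORK_CPU_END; cpu = next_wq_cpu(cpu, 3))
		printf("%d ", cpu);			/* prints: 0 1 2 3 4 */
	printf("\n");
	return 0;
}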
@@ -459,57 +425,69 @@ static LIST_HEAD(workqueues);
459static bool workqueue_freezing; /* W: have wqs started freezing? */ 425static bool workqueue_freezing; /* W: have wqs started freezing? */
460 426
461/* 427/*
462 * The almighty global cpu workqueues. nr_running is the only field 428 * The CPU and unbound standard worker pools. The unbound ones have
463 * which is expected to be used frequently by other cpus via 429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
464 * try_to_wake_up(). Put it in a separate cacheline.
465 */ 430 */
466static DEFINE_PER_CPU(struct global_cwq, global_cwq); 431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
467static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); 432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
468 434
469/* 435/* idr of all pools */
470 * Global cpu workqueue and nr_running counter for unbound gcwq. The 436static DEFINE_MUTEX(worker_pool_idr_mutex);
471 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its 437static DEFINE_IDR(worker_pool_idr);
472 * workers have WORKER_UNBOUND set.
473 */
474static struct global_cwq unbound_global_cwq;
475static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
476 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
477};
478 438
479static int worker_thread(void *__worker); 439static int worker_thread(void *__worker);
480 440
481static int worker_pool_pri(struct worker_pool *pool) 441static struct worker_pool *std_worker_pools(int cpu)
482{ 442{
483 return pool - pool->gcwq->pools; 443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
484} 447}
485 448
486static struct global_cwq *get_gcwq(unsigned int cpu) 449static int std_worker_pool_pri(struct worker_pool *pool)
487{ 450{
488 if (cpu != WORK_CPU_UNBOUND) 451 return pool - std_worker_pools(pool->cpu);
489 return &per_cpu(global_cwq, cpu);
490 else
491 return &unbound_global_cwq;
492} 452}
493 453
494static atomic_t *get_pool_nr_running(struct worker_pool *pool) 454/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool)
495{ 456{
496 int cpu = pool->gcwq->cpu; 457 int ret;
497 int idx = worker_pool_pri(pool);
498 458
499 if (cpu != WORK_CPU_UNBOUND) 459 mutex_lock(&worker_pool_idr_mutex);
500 return &per_cpu(pool_nr_running, cpu)[idx]; 460 idr_pre_get(&worker_pool_idr, GFP_KERNEL);
501 else 461 ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
502 return &unbound_pool_nr_running[idx]; 462 mutex_unlock(&worker_pool_idr_mutex);
463
464 return ret;
503} 465}
504 466
505static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 467/*
506 struct workqueue_struct *wq) 468 * Lookup worker_pool by id. The idr currently is built during boot and
469 * never modified. Don't worry about locking for now.
470 */
471static struct worker_pool *worker_pool_by_id(int pool_id)
472{
473 return idr_find(&worker_pool_idr, pool_id);
474}
475
476static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
477{
478 struct worker_pool *pools = std_worker_pools(cpu);
479
480 return &pools[highpri];
481}
482
483static struct pool_workqueue *get_pwq(unsigned int cpu,
484 struct workqueue_struct *wq)
507{ 485{
508 if (!(wq->flags & WQ_UNBOUND)) { 486 if (!(wq->flags & WQ_UNBOUND)) {
509 if (likely(cpu < nr_cpu_ids)) 487 if (likely(cpu < nr_cpu_ids))
510 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 488 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
511 } else if (likely(cpu == WORK_CPU_UNBOUND)) 489 } else if (likely(cpu == WORK_CPU_UNBOUND))
512 return wq->cpu_wq.single; 490 return wq->pool_wq.single;
513 return NULL; 491 return NULL;
514} 492}
515 493
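worker_pool_assign_id() above gives every pool a small integer ID in worker_pool_idr so that work->data can later name a pool by number rather than by CPU. It uses the two-step idr interface of this era; a typical caller of that interface retries when idr_get_new() reports -EAGAIN, roughly as sketched below (the pools here are registered once at boot with GFP_KERNEL, so the patch skips the retry). Sketch only, not part of the patch; this preload/get pair was later replaced by idr_alloc():

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/mutex.h>

static int assign_id(struct idr *idr, struct mutex *lock, void *ptr, int *id)
{
	int ret;

	do {
		if (!idr_pre_get(idr, GFP_KERNEL))	/* preload idr layer nodes */
			return -ENOMEM;
		mutex_lock(lock);
		ret = idr_get_new(idr, ptr, id);	/* 0 on success, -EAGAIN or -ENOSPC */
		mutex_unlock(lock);
	} while (ret == -EAGAIN);

	return ret;
}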
@@ -530,19 +508,19 @@ static int work_next_color(int color)
530} 508}
531 509
532/* 510/*
533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data 511 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
534 * contain the pointer to the queued cwq. Once execution starts, the flag 512 * contain the pointer to the queued pwq. Once execution starts, the flag
535 * is cleared and the high bits contain OFFQ flags and CPU number. 513 * is cleared and the high bits contain OFFQ flags and pool ID.
536 * 514 *
537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() 515 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
538 * and clear_work_data() can be used to set the cwq, cpu or clear 516 * and clear_work_data() can be used to set the pwq, pool or clear
539 * work->data. These functions should only be called while the work is 517 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set. 518 * owned - ie. while the PENDING bit is set.
541 * 519 *
542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to 520 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
543 * a work. gcwq is available once the work has been queued anywhere after 521 * corresponding to a work. Pool is available once the work has been
544 * initialization until it is sync canceled. cwq is available only while 522 * queued anywhere after initialization until it is sync canceled. pwq is
545 * the work item is queued. 523 * available only while the work item is queued.
546 * 524 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 525 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set 526 * canceled. While being canceled, a work item may have its PENDING set
@@ -556,16 +534,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,
556 atomic_long_set(&work->data, data | flags | work_static(work)); 534 atomic_long_set(&work->data, data | flags | work_static(work));
557} 535}
558 536
559static void set_work_cwq(struct work_struct *work, 537static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
560 struct cpu_workqueue_struct *cwq,
561 unsigned long extra_flags) 538 unsigned long extra_flags)
562{ 539{
563 set_work_data(work, (unsigned long)cwq, 540 set_work_data(work, (unsigned long)pwq,
564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 541 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
565} 542}
566 543
567static void set_work_cpu_and_clear_pending(struct work_struct *work, 544static void set_work_pool_and_keep_pending(struct work_struct *work,
568 unsigned int cpu) 545 int pool_id)
546{
547 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
548 WORK_STRUCT_PENDING);
549}
550
551static void set_work_pool_and_clear_pending(struct work_struct *work,
552 int pool_id)
569{ 553{
570 /* 554 /*
571 * The following wmb is paired with the implied mb in 555 * The following wmb is paired with the implied mb in
@@ -574,67 +558,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,
574 * owner. 558 * owner.
575 */ 559 */
576 smp_wmb(); 560 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); 561 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
578} 562}
579 563
580static void clear_work_data(struct work_struct *work) 564static void clear_work_data(struct work_struct *work)
581{ 565{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */ 566 smp_wmb(); /* see set_work_pool_and_clear_pending() */
583 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 567 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
584} 568}
585 569
586static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) 570static struct pool_workqueue *get_work_pwq(struct work_struct *work)
587{ 571{
588 unsigned long data = atomic_long_read(&work->data); 572 unsigned long data = atomic_long_read(&work->data);
589 573
590 if (data & WORK_STRUCT_CWQ) 574 if (data & WORK_STRUCT_PWQ)
591 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); 575 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
592 else 576 else
593 return NULL; 577 return NULL;
594} 578}
595 579
596static struct global_cwq *get_work_gcwq(struct work_struct *work) 580/**
581 * get_work_pool - return the worker_pool a given work was associated with
582 * @work: the work item of interest
583 *
584 * Return the worker_pool @work was last associated with. %NULL if none.
585 */
586static struct worker_pool *get_work_pool(struct work_struct *work)
597{ 587{
598 unsigned long data = atomic_long_read(&work->data); 588 unsigned long data = atomic_long_read(&work->data);
599 unsigned int cpu; 589 struct worker_pool *pool;
590 int pool_id;
600 591
601 if (data & WORK_STRUCT_CWQ) 592 if (data & WORK_STRUCT_PWQ)
602 return ((struct cpu_workqueue_struct *) 593 return ((struct pool_workqueue *)
603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 594 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
604 595
605 cpu = data >> WORK_OFFQ_CPU_SHIFT; 596 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
606 if (cpu == WORK_CPU_NONE) 597 if (pool_id == WORK_OFFQ_POOL_NONE)
607 return NULL; 598 return NULL;
608 599
609 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); 600 pool = worker_pool_by_id(pool_id);
610 return get_gcwq(cpu); 601 WARN_ON_ONCE(!pool);
602 return pool;
603}
604
605/**
606 * get_work_pool_id - return the worker pool ID a given work is associated with
607 * @work: the work item of interest
608 *
609 * Return the worker_pool ID @work was last associated with.
610 * %WORK_OFFQ_POOL_NONE if none.
611 */
612static int get_work_pool_id(struct work_struct *work)
613{
614 unsigned long data = atomic_long_read(&work->data);
615
616 if (data & WORK_STRUCT_PWQ)
617 return ((struct pool_workqueue *)
618 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
619
620 return data >> WORK_OFFQ_POOL_SHIFT;
611} 621}
612 622
613static void mark_work_canceling(struct work_struct *work) 623static void mark_work_canceling(struct work_struct *work)
614{ 624{
615 struct global_cwq *gcwq = get_work_gcwq(work); 625 unsigned long pool_id = get_work_pool_id(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617 626
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, 627 pool_id <<= WORK_OFFQ_POOL_SHIFT;
619 WORK_STRUCT_PENDING); 628 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
620} 629}
621 630
622static bool work_is_canceling(struct work_struct *work) 631static bool work_is_canceling(struct work_struct *work)
623{ 632{
624 unsigned long data = atomic_long_read(&work->data); 633 unsigned long data = atomic_long_read(&work->data);
625 634
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); 635 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
627} 636}
628 637
629/* 638/*
630 * Policy functions. These define the policies on how the global worker 639 * Policy functions. These define the policies on how the global worker
631 * pools are managed. Unless noted otherwise, these functions assume that 640 * pools are managed. Unless noted otherwise, these functions assume that
632 * they're being called with gcwq->lock held. 641 * they're being called with pool->lock held.
633 */ 642 */
634 643
635static bool __need_more_worker(struct worker_pool *pool) 644static bool __need_more_worker(struct worker_pool *pool)
636{ 645{
637 return !atomic_read(get_pool_nr_running(pool)); 646 return !atomic_read(&pool->nr_running);
638} 647}
639 648
640/* 649/*
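With the gcwq gone, the off-queue half of work->data can no longer carry a CPU number, so get_work_pool() and get_work_pool_id() above decode it as a pool ID instead: while WORK_STRUCT_PWQ is set, the word is an aligned pool_workqueue pointer; otherwise the bits above WORK_OFFQ_POOL_SHIFT hold pool->id (or WORK_OFFQ_POOL_NONE). A toy userspace model of that tagged-word encoding (the flag values and shift are stand-ins, not the kernel's constants):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum {
	F_PENDING  = 1 << 0,
	F_PWQ      = 1 << 2,
	FLAG_BITS  = 8,			/* stand-in for WORK_STRUCT_FLAG_BITS */
	POOL_SHIFT = FLAG_BITS		/* stand-in for WORK_OFFQ_POOL_SHIFT */
};

/* pwqs are aligned so their low FLAG_BITS bits are free to hold flags */
struct pwq { int dummy; } __attribute__((aligned(1 << FLAG_BITS)));

static uintptr_t encode_pwq(struct pwq *pwq) { return (uintptr_t)pwq | F_PWQ | F_PENDING; }
static uintptr_t encode_pool(int id)         { return (uintptr_t)id << POOL_SHIFT; }

int main(void)
{
	static struct pwq pwq;
	uintptr_t data;

	data = encode_pwq(&pwq);	/* queued: pointer + flags */
	assert((data & F_PWQ) &&
	       (void *)(data & ~(uintptr_t)((1 << FLAG_BITS) - 1)) == &pwq);

	data = encode_pool(42);		/* off queue: pool ID in the high bits */
	assert(!(data & F_PWQ) && (int)(data >> POOL_SHIFT) == 42);

	printf("encoding ok\n");
	return 0;
}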
@@ -642,7 +651,7 @@ static bool __need_more_worker(struct worker_pool *pool)
642 * running workers. 651 * running workers.
643 * 652 *
644 * Note that, because unbound workers never contribute to nr_running, this 653 * Note that, because unbound workers never contribute to nr_running, this
645 * function will always return %true for unbound gcwq as long as the 654 * function will always return %true for unbound pools as long as the
646 * worklist isn't empty. 655 * worklist isn't empty.
647 */ 656 */
648static bool need_more_worker(struct worker_pool *pool) 657static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +668,8 @@ static bool may_start_working(struct worker_pool *pool)
659/* Do I need to keep working? Called from currently running workers. */ 668/* Do I need to keep working? Called from currently running workers. */
660static bool keep_working(struct worker_pool *pool) 669static bool keep_working(struct worker_pool *pool)
661{ 670{
662 atomic_t *nr_running = get_pool_nr_running(pool); 671 return !list_empty(&pool->worklist) &&
663 672 atomic_read(&pool->nr_running) <= 1;
664 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
665} 673}
666 674
667/* Do we need a new worker? Called from manager. */ 675/* Do we need a new worker? Called from manager. */
@@ -714,7 +722,7 @@ static struct worker *first_worker(struct worker_pool *pool)
714 * Wake up the first idle worker of @pool. 722 * Wake up the first idle worker of @pool.
715 * 723 *
716 * CONTEXT: 724 * CONTEXT:
717 * spin_lock_irq(gcwq->lock). 725 * spin_lock_irq(pool->lock).
718 */ 726 */
719static void wake_up_worker(struct worker_pool *pool) 727static void wake_up_worker(struct worker_pool *pool)
720{ 728{
@@ -740,8 +748,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
740 struct worker *worker = kthread_data(task); 748 struct worker *worker = kthread_data(task);
741 749
742 if (!(worker->flags & WORKER_NOT_RUNNING)) { 750 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); 751 WARN_ON_ONCE(worker->pool->cpu != cpu);
744 atomic_inc(get_pool_nr_running(worker->pool)); 752 atomic_inc(&worker->pool->nr_running);
745 } 753 }
746} 754}
747 755
@@ -764,12 +772,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
764 unsigned int cpu) 772 unsigned int cpu)
765{ 773{
766 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 774 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
767 struct worker_pool *pool = worker->pool; 775 struct worker_pool *pool;
768 atomic_t *nr_running = get_pool_nr_running(pool);
769 776
777 /*
778 * Rescuers, which may not have all the fields set up like normal
779 * workers, also reach here, let's not access anything before
780 * checking NOT_RUNNING.
781 */
770 if (worker->flags & WORKER_NOT_RUNNING) 782 if (worker->flags & WORKER_NOT_RUNNING)
771 return NULL; 783 return NULL;
772 784
785 pool = worker->pool;
786
773 /* this can only happen on the local cpu */ 787 /* this can only happen on the local cpu */
774 BUG_ON(cpu != raw_smp_processor_id()); 788 BUG_ON(cpu != raw_smp_processor_id());
775 789
@@ -781,10 +795,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
781 * NOT_RUNNING is clear. This means that we're bound to and 795 * NOT_RUNNING is clear. This means that we're bound to and
782 * running on the local cpu w/ rq lock held and preemption 796 * running on the local cpu w/ rq lock held and preemption
783 * disabled, which in turn means that none else could be 797 * disabled, which in turn means that none else could be
784 * manipulating idle_list, so dereferencing idle_list without gcwq 798 * manipulating idle_list, so dereferencing idle_list without pool
785 * lock is safe. 799 * lock is safe.
786 */ 800 */
787 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) 801 if (atomic_dec_and_test(&pool->nr_running) &&
802 !list_empty(&pool->worklist))
788 to_wakeup = first_worker(pool); 803 to_wakeup = first_worker(pool);
789 return to_wakeup ? to_wakeup->task : NULL; 804 return to_wakeup ? to_wakeup->task : NULL;
790} 805}
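nr_running now lives in the pool itself, on its own cacheline because try_to_wake_up() pokes it from other CPUs, and the two scheduler hooks keep it honest: wq_worker_waking_up() increments it, and wq_worker_sleeping() decrements it and hands back the first idle worker's task if the pool would otherwise sit on pending work with nothing runnable. The policy this counter feeds is small enough to model in a few lines (toy types, mirroring need_more_worker()/keep_working() above):

#include <stdbool.h>
#include <stdio.h>

struct pool { int nr_running; int worklist_len; };

/* Wake another worker only when nothing is running and work is pending. */
static bool need_more_worker(struct pool *p) { return p->worklist_len && p->nr_running == 0; }
/* A running worker keeps going while it is at most the last runnable one. */
static bool keep_working(struct pool *p)     { return p->worklist_len && p->nr_running <= 1; }

int main(void)
{
	struct pool p = { .nr_running = 1, .worklist_len = 3 };

	printf("running worker keeps working: %d\n", keep_working(&p));	/* 1 */
	p.nr_running = 0;	/* the worker blocked; wq_worker_sleeping() decremented */
	printf("wake another worker:          %d\n", need_more_worker(&p));	/* 1 */
	return 0;
}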
@@ -800,7 +815,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
800 * woken up. 815 * woken up.
801 * 816 *
802 * CONTEXT: 817 * CONTEXT:
803 * spin_lock_irq(gcwq->lock) 818 * spin_lock_irq(pool->lock)
804 */ 819 */
805static inline void worker_set_flags(struct worker *worker, unsigned int flags, 820static inline void worker_set_flags(struct worker *worker, unsigned int flags,
806 bool wakeup) 821 bool wakeup)
@@ -816,14 +831,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
816 */ 831 */
817 if ((flags & WORKER_NOT_RUNNING) && 832 if ((flags & WORKER_NOT_RUNNING) &&
818 !(worker->flags & WORKER_NOT_RUNNING)) { 833 !(worker->flags & WORKER_NOT_RUNNING)) {
819 atomic_t *nr_running = get_pool_nr_running(pool);
820
821 if (wakeup) { 834 if (wakeup) {
822 if (atomic_dec_and_test(nr_running) && 835 if (atomic_dec_and_test(&pool->nr_running) &&
823 !list_empty(&pool->worklist)) 836 !list_empty(&pool->worklist))
824 wake_up_worker(pool); 837 wake_up_worker(pool);
825 } else 838 } else
826 atomic_dec(nr_running); 839 atomic_dec(&pool->nr_running);
827 } 840 }
828 841
829 worker->flags |= flags; 842 worker->flags |= flags;
@@ -837,7 +850,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
837 * Clear @flags in @worker->flags and adjust nr_running accordingly. 850 * Clear @flags in @worker->flags and adjust nr_running accordingly.
838 * 851 *
839 * CONTEXT: 852 * CONTEXT:
840 * spin_lock_irq(gcwq->lock) 853 * spin_lock_irq(pool->lock)
841 */ 854 */
842static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 855static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
843{ 856{
@@ -855,87 +868,55 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
855 */ 868 */
856 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 869 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
857 if (!(worker->flags & WORKER_NOT_RUNNING)) 870 if (!(worker->flags & WORKER_NOT_RUNNING))
858 atomic_inc(get_pool_nr_running(pool)); 871 atomic_inc(&pool->nr_running);
859} 872}
860 873
861/** 874/**
862 * busy_worker_head - return the busy hash head for a work 875 * find_worker_executing_work - find worker which is executing a work
863 * @gcwq: gcwq of interest 876 * @pool: pool of interest
864 * @work: work to be hashed
865 *
866 * Return hash head of @gcwq for @work.
867 *
868 * CONTEXT:
869 * spin_lock_irq(gcwq->lock).
870 *
871 * RETURNS:
872 * Pointer to the hash head.
873 */
874static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
875 struct work_struct *work)
876{
877 const int base_shift = ilog2(sizeof(struct work_struct));
878 unsigned long v = (unsigned long)work;
879
880 /* simple shift and fold hash, do we need something better? */
881 v >>= base_shift;
882 v += v >> BUSY_WORKER_HASH_ORDER;
883 v &= BUSY_WORKER_HASH_MASK;
884
885 return &gcwq->busy_hash[v];
886}
887
888/**
889 * __find_worker_executing_work - find worker which is executing a work
890 * @gcwq: gcwq of interest
891 * @bwh: hash head as returned by busy_worker_head()
892 * @work: work to find worker for 877 * @work: work to find worker for
893 * 878 *
894 * Find a worker which is executing @work on @gcwq. @bwh should be 879 * Find a worker which is executing @work on @pool by searching
895 * the hash head obtained by calling busy_worker_head() with the same 880 * @pool->busy_hash which is keyed by the address of @work. For a worker
896 * work. 881 * to match, its current execution should match the address of @work and
882 * its work function. This is to avoid unwanted dependency between
883 * unrelated work executions through a work item being recycled while still
884 * being executed.
885 *
886 * This is a bit tricky. A work item may be freed once its execution
887 * starts and nothing prevents the freed area from being recycled for
888 * another work item. If the same work item address ends up being reused
889 * before the original execution finishes, workqueue will identify the
890 * recycled work item as currently executing and make it wait until the
891 * current execution finishes, introducing an unwanted dependency.
892 *
893 * This function checks the work item address, work function and workqueue
894 * to avoid false positives. Note that this isn't complete as one may
895 * construct a work function which can introduce dependency onto itself
896 * through a recycled work item. Well, if somebody wants to shoot oneself
897 * in the foot that badly, there's only so much we can do, and if such
898 * deadlock actually occurs, it should be easy to locate the culprit work
899 * function.
897 * 900 *
898 * CONTEXT: 901 * CONTEXT:
899 * spin_lock_irq(gcwq->lock). 902 * spin_lock_irq(pool->lock).
900 * 903 *
901 * RETURNS: 904 * RETURNS:
902 * Pointer to worker which is executing @work if found, NULL 905 * Pointer to worker which is executing @work if found, NULL
903 * otherwise. 906 * otherwise.
904 */ 907 */
905static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, 908static struct worker *find_worker_executing_work(struct worker_pool *pool,
906 struct hlist_head *bwh, 909 struct work_struct *work)
907 struct work_struct *work)
908{ 910{
909 struct worker *worker; 911 struct worker *worker;
910 struct hlist_node *tmp;
911 912
912 hlist_for_each_entry(worker, tmp, bwh, hentry) 913 hash_for_each_possible(pool->busy_hash, worker, hentry,
913 if (worker->current_work == work) 914 (unsigned long)work)
915 if (worker->current_work == work &&
916 worker->current_func == work->func)
914 return worker; 917 return worker;
915 return NULL;
916}
917 918
918/** 919 return NULL;
919 * find_worker_executing_work - find worker which is executing a work
920 * @gcwq: gcwq of interest
921 * @work: work to find worker for
922 *
923 * Find a worker which is executing @work on @gcwq. This function is
924 * identical to __find_worker_executing_work() except that this
925 * function calculates @bwh itself.
926 *
927 * CONTEXT:
928 * spin_lock_irq(gcwq->lock).
929 *
930 * RETURNS:
931 * Pointer to worker which is executing @work if found, NULL
932 * otherwise.
933 */
934static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
935 struct work_struct *work)
936{
937 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
938 work);
939} 920}
940 921
941/** 922/**
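find_worker_executing_work() now keys the lookup directly on the work item's address via the generic hashtable and, to guard against a freed-and-recycled work item landing at the same address, also requires the worker's current_func to match. A sketch of the insertion side this lookup pairs with, assuming it happens where a worker picks up a work item (as in process_one_work(), which is not shown in this hunk):

/* Sketch only: file @worker under the address of the work it is about to
 * execute, matching the hash_for_each_possible() lookup above. */
static void note_busy(struct worker_pool *pool, struct worker *worker,
		      struct work_struct *work)
{
	worker->current_work = work;
	worker->current_func = work->func;
	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
}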
@@ -953,7 +934,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
953 * nested inside outer list_for_each_entry_safe(). 934 * nested inside outer list_for_each_entry_safe().
954 * 935 *
955 * CONTEXT: 936 * CONTEXT:
956 * spin_lock_irq(gcwq->lock). 937 * spin_lock_irq(pool->lock).
957 */ 938 */
958static void move_linked_works(struct work_struct *work, struct list_head *head, 939static void move_linked_works(struct work_struct *work, struct list_head *head,
959 struct work_struct **nextp) 940 struct work_struct **nextp)
@@ -979,67 +960,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
979 *nextp = n; 960 *nextp = n;
980} 961}
981 962
982static void cwq_activate_delayed_work(struct work_struct *work) 963static void pwq_activate_delayed_work(struct work_struct *work)
983{ 964{
984 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 965 struct pool_workqueue *pwq = get_work_pwq(work);
985 966
986 trace_workqueue_activate_work(work); 967 trace_workqueue_activate_work(work);
987 move_linked_works(work, &cwq->pool->worklist, NULL); 968 move_linked_works(work, &pwq->pool->worklist, NULL);
988 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 969 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
989 cwq->nr_active++; 970 pwq->nr_active++;
990} 971}
991 972
992static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) 973static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
993{ 974{
994 struct work_struct *work = list_first_entry(&cwq->delayed_works, 975 struct work_struct *work = list_first_entry(&pwq->delayed_works,
995 struct work_struct, entry); 976 struct work_struct, entry);
996 977
997 cwq_activate_delayed_work(work); 978 pwq_activate_delayed_work(work);
998} 979}
999 980
1000/** 981/**
1001 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight 982 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1002 * @cwq: cwq of interest 983 * @pwq: pwq of interest
1003 * @color: color of work which left the queue 984 * @color: color of work which left the queue
1004 * 985 *
1005 * A work either has completed or is removed from pending queue, 986 * A work either has completed or is removed from pending queue,
1006 * decrement nr_in_flight of its cwq and handle workqueue flushing. 987 * decrement nr_in_flight of its pwq and handle workqueue flushing.
1007 * 988 *
1008 * CONTEXT: 989 * CONTEXT:
1009 * spin_lock_irq(gcwq->lock). 990 * spin_lock_irq(pool->lock).
1010 */ 991 */
1011static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) 992static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1012{ 993{
1013 /* ignore uncolored works */ 994 /* ignore uncolored works */
1014 if (color == WORK_NO_COLOR) 995 if (color == WORK_NO_COLOR)
1015 return; 996 return;
1016 997
1017 cwq->nr_in_flight[color]--; 998 pwq->nr_in_flight[color]--;
1018 999
1019 cwq->nr_active--; 1000 pwq->nr_active--;
1020 if (!list_empty(&cwq->delayed_works)) { 1001 if (!list_empty(&pwq->delayed_works)) {
1021 /* one down, submit a delayed one */ 1002 /* one down, submit a delayed one */
1022 if (cwq->nr_active < cwq->max_active) 1003 if (pwq->nr_active < pwq->max_active)
1023 cwq_activate_first_delayed(cwq); 1004 pwq_activate_first_delayed(pwq);
1024 } 1005 }
1025 1006
1026 /* is flush in progress and are we at the flushing tip? */ 1007 /* is flush in progress and are we at the flushing tip? */
1027 if (likely(cwq->flush_color != color)) 1008 if (likely(pwq->flush_color != color))
1028 return; 1009 return;
1029 1010
1030 /* are there still in-flight works? */ 1011 /* are there still in-flight works? */
1031 if (cwq->nr_in_flight[color]) 1012 if (pwq->nr_in_flight[color])
1032 return; 1013 return;
1033 1014
1034 /* this cwq is done, clear flush_color */ 1015 /* this pwq is done, clear flush_color */
1035 cwq->flush_color = -1; 1016 pwq->flush_color = -1;
1036 1017
1037 /* 1018 /*
1038 * If this was the last cwq, wake up the first flusher. It 1019 * If this was the last pwq, wake up the first flusher. It
1039 * will handle the rest. 1020 * will handle the rest.
1040 */ 1021 */
1041 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) 1022 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1042 complete(&cwq->wq->first_flusher->done); 1023 complete(&pwq->wq->first_flusher->done);
1043} 1024}
1044 1025
1045/** 1026/**
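pwq_dec_nr_in_flight() is the pre-existing color-based flush bookkeeping with cwq renamed to pwq: each pwq counts in-flight works per color, and only when the color being flushed drains on the last pwq is the first flusher completed. A toy userspace model of that handshake (two pwqs, two colors, stand-in types):

#include <stdbool.h>
#include <stdio.h>

struct toy_pwq { int nr_in_flight[2]; int flush_color; };
struct toy_wq  { int nr_pwqs_to_flush; bool flusher_done; };

static void dec_nr_in_flight(struct toy_wq *wq, struct toy_pwq *pwq, int color)
{
	pwq->nr_in_flight[color]--;
	if (pwq->flush_color != color || pwq->nr_in_flight[color])
		return;
	pwq->flush_color = -1;				/* this pwq is done flushing */
	if (--wq->nr_pwqs_to_flush == 0)
		wq->flusher_done = true;		/* complete(&first_flusher->done) */
}

int main(void)
{
	struct toy_pwq a = { .nr_in_flight = {2, 0}, .flush_color = 0 };
	struct toy_pwq b = { .nr_in_flight = {1, 0}, .flush_color = 0 };
	struct toy_wq  wq = { .nr_pwqs_to_flush = 2 };

	dec_nr_in_flight(&wq, &a, 0);
	dec_nr_in_flight(&wq, &b, 0);
	dec_nr_in_flight(&wq, &a, 0);	/* last in-flight work of the flush color */
	printf("flush complete: %d\n", wq.flusher_done);	/* 1 */
	return 0;
}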
@@ -1070,7 +1051,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1070static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 1051static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1071 unsigned long *flags) 1052 unsigned long *flags)
1072{ 1053{
1073 struct global_cwq *gcwq; 1054 struct worker_pool *pool;
1055 struct pool_workqueue *pwq;
1074 1056
1075 local_irq_save(*flags); 1057 local_irq_save(*flags);
1076 1058
@@ -1095,41 +1077,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1095 * The queueing is in progress, or it is already queued. Try to 1077 * The queueing is in progress, or it is already queued. Try to
1096 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 1078 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1097 */ 1079 */
1098 gcwq = get_work_gcwq(work); 1080 pool = get_work_pool(work);
1099 if (!gcwq) 1081 if (!pool)
1100 goto fail; 1082 goto fail;
1101 1083
1102 spin_lock(&gcwq->lock); 1084 spin_lock(&pool->lock);
1103 if (!list_empty(&work->entry)) { 1085 /*
1086 * work->data is guaranteed to point to pwq only while the work
1087 * item is queued on pwq->wq, and both updating work->data to point
1088 * to pwq on queueing and to pool on dequeueing are done under
1089 * pwq->pool->lock. This in turn guarantees that, if work->data
1090 * points to pwq which is associated with a locked pool, the work
1091 * item is currently queued on that pool.
1092 */
1093 pwq = get_work_pwq(work);
1094 if (pwq && pwq->pool == pool) {
1095 debug_work_deactivate(work);
1096
1104 /* 1097 /*
1105 * This work is queued, but perhaps we locked the wrong gcwq. 1098 * A delayed work item cannot be grabbed directly because
1106 * In that case we must see the new value after rmb(), see 1099 * it might have linked NO_COLOR work items which, if left
1107 * insert_work()->wmb(). 1100 * on the delayed_list, will confuse pwq->nr_active
1101 * management later on and cause stall. Make sure the work
1102 * item is activated before grabbing.
1108 */ 1103 */
1109 smp_rmb(); 1104 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1110 if (gcwq == get_work_gcwq(work)) { 1105 pwq_activate_delayed_work(work);
1111 debug_work_deactivate(work);
1112 1106
1113 /* 1107 list_del_init(&work->entry);
1114 * A delayed work item cannot be grabbed directly 1108 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
1115 * because it might have linked NO_COLOR work items
1116 * which, if left on the delayed_list, will confuse
1117 * cwq->nr_active management later on and cause
1118 * stall. Make sure the work item is activated
1119 * before grabbing.
1120 */
1121 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1122 cwq_activate_delayed_work(work);
1123 1109
1124 list_del_init(&work->entry); 1110 /* work->data points to pwq iff queued, point to pool */
1125 cwq_dec_nr_in_flight(get_work_cwq(work), 1111 set_work_pool_and_keep_pending(work, pool->id);
1126 get_work_color(work));
1127 1112
1128 spin_unlock(&gcwq->lock); 1113 spin_unlock(&pool->lock);
1129 return 1; 1114 return 1;
1130 }
1131 } 1115 }
1132 spin_unlock(&gcwq->lock); 1116 spin_unlock(&pool->lock);
1133fail: 1117fail:
1134 local_irq_restore(*flags); 1118 local_irq_restore(*flags);
1135 if (work_is_canceling(work)) 1119 if (work_is_canceling(work))
@@ -1139,33 +1123,25 @@ fail:
1139} 1123}
1140 1124
1141/** 1125/**
1142 * insert_work - insert a work into gcwq 1126 * insert_work - insert a work into a pool
1143 * @cwq: cwq @work belongs to 1127 * @pwq: pwq @work belongs to
1144 * @work: work to insert 1128 * @work: work to insert
1145 * @head: insertion point 1129 * @head: insertion point
1146 * @extra_flags: extra WORK_STRUCT_* flags to set 1130 * @extra_flags: extra WORK_STRUCT_* flags to set
1147 * 1131 *
1148 * Insert @work which belongs to @cwq into @gcwq after @head. 1132 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
1149 * @extra_flags is or'd to work_struct flags. 1133 * work_struct flags.
1150 * 1134 *
1151 * CONTEXT: 1135 * CONTEXT:
1152 * spin_lock_irq(gcwq->lock). 1136 * spin_lock_irq(pool->lock).
1153 */ 1137 */
1154static void insert_work(struct cpu_workqueue_struct *cwq, 1138static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1155 struct work_struct *work, struct list_head *head, 1139 struct list_head *head, unsigned int extra_flags)
1156 unsigned int extra_flags)
1157{ 1140{
1158 struct worker_pool *pool = cwq->pool; 1141 struct worker_pool *pool = pwq->pool;
1159 1142
1160 /* we own @work, set data and link */ 1143 /* we own @work, set data and link */
1161 set_work_cwq(work, cwq, extra_flags); 1144 set_work_pwq(work, pwq, extra_flags);
1162
1163 /*
1164 * Ensure that we get the right work->data if we see the
1165 * result of list_add() below, see try_to_grab_pending().
1166 */
1167 smp_wmb();
1168
1169 list_add_tail(&work->entry, head); 1145 list_add_tail(&work->entry, head);
1170 1146
1171 /* 1147 /*
@@ -1181,41 +1157,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1181 1157
1182/* 1158/*
1183 * Test whether @work is being queued from another work executing on the 1159 * Test whether @work is being queued from another work executing on the
1184 * same workqueue. This is rather expensive and should only be used from 1160 * same workqueue.
1185 * cold paths.
1186 */ 1161 */
1187static bool is_chained_work(struct workqueue_struct *wq) 1162static bool is_chained_work(struct workqueue_struct *wq)
1188{ 1163{
1189 unsigned long flags; 1164 struct worker *worker;
1190 unsigned int cpu;
1191
1192 for_each_gcwq_cpu(cpu) {
1193 struct global_cwq *gcwq = get_gcwq(cpu);
1194 struct worker *worker;
1195 struct hlist_node *pos;
1196 int i;
1197 1165
1198 spin_lock_irqsave(&gcwq->lock, flags); 1166 worker = current_wq_worker();
1199 for_each_busy_worker(worker, i, pos, gcwq) { 1167 /*
 1200			if (worker->task != current)	 1168	 * Return %true iff I'm a worker executing a work item on @wq.  If
1201 continue; 1169 * I'm @worker, it's safe to dereference it without locking.
1202 spin_unlock_irqrestore(&gcwq->lock, flags); 1170 */
1203 /* 1171 return worker && worker->current_pwq->wq == wq;
1204 * I'm @worker, no locking necessary. See if @work
1205 * is headed to the same workqueue.
1206 */
1207 return worker->current_cwq->wq == wq;
1208 }
1209 spin_unlock_irqrestore(&gcwq->lock, flags);
1210 }
1211 return false;
1212} 1172}
1213 1173
1214static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1174static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1215 struct work_struct *work) 1175 struct work_struct *work)
1216{ 1176{
1217 struct global_cwq *gcwq; 1177 struct pool_workqueue *pwq;
1218 struct cpu_workqueue_struct *cwq;
1219 struct list_head *worklist; 1178 struct list_head *worklist;
1220 unsigned int work_flags; 1179 unsigned int work_flags;
1221 unsigned int req_cpu = cpu; 1180 unsigned int req_cpu = cpu;
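is_chained_work() above no longer scans every busy hash on every CPU: a work item can only be chained from the worker that is currently executing the originating work, so it is enough to ask whether current is a workqueue worker and whether that worker's current_pwq belongs to @wq. The helper it leans on, current_wq_worker(), lives in workqueue_internal.h outside this diff and presumably amounts to the usual PF_WQ_WORKER/kthread_data() check, roughly:

/* Sketch of the helper assumed above, not part of this hunk. */
static inline struct worker *current_wq_worker(void)
{
	if (current->flags & PF_WQ_WORKER)
		return kthread_data(current);
	return NULL;
}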
@@ -1235,9 +1194,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1235 WARN_ON_ONCE(!is_chained_work(wq))) 1194 WARN_ON_ONCE(!is_chained_work(wq)))
1236 return; 1195 return;
1237 1196
1238 /* determine gcwq to use */ 1197 /* determine the pwq to use */
1239 if (!(wq->flags & WQ_UNBOUND)) { 1198 if (!(wq->flags & WQ_UNBOUND)) {
1240 struct global_cwq *last_gcwq; 1199 struct worker_pool *last_pool;
1241 1200
1242 if (cpu == WORK_CPU_UNBOUND) 1201 if (cpu == WORK_CPU_UNBOUND)
1243 cpu = raw_smp_processor_id(); 1202 cpu = raw_smp_processor_id();
@@ -1248,55 +1207,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1248 * work needs to be queued on that cpu to guarantee 1207 * work needs to be queued on that cpu to guarantee
1249 * non-reentrancy. 1208 * non-reentrancy.
1250 */ 1209 */
1251 gcwq = get_gcwq(cpu); 1210 pwq = get_pwq(cpu, wq);
1252 last_gcwq = get_work_gcwq(work); 1211 last_pool = get_work_pool(work);
1253 1212
1254 if (last_gcwq && last_gcwq != gcwq) { 1213 if (last_pool && last_pool != pwq->pool) {
1255 struct worker *worker; 1214 struct worker *worker;
1256 1215
1257 spin_lock(&last_gcwq->lock); 1216 spin_lock(&last_pool->lock);
1258 1217
1259 worker = find_worker_executing_work(last_gcwq, work); 1218 worker = find_worker_executing_work(last_pool, work);
1260 1219
1261 if (worker && worker->current_cwq->wq == wq) 1220 if (worker && worker->current_pwq->wq == wq) {
1262 gcwq = last_gcwq; 1221 pwq = get_pwq(last_pool->cpu, wq);
1263 else { 1222 } else {
1264 /* meh... not running there, queue here */ 1223 /* meh... not running there, queue here */
1265 spin_unlock(&last_gcwq->lock); 1224 spin_unlock(&last_pool->lock);
1266 spin_lock(&gcwq->lock); 1225 spin_lock(&pwq->pool->lock);
1267 } 1226 }
1268 } else { 1227 } else {
1269 spin_lock(&gcwq->lock); 1228 spin_lock(&pwq->pool->lock);
1270 } 1229 }
1271 } else { 1230 } else {
1272 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1231 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1273 spin_lock(&gcwq->lock); 1232 spin_lock(&pwq->pool->lock);
1274 } 1233 }
1275 1234
1276 /* gcwq determined, get cwq and queue */ 1235 /* pwq determined, queue */
1277 cwq = get_cwq(gcwq->cpu, wq); 1236 trace_workqueue_queue_work(req_cpu, pwq, work);
1278 trace_workqueue_queue_work(req_cpu, cwq, work);
1279 1237
1280 if (WARN_ON(!list_empty(&work->entry))) { 1238 if (WARN_ON(!list_empty(&work->entry))) {
1281 spin_unlock(&gcwq->lock); 1239 spin_unlock(&pwq->pool->lock);
1282 return; 1240 return;
1283 } 1241 }
1284 1242
1285 cwq->nr_in_flight[cwq->work_color]++; 1243 pwq->nr_in_flight[pwq->work_color]++;
1286 work_flags = work_color_to_flags(cwq->work_color); 1244 work_flags = work_color_to_flags(pwq->work_color);
1287 1245
1288 if (likely(cwq->nr_active < cwq->max_active)) { 1246 if (likely(pwq->nr_active < pwq->max_active)) {
1289 trace_workqueue_activate_work(work); 1247 trace_workqueue_activate_work(work);
1290 cwq->nr_active++; 1248 pwq->nr_active++;
1291 worklist = &cwq->pool->worklist; 1249 worklist = &pwq->pool->worklist;
1292 } else { 1250 } else {
1293 work_flags |= WORK_STRUCT_DELAYED; 1251 work_flags |= WORK_STRUCT_DELAYED;
1294 worklist = &cwq->delayed_works; 1252 worklist = &pwq->delayed_works;
1295 } 1253 }
1296 1254
1297 insert_work(cwq, work, worklist, work_flags); 1255 insert_work(pwq, work, worklist, work_flags);
1298 1256
1299 spin_unlock(&gcwq->lock); 1257 spin_unlock(&pwq->pool->lock);
1300} 1258}
1301 1259
1302/** 1260/**
@@ -1347,19 +1305,17 @@ EXPORT_SYMBOL_GPL(queue_work);
1347void delayed_work_timer_fn(unsigned long __data) 1305void delayed_work_timer_fn(unsigned long __data)
1348{ 1306{
1349 struct delayed_work *dwork = (struct delayed_work *)__data; 1307 struct delayed_work *dwork = (struct delayed_work *)__data;
1350 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1351 1308
1352 /* should have been called from irqsafe timer with irq already off */ 1309 /* should have been called from irqsafe timer with irq already off */
1353 __queue_work(dwork->cpu, cwq->wq, &dwork->work); 1310 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1354} 1311}
1355EXPORT_SYMBOL_GPL(delayed_work_timer_fn); 1312EXPORT_SYMBOL(delayed_work_timer_fn);
1356 1313
1357static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 1314static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1358 struct delayed_work *dwork, unsigned long delay) 1315 struct delayed_work *dwork, unsigned long delay)
1359{ 1316{
1360 struct timer_list *timer = &dwork->timer; 1317 struct timer_list *timer = &dwork->timer;
1361 struct work_struct *work = &dwork->work; 1318 struct work_struct *work = &dwork->work;
1362 unsigned int lcpu;
1363 1319
1364 WARN_ON_ONCE(timer->function != delayed_work_timer_fn || 1320 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1365 timer->data != (unsigned long)dwork); 1321 timer->data != (unsigned long)dwork);
@@ -1379,30 +1335,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1379 1335
1380 timer_stats_timer_set_start_info(&dwork->timer); 1336 timer_stats_timer_set_start_info(&dwork->timer);
1381 1337
1382 /* 1338 dwork->wq = wq;
1383 * This stores cwq for the moment, for the timer_fn. Note that the
1384 * work's gcwq is preserved to allow reentrance detection for
1385 * delayed works.
1386 */
1387 if (!(wq->flags & WQ_UNBOUND)) {
1388 struct global_cwq *gcwq = get_work_gcwq(work);
1389
1390 /*
1391 * If we cannot get the last gcwq from @work directly,
1392 * select the last CPU such that it avoids unnecessarily
1393 * triggering non-reentrancy check in __queue_work().
1394 */
1395 lcpu = cpu;
1396 if (gcwq)
1397 lcpu = gcwq->cpu;
1398 if (lcpu == WORK_CPU_UNBOUND)
1399 lcpu = raw_smp_processor_id();
1400 } else {
1401 lcpu = WORK_CPU_UNBOUND;
1402 }
1403
1404 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1405
1406 dwork->cpu = cpu; 1339 dwork->cpu = cpu;
1407 timer->expires = jiffies + delay; 1340 timer->expires = jiffies + delay;
1408 1341
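/*
 * An illustrative userspace sketch of the simplification above: once
 * struct delayed_work records its target workqueue and CPU directly, the
 * timer callback no longer has to decode a cwq/pwq pointer out of
 * work->data.  The types and names here are hypothetical, not the kernel
 * API.
 */
#include <stdio.h>

struct toy_wq { const char *name; };
struct toy_work { void (*func)(struct toy_work *); };
struct toy_delayed_work {
	struct toy_work work;
	struct toy_wq *wq;	/* remembered at queue time ... */
	int cpu;		/* ... so the timer needs no decoding */
};

static void toy_queue_work(int cpu, struct toy_wq *wq, struct toy_work *work)
{
	printf("queueing work %p on %s, cpu %d\n", (void *)work, wq->name, cpu);
	work->func(work);
}

/* what the timer callback reduces to */
static void toy_timer_fn(struct toy_delayed_work *dwork)
{
	toy_queue_work(dwork->cpu, dwork->wq, &dwork->work);
}

static void print_work(struct toy_work *work)
{
	printf("ran %p\n", (void *)work);
}

int main(void)
{
	struct toy_wq wq = { "events" };
	struct toy_delayed_work dwork = { { print_work }, &wq, 0 };

	toy_timer_fn(&dwork);	/* stand-in for the timer expiring */
	return 0;
}
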
@@ -1519,12 +1452,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
1519 * necessary. 1452 * necessary.
1520 * 1453 *
1521 * LOCKING: 1454 * LOCKING:
1522 * spin_lock_irq(gcwq->lock). 1455 * spin_lock_irq(pool->lock).
1523 */ 1456 */
1524static void worker_enter_idle(struct worker *worker) 1457static void worker_enter_idle(struct worker *worker)
1525{ 1458{
1526 struct worker_pool *pool = worker->pool; 1459 struct worker_pool *pool = worker->pool;
1527 struct global_cwq *gcwq = pool->gcwq;
1528 1460
1529 BUG_ON(worker->flags & WORKER_IDLE); 1461 BUG_ON(worker->flags & WORKER_IDLE);
1530 BUG_ON(!list_empty(&worker->entry) && 1462 BUG_ON(!list_empty(&worker->entry) &&
@@ -1542,14 +1474,14 @@ static void worker_enter_idle(struct worker *worker)
1542 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1474 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1543 1475
1544 /* 1476 /*
1545 * Sanity check nr_running. Because gcwq_unbind_fn() releases 1477 * Sanity check nr_running. Because wq_unbind_fn() releases
1546 * gcwq->lock between setting %WORKER_UNBOUND and zapping 1478 * pool->lock between setting %WORKER_UNBOUND and zapping
1547 * nr_running, the warning may trigger spuriously. Check iff 1479 * nr_running, the warning may trigger spuriously. Check iff
1548 * unbind is not in progress. 1480 * unbind is not in progress.
1549 */ 1481 */
1550 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && 1482 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1551 pool->nr_workers == pool->nr_idle && 1483 pool->nr_workers == pool->nr_idle &&
1552 atomic_read(get_pool_nr_running(pool))); 1484 atomic_read(&pool->nr_running));
1553} 1485}
1554 1486
1555/** 1487/**
@@ -1559,7 +1491,7 @@ static void worker_enter_idle(struct worker *worker)
1559 * @worker is leaving idle state. Update stats. 1491 * @worker is leaving idle state. Update stats.
1560 * 1492 *
1561 * LOCKING: 1493 * LOCKING:
1562 * spin_lock_irq(gcwq->lock). 1494 * spin_lock_irq(pool->lock).
1563 */ 1495 */
1564static void worker_leave_idle(struct worker *worker) 1496static void worker_leave_idle(struct worker *worker)
1565{ 1497{
@@ -1572,7 +1504,7 @@ static void worker_leave_idle(struct worker *worker)
1572} 1504}
1573 1505
1574/** 1506/**
1575 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq 1507 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
1576 * @worker: self 1508 * @worker: self
1577 * 1509 *
1578 * Works which are scheduled while the cpu is online must at least be 1510 * Works which are scheduled while the cpu is online must at least be
@@ -1584,27 +1516,27 @@ static void worker_leave_idle(struct worker *worker)
1584 * themselves to the target cpu and may race with cpu going down or 1516 * themselves to the target cpu and may race with cpu going down or
1585 * coming online. kthread_bind() can't be used because it may put the 1517 * coming online. kthread_bind() can't be used because it may put the
1586 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1518 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1587 * verbatim as it's best effort and blocking and gcwq may be 1519 * verbatim as it's best effort and blocking and pool may be
1588 * [dis]associated in the meantime. 1520 * [dis]associated in the meantime.
1589 * 1521 *
1590 * This function tries set_cpus_allowed() and locks gcwq and verifies the 1522 * This function tries set_cpus_allowed() and locks pool and verifies the
1591 * binding against %GCWQ_DISASSOCIATED which is set during 1523 * binding against %POOL_DISASSOCIATED which is set during
1592 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker 1524 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1593 * enters idle state or fetches works without dropping lock, it can 1525 * enters idle state or fetches works without dropping lock, it can
1594 * guarantee the scheduling requirement described in the first paragraph. 1526 * guarantee the scheduling requirement described in the first paragraph.
1595 * 1527 *
1596 * CONTEXT: 1528 * CONTEXT:
1597 * Might sleep. Called without any lock but returns with gcwq->lock 1529 * Might sleep. Called without any lock but returns with pool->lock
1598 * held. 1530 * held.
1599 * 1531 *
1600 * RETURNS: 1532 * RETURNS:
1601 * %true if the associated gcwq is online (@worker is successfully 1533 * %true if the associated pool is online (@worker is successfully
1602 * bound), %false if offline. 1534 * bound), %false if offline.
1603 */ 1535 */
1604static bool worker_maybe_bind_and_lock(struct worker *worker) 1536static bool worker_maybe_bind_and_lock(struct worker *worker)
1605__acquires(&gcwq->lock) 1537__acquires(&pool->lock)
1606{ 1538{
1607 struct global_cwq *gcwq = worker->pool->gcwq; 1539 struct worker_pool *pool = worker->pool;
1608 struct task_struct *task = worker->task; 1540 struct task_struct *task = worker->task;
1609 1541
1610 while (true) { 1542 while (true) {
@@ -1612,19 +1544,19 @@ __acquires(&gcwq->lock)
1612 * The following call may fail, succeed or succeed 1544 * The following call may fail, succeed or succeed
1613 * without actually migrating the task to the cpu if 1545 * without actually migrating the task to the cpu if
1614 * it races with cpu hotunplug operation. Verify 1546 * it races with cpu hotunplug operation. Verify
1615 * against GCWQ_DISASSOCIATED. 1547 * against POOL_DISASSOCIATED.
1616 */ 1548 */
1617 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) 1549 if (!(pool->flags & POOL_DISASSOCIATED))
1618 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); 1550 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
1619 1551
1620 spin_lock_irq(&gcwq->lock); 1552 spin_lock_irq(&pool->lock);
1621 if (gcwq->flags & GCWQ_DISASSOCIATED) 1553 if (pool->flags & POOL_DISASSOCIATED)
1622 return false; 1554 return false;
1623 if (task_cpu(task) == gcwq->cpu && 1555 if (task_cpu(task) == pool->cpu &&
1624 cpumask_equal(&current->cpus_allowed, 1556 cpumask_equal(&current->cpus_allowed,
1625 get_cpu_mask(gcwq->cpu))) 1557 get_cpu_mask(pool->cpu)))
1626 return true; 1558 return true;
1627 spin_unlock_irq(&gcwq->lock); 1559 spin_unlock_irq(&pool->lock);
1628 1560
1629 /* 1561 /*
1630 * We've raced with CPU hot[un]plug. Give it a breather 1562 * We've raced with CPU hot[un]plug. Give it a breather
@@ -1643,15 +1575,13 @@ __acquires(&gcwq->lock)
1643 */ 1575 */
1644static void idle_worker_rebind(struct worker *worker) 1576static void idle_worker_rebind(struct worker *worker)
1645{ 1577{
1646 struct global_cwq *gcwq = worker->pool->gcwq;
1647
1648 /* CPU may go down again inbetween, clear UNBOUND only on success */ 1578 /* CPU may go down again inbetween, clear UNBOUND only on success */
1649 if (worker_maybe_bind_and_lock(worker)) 1579 if (worker_maybe_bind_and_lock(worker))
1650 worker_clr_flags(worker, WORKER_UNBOUND); 1580 worker_clr_flags(worker, WORKER_UNBOUND);
1651 1581
1652 /* rebind complete, become available again */ 1582 /* rebind complete, become available again */
1653 list_add(&worker->entry, &worker->pool->idle_list); 1583 list_add(&worker->entry, &worker->pool->idle_list);
1654 spin_unlock_irq(&gcwq->lock); 1584 spin_unlock_irq(&worker->pool->lock);
1655} 1585}
1656 1586
1657/* 1587/*
@@ -1663,19 +1593,18 @@ static void idle_worker_rebind(struct worker *worker)
1663static void busy_worker_rebind_fn(struct work_struct *work) 1593static void busy_worker_rebind_fn(struct work_struct *work)
1664{ 1594{
1665 struct worker *worker = container_of(work, struct worker, rebind_work); 1595 struct worker *worker = container_of(work, struct worker, rebind_work);
1666 struct global_cwq *gcwq = worker->pool->gcwq;
1667 1596
1668 if (worker_maybe_bind_and_lock(worker)) 1597 if (worker_maybe_bind_and_lock(worker))
1669 worker_clr_flags(worker, WORKER_UNBOUND); 1598 worker_clr_flags(worker, WORKER_UNBOUND);
1670 1599
1671 spin_unlock_irq(&gcwq->lock); 1600 spin_unlock_irq(&worker->pool->lock);
1672} 1601}
1673 1602
1674/** 1603/**
1675 * rebind_workers - rebind all workers of a gcwq to the associated CPU 1604 * rebind_workers - rebind all workers of a pool to the associated CPU
1676 * @gcwq: gcwq of interest 1605 * @pool: pool of interest
1677 * 1606 *
1678 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1607 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1679 * is different for idle and busy ones. 1608 * is different for idle and busy ones.
1680 * 1609 *
1681 * Idle ones will be removed from the idle_list and woken up. They will 1610 * Idle ones will be removed from the idle_list and woken up. They will
@@ -1693,38 +1622,31 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1693 * including the manager will not appear on @idle_list until rebind is 1622 * including the manager will not appear on @idle_list until rebind is
1694 * complete, making local wake-ups safe. 1623 * complete, making local wake-ups safe.
1695 */ 1624 */
1696static void rebind_workers(struct global_cwq *gcwq) 1625static void rebind_workers(struct worker_pool *pool)
1697{ 1626{
1698 struct worker_pool *pool;
1699 struct worker *worker, *n; 1627 struct worker *worker, *n;
1700 struct hlist_node *pos;
1701 int i; 1628 int i;
1702 1629
1703 lockdep_assert_held(&gcwq->lock); 1630 lockdep_assert_held(&pool->assoc_mutex);
1704 1631 lockdep_assert_held(&pool->lock);
1705 for_each_worker_pool(pool, gcwq)
1706 lockdep_assert_held(&pool->assoc_mutex);
1707 1632
1708 /* dequeue and kick idle ones */ 1633 /* dequeue and kick idle ones */
1709 for_each_worker_pool(pool, gcwq) { 1634 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1710 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { 1635 /*
1711 /* 1636 * idle workers should be off @pool->idle_list until rebind
1712 * idle workers should be off @pool->idle_list 1637 * is complete to avoid receiving premature local wake-ups.
1713 * until rebind is complete to avoid receiving 1638 */
1714 * premature local wake-ups. 1639 list_del_init(&worker->entry);
1715 */
1716 list_del_init(&worker->entry);
1717 1640
1718 /* 1641 /*
1719 * worker_thread() will see the above dequeuing 1642 * worker_thread() will see the above dequeuing and call
1720 * and call idle_worker_rebind(). 1643 * idle_worker_rebind().
1721 */ 1644 */
1722 wake_up_process(worker->task); 1645 wake_up_process(worker->task);
1723 }
1724 } 1646 }
1725 1647
1726 /* rebind busy workers */ 1648 /* rebind busy workers */
1727 for_each_busy_worker(worker, i, pos, gcwq) { 1649 for_each_busy_worker(worker, i, pool) {
1728 struct work_struct *rebind_work = &worker->rebind_work; 1650 struct work_struct *rebind_work = &worker->rebind_work;
1729 struct workqueue_struct *wq; 1651 struct workqueue_struct *wq;
1730 1652
@@ -1736,16 +1658,16 @@ static void rebind_workers(struct global_cwq *gcwq)
1736 1658
1737 /* 1659 /*
1738 * wq doesn't really matter but let's keep @worker->pool 1660 * wq doesn't really matter but let's keep @worker->pool
1739 * and @cwq->pool consistent for sanity. 1661 * and @pwq->pool consistent for sanity.
1740 */ 1662 */
1741 if (worker_pool_pri(worker->pool)) 1663 if (std_worker_pool_pri(worker->pool))
1742 wq = system_highpri_wq; 1664 wq = system_highpri_wq;
1743 else 1665 else
1744 wq = system_wq; 1666 wq = system_wq;
1745 1667
1746 insert_work(get_cwq(gcwq->cpu, wq), rebind_work, 1668 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1747 worker->scheduled.next, 1669 worker->scheduled.next,
1748 work_color_to_flags(WORK_NO_COLOR)); 1670 work_color_to_flags(WORK_NO_COLOR));
1749 } 1671 }
1750} 1672}
1751 1673
@@ -1780,19 +1702,18 @@ static struct worker *alloc_worker(void)
1780 */ 1702 */
1781static struct worker *create_worker(struct worker_pool *pool) 1703static struct worker *create_worker(struct worker_pool *pool)
1782{ 1704{
1783 struct global_cwq *gcwq = pool->gcwq; 1705 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1784 const char *pri = worker_pool_pri(pool) ? "H" : "";
1785 struct worker *worker = NULL; 1706 struct worker *worker = NULL;
1786 int id = -1; 1707 int id = -1;
1787 1708
1788 spin_lock_irq(&gcwq->lock); 1709 spin_lock_irq(&pool->lock);
1789 while (ida_get_new(&pool->worker_ida, &id)) { 1710 while (ida_get_new(&pool->worker_ida, &id)) {
1790 spin_unlock_irq(&gcwq->lock); 1711 spin_unlock_irq(&pool->lock);
1791 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1712 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1792 goto fail; 1713 goto fail;
1793 spin_lock_irq(&gcwq->lock); 1714 spin_lock_irq(&pool->lock);
1794 } 1715 }
1795 spin_unlock_irq(&gcwq->lock); 1716 spin_unlock_irq(&pool->lock);
1796 1717
1797 worker = alloc_worker(); 1718 worker = alloc_worker();
1798 if (!worker) 1719 if (!worker)
@@ -1801,30 +1722,30 @@ static struct worker *create_worker(struct worker_pool *pool)
1801 worker->pool = pool; 1722 worker->pool = pool;
1802 worker->id = id; 1723 worker->id = id;
1803 1724
1804 if (gcwq->cpu != WORK_CPU_UNBOUND) 1725 if (pool->cpu != WORK_CPU_UNBOUND)
1805 worker->task = kthread_create_on_node(worker_thread, 1726 worker->task = kthread_create_on_node(worker_thread,
1806 worker, cpu_to_node(gcwq->cpu), 1727 worker, cpu_to_node(pool->cpu),
1807 "kworker/%u:%d%s", gcwq->cpu, id, pri); 1728 "kworker/%u:%d%s", pool->cpu, id, pri);
1808 else 1729 else
1809 worker->task = kthread_create(worker_thread, worker, 1730 worker->task = kthread_create(worker_thread, worker,
1810 "kworker/u:%d%s", id, pri); 1731 "kworker/u:%d%s", id, pri);
1811 if (IS_ERR(worker->task)) 1732 if (IS_ERR(worker->task))
1812 goto fail; 1733 goto fail;
1813 1734
1814 if (worker_pool_pri(pool)) 1735 if (std_worker_pool_pri(pool))
1815 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1736 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1816 1737
1817 /* 1738 /*
1818 * Determine CPU binding of the new worker depending on 1739 * Determine CPU binding of the new worker depending on
1819 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the 1740 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
1820 * flag remains stable across this function. See the comments 1741 * flag remains stable across this function. See the comments
1821 * above the flag definition for details. 1742 * above the flag definition for details.
1822 * 1743 *
1823 * As an unbound worker may later become a regular one if CPU comes 1744 * As an unbound worker may later become a regular one if CPU comes
1824 * online, make sure every worker has %PF_THREAD_BOUND set. 1745 * online, make sure every worker has %PF_THREAD_BOUND set.
1825 */ 1746 */
1826 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { 1747 if (!(pool->flags & POOL_DISASSOCIATED)) {
1827 kthread_bind(worker->task, gcwq->cpu); 1748 kthread_bind(worker->task, pool->cpu);
1828 } else { 1749 } else {
1829 worker->task->flags |= PF_THREAD_BOUND; 1750 worker->task->flags |= PF_THREAD_BOUND;
1830 worker->flags |= WORKER_UNBOUND; 1751 worker->flags |= WORKER_UNBOUND;
@@ -1833,9 +1754,9 @@ static struct worker *create_worker(struct worker_pool *pool)
1833 return worker; 1754 return worker;
1834fail: 1755fail:
1835 if (id >= 0) { 1756 if (id >= 0) {
1836 spin_lock_irq(&gcwq->lock); 1757 spin_lock_irq(&pool->lock);
1837 ida_remove(&pool->worker_ida, id); 1758 ida_remove(&pool->worker_ida, id);
1838 spin_unlock_irq(&gcwq->lock); 1759 spin_unlock_irq(&pool->lock);
1839 } 1760 }
1840 kfree(worker); 1761 kfree(worker);
1841 return NULL; 1762 return NULL;
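
/*
 * A hedged userspace model of the ida_pre_get()/ida_get_new() dance in
 * create_worker() above: the lock protects handing out an ID, but memory
 * for a larger ID table must be allocated with the lock dropped, so the
 * caller preallocates and retries.  "toy_ida" and its helpers are invented
 * for this sketch and are single-threaded by design.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_ida {
	int *slots;	/* slots[i] != 0 means ID i is taken */
	int capacity;
	int *spare;	/* preallocated table, installed under the lock */
	int spare_cap;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* must be called with the lock held; -1 means "need more room" */
static int toy_ida_get_new(struct toy_ida *ida)
{
	for (int i = 0; i < ida->capacity; i++)
		if (!ida->slots[i]) { ida->slots[i] = 1; return i; }
	if (ida->spare) {		/* adopt the preallocated table */
		for (int i = 0; i < ida->capacity; i++)
			ida->spare[i] = ida->slots[i];
		free(ida->slots);
		ida->slots = ida->spare;
		ida->capacity = ida->spare_cap;
		ida->spare = NULL;
		return toy_ida_get_new(ida);
	}
	return -1;
}

/* may allocate: call with the lock dropped */
static int toy_ida_pre_get(struct toy_ida *ida)
{
	int cap = ida->capacity ? ida->capacity * 2 : 4;
	int *mem = calloc(cap, sizeof(*mem));

	if (!mem)
		return 0;
	pthread_mutex_lock(&lock);
	if (!ida->spare) { ida->spare = mem; ida->spare_cap = cap; mem = NULL; }
	pthread_mutex_unlock(&lock);
	free(mem);	/* a spare was already installed */
	return 1;
}

static int alloc_id(struct toy_ida *ida)
{
	int id;

	pthread_mutex_lock(&lock);
	while ((id = toy_ida_get_new(ida)) < 0) {
		pthread_mutex_unlock(&lock);
		if (!toy_ida_pre_get(ida))
			return -1;	/* allocation failed entirely */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return id;
}

int main(void)
{
	struct toy_ida ida = { 0 };

	for (int i = 0; i < 6; i++)
		printf("got id %d\n", alloc_id(&ida));
	return 0;
}
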
@@ -1845,10 +1766,10 @@ fail:
1845 * start_worker - start a newly created worker 1766 * start_worker - start a newly created worker
1846 * @worker: worker to start 1767 * @worker: worker to start
1847 * 1768 *
1848 * Make the gcwq aware of @worker and start it. 1769 * Make the pool aware of @worker and start it.
1849 * 1770 *
1850 * CONTEXT: 1771 * CONTEXT:
1851 * spin_lock_irq(gcwq->lock). 1772 * spin_lock_irq(pool->lock).
1852 */ 1773 */
1853static void start_worker(struct worker *worker) 1774static void start_worker(struct worker *worker)
1854{ 1775{
@@ -1862,15 +1783,14 @@ static void start_worker(struct worker *worker)
1862 * destroy_worker - destroy a workqueue worker 1783 * destroy_worker - destroy a workqueue worker
1863 * @worker: worker to be destroyed 1784 * @worker: worker to be destroyed
1864 * 1785 *
1865 * Destroy @worker and adjust @gcwq stats accordingly. 1786 * Destroy @worker and adjust @pool stats accordingly.
1866 * 1787 *
1867 * CONTEXT: 1788 * CONTEXT:
1868 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 1789 * spin_lock_irq(pool->lock) which is released and regrabbed.
1869 */ 1790 */
1870static void destroy_worker(struct worker *worker) 1791static void destroy_worker(struct worker *worker)
1871{ 1792{
1872 struct worker_pool *pool = worker->pool; 1793 struct worker_pool *pool = worker->pool;
1873 struct global_cwq *gcwq = pool->gcwq;
1874 int id = worker->id; 1794 int id = worker->id;
1875 1795
1876 /* sanity check frenzy */ 1796 /* sanity check frenzy */
@@ -1885,21 +1805,20 @@ static void destroy_worker(struct worker *worker)
1885 list_del_init(&worker->entry); 1805 list_del_init(&worker->entry);
1886 worker->flags |= WORKER_DIE; 1806 worker->flags |= WORKER_DIE;
1887 1807
1888 spin_unlock_irq(&gcwq->lock); 1808 spin_unlock_irq(&pool->lock);
1889 1809
1890 kthread_stop(worker->task); 1810 kthread_stop(worker->task);
1891 kfree(worker); 1811 kfree(worker);
1892 1812
1893 spin_lock_irq(&gcwq->lock); 1813 spin_lock_irq(&pool->lock);
1894 ida_remove(&pool->worker_ida, id); 1814 ida_remove(&pool->worker_ida, id);
1895} 1815}
1896 1816
1897static void idle_worker_timeout(unsigned long __pool) 1817static void idle_worker_timeout(unsigned long __pool)
1898{ 1818{
1899 struct worker_pool *pool = (void *)__pool; 1819 struct worker_pool *pool = (void *)__pool;
1900 struct global_cwq *gcwq = pool->gcwq;
1901 1820
1902 spin_lock_irq(&gcwq->lock); 1821 spin_lock_irq(&pool->lock);
1903 1822
1904 if (too_many_workers(pool)) { 1823 if (too_many_workers(pool)) {
1905 struct worker *worker; 1824 struct worker *worker;
@@ -1918,20 +1837,20 @@ static void idle_worker_timeout(unsigned long __pool)
1918 } 1837 }
1919 } 1838 }
1920 1839
1921 spin_unlock_irq(&gcwq->lock); 1840 spin_unlock_irq(&pool->lock);
1922} 1841}
1923 1842
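/*
 * A hedged userspace sketch of the idle-reaping policy behind
 * idle_worker_timeout() above: while the pool keeps too many idle workers,
 * look at the one that has been idle the longest and either reap it or
 * re-arm the timer for the time it has left.  The threshold formula,
 * timestamps and the direct "destroy" are simplifications made up for the
 * sketch, not the kernel's exact behaviour.
 */
#include <stdio.h>

#define IDLE_WORKER_TIMEOUT	300	/* seconds, for the sketch */

struct toy_pool {
	int nr_workers;
	int nr_idle;
	long oldest_idle_since;	/* when the longest-idle worker went idle */
};

static int too_many_workers(const struct toy_pool *pool)
{
	/* keep a small cushion of idle workers, reap the rest (simplified) */
	return pool->nr_idle > 2 && (pool->nr_idle - 2) * 4 >= pool->nr_workers;
}

static void idle_worker_timeout(struct toy_pool *pool, long now)
{
	while (too_many_workers(pool)) {
		long expires = pool->oldest_idle_since + IDLE_WORKER_TIMEOUT;

		if (expires > now) {
			printf("re-arm idle timer for %lds\n", expires - now);
			return;
		}
		printf("reaping one idle worker\n");
		pool->nr_workers--;
		pool->nr_idle--;
		pool->oldest_idle_since += 30;	/* next-oldest went idle later */
	}
	printf("idle worker count is fine\n");
}

int main(void)
{
	struct toy_pool pool = {
		.nr_workers = 8, .nr_idle = 6, .oldest_idle_since = 0,
	};

	idle_worker_timeout(&pool, 1000);	/* well past the timeout */
	return 0;
}
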
1924static bool send_mayday(struct work_struct *work) 1843static bool send_mayday(struct work_struct *work)
1925{ 1844{
1926 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1845 struct pool_workqueue *pwq = get_work_pwq(work);
1927 struct workqueue_struct *wq = cwq->wq; 1846 struct workqueue_struct *wq = pwq->wq;
1928 unsigned int cpu; 1847 unsigned int cpu;
1929 1848
1930 if (!(wq->flags & WQ_RESCUER)) 1849 if (!(wq->flags & WQ_RESCUER))
1931 return false; 1850 return false;
1932 1851
1933 /* mayday mayday mayday */ 1852 /* mayday mayday mayday */
1934 cpu = cwq->pool->gcwq->cpu; 1853 cpu = pwq->pool->cpu;
1935 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1854 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1936 if (cpu == WORK_CPU_UNBOUND) 1855 if (cpu == WORK_CPU_UNBOUND)
1937 cpu = 0; 1856 cpu = 0;
@@ -1940,13 +1859,12 @@ static bool send_mayday(struct work_struct *work)
1940 return true; 1859 return true;
1941} 1860}
1942 1861
1943static void gcwq_mayday_timeout(unsigned long __pool) 1862static void pool_mayday_timeout(unsigned long __pool)
1944{ 1863{
1945 struct worker_pool *pool = (void *)__pool; 1864 struct worker_pool *pool = (void *)__pool;
1946 struct global_cwq *gcwq = pool->gcwq;
1947 struct work_struct *work; 1865 struct work_struct *work;
1948 1866
1949 spin_lock_irq(&gcwq->lock); 1867 spin_lock_irq(&pool->lock);
1950 1868
1951 if (need_to_create_worker(pool)) { 1869 if (need_to_create_worker(pool)) {
1952 /* 1870 /*
@@ -1959,7 +1877,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1959 send_mayday(work); 1877 send_mayday(work);
1960 } 1878 }
1961 1879
1962 spin_unlock_irq(&gcwq->lock); 1880 spin_unlock_irq(&pool->lock);
1963 1881
1964 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1882 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1965} 1883}
@@ -1978,24 +1896,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1978 * may_start_working() true. 1896 * may_start_working() true.
1979 * 1897 *
1980 * LOCKING: 1898 * LOCKING:
1981 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1899 * spin_lock_irq(pool->lock) which may be released and regrabbed
1982 * multiple times. Does GFP_KERNEL allocations. Called only from 1900 * multiple times. Does GFP_KERNEL allocations. Called only from
1983 * manager. 1901 * manager.
1984 * 1902 *
1985 * RETURNS: 1903 * RETURNS:
1986 * false if no action was taken and gcwq->lock stayed locked, true 1904 * false if no action was taken and pool->lock stayed locked, true
1987 * otherwise. 1905 * otherwise.
1988 */ 1906 */
1989static bool maybe_create_worker(struct worker_pool *pool) 1907static bool maybe_create_worker(struct worker_pool *pool)
1990__releases(&gcwq->lock) 1908__releases(&pool->lock)
1991__acquires(&gcwq->lock) 1909__acquires(&pool->lock)
1992{ 1910{
1993 struct global_cwq *gcwq = pool->gcwq;
1994
1995 if (!need_to_create_worker(pool)) 1911 if (!need_to_create_worker(pool))
1996 return false; 1912 return false;
1997restart: 1913restart:
1998 spin_unlock_irq(&gcwq->lock); 1914 spin_unlock_irq(&pool->lock);
1999 1915
2000 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1916 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
2001 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1917 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2006,7 +1922,7 @@ restart:
2006 worker = create_worker(pool); 1922 worker = create_worker(pool);
2007 if (worker) { 1923 if (worker) {
2008 del_timer_sync(&pool->mayday_timer); 1924 del_timer_sync(&pool->mayday_timer);
2009 spin_lock_irq(&gcwq->lock); 1925 spin_lock_irq(&pool->lock);
2010 start_worker(worker); 1926 start_worker(worker);
2011 BUG_ON(need_to_create_worker(pool)); 1927 BUG_ON(need_to_create_worker(pool));
2012 return true; 1928 return true;
@@ -2023,7 +1939,7 @@ restart:
2023 } 1939 }
2024 1940
2025 del_timer_sync(&pool->mayday_timer); 1941 del_timer_sync(&pool->mayday_timer);
2026 spin_lock_irq(&gcwq->lock); 1942 spin_lock_irq(&pool->lock);
2027 if (need_to_create_worker(pool)) 1943 if (need_to_create_worker(pool))
2028 goto restart; 1944 goto restart;
2029 return true; 1945 return true;
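
/*
 * A simplified, userspace-only model of the retry loop in
 * maybe_create_worker() above.  The real code drops pool->lock, arms a
 * mayday timer and backs off between creation attempts; here that is
 * reduced to a plain deadline loop.  try_create_worker() and the timeout
 * values are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define CREATE_COOLDOWN_MS	100	/* back-off between attempts */
#define MAYDAY_INITIAL_MS	1000	/* give up and (in the kernel) cry for help */

static bool try_create_worker(void)
{
	static int attempts;

	return ++attempts >= 3;		/* pretend the third attempt succeeds */
}

static bool maybe_create_worker(void)
{
	struct timespec delay = { 0, CREATE_COOLDOWN_MS * 1000000L };
	long waited_ms = 0;

	while (waited_ms < MAYDAY_INITIAL_MS) {
		if (try_create_worker()) {
			printf("worker created after %ld ms\n", waited_ms);
			return true;
		}
		nanosleep(&delay, NULL);	/* back off before retrying */
		waited_ms += CREATE_COOLDOWN_MS;
	}
	printf("could not create a worker before the mayday deadline\n");
	return false;
}

int main(void)
{
	maybe_create_worker();
	return 0;
}
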
@@ -2037,11 +1953,11 @@ restart:
2037 * IDLE_WORKER_TIMEOUT. 1953 * IDLE_WORKER_TIMEOUT.
2038 * 1954 *
2039 * LOCKING: 1955 * LOCKING:
2040 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1956 * spin_lock_irq(pool->lock) which may be released and regrabbed
2041 * multiple times. Called only from manager. 1957 * multiple times. Called only from manager.
2042 * 1958 *
2043 * RETURNS: 1959 * RETURNS:
2044 * false if no action was taken and gcwq->lock stayed locked, true 1960 * false if no action was taken and pool->lock stayed locked, true
2045 * otherwise. 1961 * otherwise.
2046 */ 1962 */
2047static bool maybe_destroy_workers(struct worker_pool *pool) 1963static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2071,21 +1987,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2071 * manage_workers - manage worker pool 1987 * manage_workers - manage worker pool
2072 * @worker: self 1988 * @worker: self
2073 * 1989 *
2074 * Assume the manager role and manage gcwq worker pool @worker belongs 1990 * Assume the manager role and manage the worker pool @worker belongs
2075 * to. At any given time, there can be only zero or one manager per 1991 * to. At any given time, there can be only zero or one manager per
2076 * gcwq. The exclusion is handled automatically by this function. 1992 * pool. The exclusion is handled automatically by this function.
2077 * 1993 *
2078 * The caller can safely start processing works on false return. On 1994 * The caller can safely start processing works on false return. On
2079 * true return, it's guaranteed that need_to_create_worker() is false 1995 * true return, it's guaranteed that need_to_create_worker() is false
2080 * and may_start_working() is true. 1996 * and may_start_working() is true.
2081 * 1997 *
2082 * CONTEXT: 1998 * CONTEXT:
2083 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1999 * spin_lock_irq(pool->lock) which may be released and regrabbed
2084 * multiple times. Does GFP_KERNEL allocations. 2000 * multiple times. Does GFP_KERNEL allocations.
2085 * 2001 *
2086 * RETURNS: 2002 * RETURNS:
2087 * false if no action was taken and gcwq->lock stayed locked, true if 2003 * false if no action was taken and pool->lock stayed locked, true if
2088 * some action was taken. 2004 * some action was taken.
2089 */ 2005 */
2090static bool manage_workers(struct worker *worker) 2006static bool manage_workers(struct worker *worker)
2091{ 2007{
@@ -2107,20 +2023,20 @@ static bool manage_workers(struct worker *worker)
2107 * manager against CPU hotplug. 2023 * manager against CPU hotplug.
2108 * 2024 *
2109 * assoc_mutex would always be free unless CPU hotplug is in 2025 * assoc_mutex would always be free unless CPU hotplug is in
2110 * progress. trylock first without dropping @gcwq->lock. 2026 * progress. trylock first without dropping @pool->lock.
2111 */ 2027 */
2112 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2028 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2113 spin_unlock_irq(&pool->gcwq->lock); 2029 spin_unlock_irq(&pool->lock);
2114 mutex_lock(&pool->assoc_mutex); 2030 mutex_lock(&pool->assoc_mutex);
2115 /* 2031 /*
2116 * CPU hotplug could have happened while we were waiting 2032 * CPU hotplug could have happened while we were waiting
2117 * for assoc_mutex. Hotplug itself can't handle us 2033 * for assoc_mutex. Hotplug itself can't handle us
2118 * because manager isn't either on idle or busy list, and 2034 * because manager isn't either on idle or busy list, and
2119 * @gcwq's state and ours could have deviated. 2035 * @pool's state and ours could have deviated.
2120 * 2036 *
2121 * As hotplug is now excluded via assoc_mutex, we can 2037 * As hotplug is now excluded via assoc_mutex, we can
2122 * simply try to bind. It will succeed or fail depending 2038 * simply try to bind. It will succeed or fail depending
2123 * on @gcwq's current state. Try it and adjust 2039 * on @pool's current state. Try it and adjust
2124 * %WORKER_UNBOUND accordingly. 2040 * %WORKER_UNBOUND accordingly.
2125 */ 2041 */
2126 if (worker_maybe_bind_and_lock(worker)) 2042 if (worker_maybe_bind_and_lock(worker))
@@ -2157,18 +2073,15 @@ static bool manage_workers(struct worker *worker)
2157 * call this function to process a work. 2073 * call this function to process a work.
2158 * 2074 *
2159 * CONTEXT: 2075 * CONTEXT:
2160 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 2076 * spin_lock_irq(pool->lock) which is released and regrabbed.
2161 */ 2077 */
2162static void process_one_work(struct worker *worker, struct work_struct *work) 2078static void process_one_work(struct worker *worker, struct work_struct *work)
2163__releases(&gcwq->lock) 2079__releases(&pool->lock)
2164__acquires(&gcwq->lock) 2080__acquires(&pool->lock)
2165{ 2081{
2166 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 2082 struct pool_workqueue *pwq = get_work_pwq(work);
2167 struct worker_pool *pool = worker->pool; 2083 struct worker_pool *pool = worker->pool;
2168 struct global_cwq *gcwq = pool->gcwq; 2084 bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2169 struct hlist_head *bwh = busy_worker_head(gcwq, work);
2170 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2171 work_func_t f = work->func;
2172 int work_color; 2085 int work_color;
2173 struct worker *collision; 2086 struct worker *collision;
2174#ifdef CONFIG_LOCKDEP 2087#ifdef CONFIG_LOCKDEP
@@ -2186,11 +2099,11 @@ __acquires(&gcwq->lock)
2186 /* 2099 /*
2187 * Ensure we're on the correct CPU. DISASSOCIATED test is 2100 * Ensure we're on the correct CPU. DISASSOCIATED test is
2188 * necessary to avoid spurious warnings from rescuers servicing the 2101 * necessary to avoid spurious warnings from rescuers servicing the
2189 * unbound or a disassociated gcwq. 2102 * unbound or a disassociated pool.
2190 */ 2103 */
2191 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && 2104 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2192 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2105 !(pool->flags & POOL_DISASSOCIATED) &&
2193 raw_smp_processor_id() != gcwq->cpu); 2106 raw_smp_processor_id() != pool->cpu);
2194 2107
2195 /* 2108 /*
2196 * A single work shouldn't be executed concurrently by 2109 * A single work shouldn't be executed concurrently by
@@ -2198,7 +2111,7 @@ __acquires(&gcwq->lock)
2198 * already processing the work. If so, defer the work to the 2111 * already processing the work. If so, defer the work to the
2199 * currently executing one. 2112 * currently executing one.
2200 */ 2113 */
2201 collision = __find_worker_executing_work(gcwq, bwh, work); 2114 collision = find_worker_executing_work(pool, work);
2202 if (unlikely(collision)) { 2115 if (unlikely(collision)) {
2203 move_linked_works(work, &collision->scheduled, NULL); 2116 move_linked_works(work, &collision->scheduled, NULL);
2204 return; 2117 return;
@@ -2206,9 +2119,10 @@ __acquires(&gcwq->lock)
2206 2119
2207 /* claim and dequeue */ 2120 /* claim and dequeue */
2208 debug_work_deactivate(work); 2121 debug_work_deactivate(work);
2209 hlist_add_head(&worker->hentry, bwh); 2122 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
2210 worker->current_work = work; 2123 worker->current_work = work;
2211 worker->current_cwq = cwq; 2124 worker->current_func = work->func;
2125 worker->current_pwq = pwq;
2212 work_color = get_work_color(work); 2126 work_color = get_work_color(work);
2213 2127
2214 list_del_init(&work->entry); 2128 list_del_init(&work->entry);
@@ -2221,53 +2135,55 @@ __acquires(&gcwq->lock)
2221 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 2135 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2222 2136
2223 /* 2137 /*
2224 * Unbound gcwq isn't concurrency managed and work items should be 2138 * Unbound pool isn't concurrency managed and work items should be
2225 * executed ASAP. Wake up another worker if necessary. 2139 * executed ASAP. Wake up another worker if necessary.
2226 */ 2140 */
2227 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2141 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2228 wake_up_worker(pool); 2142 wake_up_worker(pool);
2229 2143
2230 /* 2144 /*
2231 * Record the last CPU and clear PENDING which should be the last 2145 * Record the last pool and clear PENDING which should be the last
2232 * update to @work. Also, do this inside @gcwq->lock so that 2146 * update to @work. Also, do this inside @pool->lock so that
2233 * PENDING and queued state changes happen together while IRQ is 2147 * PENDING and queued state changes happen together while IRQ is
2234 * disabled. 2148 * disabled.
2235 */ 2149 */
2236 set_work_cpu_and_clear_pending(work, gcwq->cpu); 2150 set_work_pool_and_clear_pending(work, pool->id);
2237 2151
2238 spin_unlock_irq(&gcwq->lock); 2152 spin_unlock_irq(&pool->lock);
2239 2153
2240 lock_map_acquire_read(&cwq->wq->lockdep_map); 2154 lock_map_acquire_read(&pwq->wq->lockdep_map);
2241 lock_map_acquire(&lockdep_map); 2155 lock_map_acquire(&lockdep_map);
2242 trace_workqueue_execute_start(work); 2156 trace_workqueue_execute_start(work);
2243 f(work); 2157 worker->current_func(work);
2244 /* 2158 /*
2245 * While we must be careful to not use "work" after this, the trace 2159 * While we must be careful to not use "work" after this, the trace
2246 * point will only record its address. 2160 * point will only record its address.
2247 */ 2161 */
2248 trace_workqueue_execute_end(work); 2162 trace_workqueue_execute_end(work);
2249 lock_map_release(&lockdep_map); 2163 lock_map_release(&lockdep_map);
2250 lock_map_release(&cwq->wq->lockdep_map); 2164 lock_map_release(&pwq->wq->lockdep_map);
2251 2165
2252 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2166 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2253 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2167 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2254 " last function: %pf\n", 2168 " last function: %pf\n",
2255 current->comm, preempt_count(), task_pid_nr(current), f); 2169 current->comm, preempt_count(), task_pid_nr(current),
2170 worker->current_func);
2256 debug_show_held_locks(current); 2171 debug_show_held_locks(current);
2257 dump_stack(); 2172 dump_stack();
2258 } 2173 }
2259 2174
2260 spin_lock_irq(&gcwq->lock); 2175 spin_lock_irq(&pool->lock);
2261 2176
2262 /* clear cpu intensive status */ 2177 /* clear cpu intensive status */
2263 if (unlikely(cpu_intensive)) 2178 if (unlikely(cpu_intensive))
2264 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2179 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2265 2180
2266 /* we're done with it, release */ 2181 /* we're done with it, release */
2267 hlist_del_init(&worker->hentry); 2182 hash_del(&worker->hentry);
2268 worker->current_work = NULL; 2183 worker->current_work = NULL;
2269 worker->current_cwq = NULL; 2184 worker->current_func = NULL;
2270 cwq_dec_nr_in_flight(cwq, work_color); 2185 worker->current_pwq = NULL;
2186 pwq_dec_nr_in_flight(pwq, work_color);
2271} 2187}
2272 2188
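/*
 * A small userspace sketch of what the busy_hash conversion above buys:
 * workers currently executing a work item are hashed by the work item's
 * address, so the non-reentrancy checks in __queue_work() and
 * process_one_work() become a single hash lookup.  The tiny table and the
 * hash function below are stand-ins invented for the sketch, not the
 * kernel's hashtable helpers.
 */
#include <stdint.h>
#include <stdio.h>

#define BUSY_HASH_BITS	4
#define BUSY_HASH_SIZE	(1 << BUSY_HASH_BITS)

struct toy_work { const char *name; };

struct toy_worker {
	int id;
	struct toy_work *current_work;	/* what this worker is running now */
	struct toy_worker *hnext;	/* hash chain */
};

static struct toy_worker *busy_hash[BUSY_HASH_SIZE];

static unsigned int hash_work(struct toy_work *work)
{
	/* mix the pointer bits a little; anything stable works for a sketch */
	uintptr_t v = (uintptr_t)work;

	return (unsigned int)((v >> 4) ^ (v >> 12)) & (BUSY_HASH_SIZE - 1);
}

static void mark_busy(struct toy_worker *worker, struct toy_work *work)
{
	unsigned int b = hash_work(work);

	worker->current_work = work;
	worker->hnext = busy_hash[b];
	busy_hash[b] = worker;
}

static struct toy_worker *find_worker_executing_work(struct toy_work *work)
{
	for (struct toy_worker *w = busy_hash[hash_work(work)]; w; w = w->hnext)
		if (w->current_work == work)
			return w;
	return NULL;
}

int main(void)
{
	struct toy_work w1 = { "flush" }, w2 = { "reclaim" };
	struct toy_worker a = { .id = 1 }, b = { .id = 2 };

	mark_busy(&a, &w1);
	mark_busy(&b, &w2);
	printf("%s runs on worker %d\n", w1.name, find_worker_executing_work(&w1)->id);
	printf("%s runs on worker %d\n", w2.name, find_worker_executing_work(&w2)->id);
	return 0;
}
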
2273/** 2189/**
@@ -2279,7 +2195,7 @@ __acquires(&gcwq->lock)
2279 * fetches a work from the top and executes it. 2195 * fetches a work from the top and executes it.
2280 * 2196 *
2281 * CONTEXT: 2197 * CONTEXT:
2282 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2198 * spin_lock_irq(pool->lock) which may be released and regrabbed
2283 * multiple times. 2199 * multiple times.
2284 */ 2200 */
2285static void process_scheduled_works(struct worker *worker) 2201static void process_scheduled_works(struct worker *worker)
@@ -2295,8 +2211,8 @@ static void process_scheduled_works(struct worker *worker)
2295 * worker_thread - the worker thread function 2211 * worker_thread - the worker thread function
2296 * @__worker: self 2212 * @__worker: self
2297 * 2213 *
2298 * The gcwq worker thread function. There's a single dynamic pool of 2214 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
2299 * these per each cpu. These workers process all works regardless of 2215 * of these per each cpu. These workers process all works regardless of
2300 * their specific target workqueue. The only exception is works which 2216 * their specific target workqueue. The only exception is works which
2301 * belong to workqueues with a rescuer which will be explained in 2217 * belong to workqueues with a rescuer which will be explained in
2302 * rescuer_thread(). 2218 * rescuer_thread().
@@ -2305,16 +2221,15 @@ static int worker_thread(void *__worker)
2305{ 2221{
2306 struct worker *worker = __worker; 2222 struct worker *worker = __worker;
2307 struct worker_pool *pool = worker->pool; 2223 struct worker_pool *pool = worker->pool;
2308 struct global_cwq *gcwq = pool->gcwq;
2309 2224
2310 /* tell the scheduler that this is a workqueue worker */ 2225 /* tell the scheduler that this is a workqueue worker */
2311 worker->task->flags |= PF_WQ_WORKER; 2226 worker->task->flags |= PF_WQ_WORKER;
2312woke_up: 2227woke_up:
2313 spin_lock_irq(&gcwq->lock); 2228 spin_lock_irq(&pool->lock);
2314 2229
2315 /* we are off idle list if destruction or rebind is requested */ 2230 /* we are off idle list if destruction or rebind is requested */
2316 if (unlikely(list_empty(&worker->entry))) { 2231 if (unlikely(list_empty(&worker->entry))) {
2317 spin_unlock_irq(&gcwq->lock); 2232 spin_unlock_irq(&pool->lock);
2318 2233
2319 /* if DIE is set, destruction is requested */ 2234 /* if DIE is set, destruction is requested */
2320 if (worker->flags & WORKER_DIE) { 2235 if (worker->flags & WORKER_DIE) {
@@ -2373,52 +2288,59 @@ sleep:
2373 goto recheck; 2288 goto recheck;
2374 2289
2375 /* 2290 /*
2376 * gcwq->lock is held and there's no work to process and no 2291 * pool->lock is held and there's no work to process and no need to
2377 * need to manage, sleep. Workers are woken up only while 2292 * manage, sleep. Workers are woken up only while holding
2378 * holding gcwq->lock or from local cpu, so setting the 2293 * pool->lock or from local cpu, so setting the current state
2379 * current state before releasing gcwq->lock is enough to 2294 * before releasing pool->lock is enough to prevent losing any
2380 * prevent losing any event. 2295 * event.
2381 */ 2296 */
2382 worker_enter_idle(worker); 2297 worker_enter_idle(worker);
2383 __set_current_state(TASK_INTERRUPTIBLE); 2298 __set_current_state(TASK_INTERRUPTIBLE);
2384 spin_unlock_irq(&gcwq->lock); 2299 spin_unlock_irq(&pool->lock);
2385 schedule(); 2300 schedule();
2386 goto woke_up; 2301 goto woke_up;
2387} 2302}
2388 2303
2389/** 2304/**
2390 * rescuer_thread - the rescuer thread function 2305 * rescuer_thread - the rescuer thread function
2391 * @__wq: the associated workqueue 2306 * @__rescuer: self
2392 * 2307 *
2393 * Workqueue rescuer thread function. There's one rescuer for each 2308 * Workqueue rescuer thread function. There's one rescuer for each
2394 * workqueue which has WQ_RESCUER set. 2309 * workqueue which has WQ_RESCUER set.
2395 * 2310 *
2396 * Regular work processing on a gcwq may block trying to create a new 2311 * Regular work processing on a pool may block trying to create a new
2397 * worker which uses GFP_KERNEL allocation which has slight chance of 2312 * worker which uses GFP_KERNEL allocation which has slight chance of
2398 * developing into deadlock if some works currently on the same queue 2313 * developing into deadlock if some works currently on the same queue
2399 * need to be processed to satisfy the GFP_KERNEL allocation. This is 2314 * need to be processed to satisfy the GFP_KERNEL allocation. This is
2400 * the problem rescuer solves. 2315 * the problem rescuer solves.
2401 * 2316 *
2402 * When such condition is possible, the gcwq summons rescuers of all 2317 * When such condition is possible, the pool summons rescuers of all
2403 * workqueues which have works queued on the gcwq and let them process 2318 * workqueues which have works queued on the pool and let them process
2404 * those works so that forward progress can be guaranteed. 2319 * those works so that forward progress can be guaranteed.
2405 * 2320 *
2406 * This should happen rarely. 2321 * This should happen rarely.
2407 */ 2322 */
2408static int rescuer_thread(void *__wq) 2323static int rescuer_thread(void *__rescuer)
2409{ 2324{
2410 struct workqueue_struct *wq = __wq; 2325 struct worker *rescuer = __rescuer;
2411 struct worker *rescuer = wq->rescuer; 2326 struct workqueue_struct *wq = rescuer->rescue_wq;
2412 struct list_head *scheduled = &rescuer->scheduled; 2327 struct list_head *scheduled = &rescuer->scheduled;
2413 bool is_unbound = wq->flags & WQ_UNBOUND; 2328 bool is_unbound = wq->flags & WQ_UNBOUND;
2414 unsigned int cpu; 2329 unsigned int cpu;
2415 2330
2416 set_user_nice(current, RESCUER_NICE_LEVEL); 2331 set_user_nice(current, RESCUER_NICE_LEVEL);
2332
2333 /*
2334 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
2335 * doesn't participate in concurrency management.
2336 */
2337 rescuer->task->flags |= PF_WQ_WORKER;
2417repeat: 2338repeat:
2418 set_current_state(TASK_INTERRUPTIBLE); 2339 set_current_state(TASK_INTERRUPTIBLE);
2419 2340
2420 if (kthread_should_stop()) { 2341 if (kthread_should_stop()) {
2421 __set_current_state(TASK_RUNNING); 2342 __set_current_state(TASK_RUNNING);
2343 rescuer->task->flags &= ~PF_WQ_WORKER;
2422 return 0; 2344 return 0;
2423 } 2345 }
2424 2346
@@ -2428,9 +2350,8 @@ repeat:
2428 */ 2350 */
2429 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2351 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2430 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2352 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2431 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2353 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2432 struct worker_pool *pool = cwq->pool; 2354 struct worker_pool *pool = pwq->pool;
2433 struct global_cwq *gcwq = pool->gcwq;
2434 struct work_struct *work, *n; 2355 struct work_struct *work, *n;
2435 2356
2436 __set_current_state(TASK_RUNNING); 2357 __set_current_state(TASK_RUNNING);
@@ -2446,22 +2367,24 @@ repeat:
2446 */ 2367 */
2447 BUG_ON(!list_empty(&rescuer->scheduled)); 2368 BUG_ON(!list_empty(&rescuer->scheduled));
2448 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2369 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2449 if (get_work_cwq(work) == cwq) 2370 if (get_work_pwq(work) == pwq)
2450 move_linked_works(work, scheduled, &n); 2371 move_linked_works(work, scheduled, &n);
2451 2372
2452 process_scheduled_works(rescuer); 2373 process_scheduled_works(rescuer);
2453 2374
2454 /* 2375 /*
2455 * Leave this gcwq. If keep_working() is %true, notify a 2376 * Leave this pool. If keep_working() is %true, notify a
2456 * regular worker; otherwise, we end up with 0 concurrency 2377 * regular worker; otherwise, we end up with 0 concurrency
2457 * and stalling the execution. 2378 * and stalling the execution.
2458 */ 2379 */
2459 if (keep_working(pool)) 2380 if (keep_working(pool))
2460 wake_up_worker(pool); 2381 wake_up_worker(pool);
2461 2382
2462 spin_unlock_irq(&gcwq->lock); 2383 spin_unlock_irq(&pool->lock);
2463 } 2384 }
2464 2385
2386 /* rescuers should never participate in concurrency management */
2387 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2465 schedule(); 2388 schedule();
2466 goto repeat; 2389 goto repeat;
2467} 2390}
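
/*
 * An illustrative userspace reduction of the rescuer loop above: walk a
 * pool's worklist and move only the entries that belong to the workqueue
 * being rescued onto the rescuer's private "scheduled" list, then process
 * that list.  Singly linked lists stand in for the kernel's list_head
 * machinery; every name here is invented for the sketch.
 */
#include <stdio.h>

struct toy_wq { const char *name; };
struct toy_work { struct toy_wq *wq; const char *name; struct toy_work *next; };

/* detach works belonging to @wq from @worklist and push them onto @scheduled */
static void pull_works_for(struct toy_wq *wq, struct toy_work **worklist,
			   struct toy_work **scheduled)
{
	struct toy_work **pp = worklist;

	while (*pp) {
		struct toy_work *work = *pp;

		if (work->wq == wq) {
			*pp = work->next;		/* unlink from the pool */
			work->next = *scheduled;	/* push onto rescuer list */
			*scheduled = work;
		} else {
			pp = &work->next;
		}
	}
}

int main(void)
{
	struct toy_wq mm = { "writeback" }, other = { "events" };
	struct toy_work w3 = { &other, "w3", NULL };
	struct toy_work w2 = { &mm, "w2", &w3 };
	struct toy_work w1 = { &mm, "w1", &w2 };
	struct toy_work *worklist = &w1, *scheduled = NULL;

	pull_works_for(&mm, &worklist, &scheduled);

	for (struct toy_work *w = scheduled; w; w = w->next)
		printf("rescuer processes %s\n", w->name);
	for (struct toy_work *w = worklist; w; w = w->next)
		printf("%s stays on the pool worklist\n", w->name);
	return 0;
}
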
@@ -2479,7 +2402,7 @@ static void wq_barrier_func(struct work_struct *work)
2479 2402
2480/** 2403/**
2481 * insert_wq_barrier - insert a barrier work 2404 * insert_wq_barrier - insert a barrier work
2482 * @cwq: cwq to insert barrier into 2405 * @pwq: pwq to insert barrier into
2483 * @barr: wq_barrier to insert 2406 * @barr: wq_barrier to insert
2484 * @target: target work to attach @barr to 2407 * @target: target work to attach @barr to
2485 * @worker: worker currently executing @target, NULL if @target is not executing 2408 * @worker: worker currently executing @target, NULL if @target is not executing
@@ -2496,12 +2419,12 @@ static void wq_barrier_func(struct work_struct *work)
2496 * after a work with LINKED flag set. 2419 * after a work with LINKED flag set.
2497 * 2420 *
2498 * Note that when @worker is non-NULL, @target may be modified 2421 * Note that when @worker is non-NULL, @target may be modified
2499 * underneath us, so we can't reliably determine cwq from @target. 2422 * underneath us, so we can't reliably determine pwq from @target.
2500 * 2423 *
2501 * CONTEXT: 2424 * CONTEXT:
2502 * spin_lock_irq(gcwq->lock). 2425 * spin_lock_irq(pool->lock).
2503 */ 2426 */
2504static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2427static void insert_wq_barrier(struct pool_workqueue *pwq,
2505 struct wq_barrier *barr, 2428 struct wq_barrier *barr,
2506 struct work_struct *target, struct worker *worker) 2429 struct work_struct *target, struct worker *worker)
2507{ 2430{
@@ -2509,7 +2432,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2509 unsigned int linked = 0; 2432 unsigned int linked = 0;
2510 2433
2511 /* 2434 /*
2512 * debugobject calls are safe here even with gcwq->lock locked 2435 * debugobject calls are safe here even with pool->lock locked
2513 * as we know for sure that this will not trigger any of the 2436 * as we know for sure that this will not trigger any of the
2514 * checks and call back into the fixup functions where we 2437 * checks and call back into the fixup functions where we
2515 * might deadlock. 2438 * might deadlock.
@@ -2534,23 +2457,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2534 } 2457 }
2535 2458
2536 debug_work_activate(&barr->work); 2459 debug_work_activate(&barr->work);
2537 insert_work(cwq, &barr->work, head, 2460 insert_work(pwq, &barr->work, head,
2538 work_color_to_flags(WORK_NO_COLOR) | linked); 2461 work_color_to_flags(WORK_NO_COLOR) | linked);
2539} 2462}
2540 2463
2541/** 2464/**
2542 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing 2465 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2543 * @wq: workqueue being flushed 2466 * @wq: workqueue being flushed
2544 * @flush_color: new flush color, < 0 for no-op 2467 * @flush_color: new flush color, < 0 for no-op
2545 * @work_color: new work color, < 0 for no-op 2468 * @work_color: new work color, < 0 for no-op
2546 * 2469 *
2547 * Prepare cwqs for workqueue flushing. 2470 * Prepare pwqs for workqueue flushing.
2548 * 2471 *
2549 * If @flush_color is non-negative, flush_color on all cwqs should be 2472 * If @flush_color is non-negative, flush_color on all pwqs should be
2550 * -1. If no cwq has in-flight commands at the specified color, all 2473 * -1. If no pwq has in-flight commands at the specified color, all
2551 * cwq->flush_color's stay at -1 and %false is returned. If any cwq 2474 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
2552 * has in flight commands, its cwq->flush_color is set to 2475 * has in flight commands, its pwq->flush_color is set to
2553 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq 2476 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2554 * wakeup logic is armed and %true is returned. 2477 * wakeup logic is armed and %true is returned.
2555 * 2478 *
2556 * The caller should have initialized @wq->first_flusher prior to 2479 * The caller should have initialized @wq->first_flusher prior to
@@ -2558,7 +2481,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2558 * @flush_color is negative, no flush color update is done and %false 2481 * @flush_color is negative, no flush color update is done and %false
2559 * is returned. 2482 * is returned.
2560 * 2483 *
2561 * If @work_color is non-negative, all cwqs should have the same 2484 * If @work_color is non-negative, all pwqs should have the same
2562 * work_color which is previous to @work_color and all will be 2485 * work_color which is previous to @work_color and all will be
2563 * advanced to @work_color. 2486 * advanced to @work_color.
2564 * 2487 *
@@ -2569,42 +2492,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2569 * %true if @flush_color >= 0 and there's something to flush. %false 2492 * %true if @flush_color >= 0 and there's something to flush. %false
2570 * otherwise. 2493 * otherwise.
2571 */ 2494 */
2572static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, 2495static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2573 int flush_color, int work_color) 2496 int flush_color, int work_color)
2574{ 2497{
2575 bool wait = false; 2498 bool wait = false;
2576 unsigned int cpu; 2499 unsigned int cpu;
2577 2500
2578 if (flush_color >= 0) { 2501 if (flush_color >= 0) {
2579 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); 2502 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
2580 atomic_set(&wq->nr_cwqs_to_flush, 1); 2503 atomic_set(&wq->nr_pwqs_to_flush, 1);
2581 } 2504 }
2582 2505
2583 for_each_cwq_cpu(cpu, wq) { 2506 for_each_pwq_cpu(cpu, wq) {
2584 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2507 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2585 struct global_cwq *gcwq = cwq->pool->gcwq; 2508 struct worker_pool *pool = pwq->pool;
2586 2509
2587 spin_lock_irq(&gcwq->lock); 2510 spin_lock_irq(&pool->lock);
2588 2511
2589 if (flush_color >= 0) { 2512 if (flush_color >= 0) {
2590 BUG_ON(cwq->flush_color != -1); 2513 BUG_ON(pwq->flush_color != -1);
2591 2514
2592 if (cwq->nr_in_flight[flush_color]) { 2515 if (pwq->nr_in_flight[flush_color]) {
2593 cwq->flush_color = flush_color; 2516 pwq->flush_color = flush_color;
2594 atomic_inc(&wq->nr_cwqs_to_flush); 2517 atomic_inc(&wq->nr_pwqs_to_flush);
2595 wait = true; 2518 wait = true;
2596 } 2519 }
2597 } 2520 }
2598 2521
2599 if (work_color >= 0) { 2522 if (work_color >= 0) {
2600 BUG_ON(work_color != work_next_color(cwq->work_color)); 2523 BUG_ON(work_color != work_next_color(pwq->work_color));
2601 cwq->work_color = work_color; 2524 pwq->work_color = work_color;
2602 } 2525 }
2603 2526
2604 spin_unlock_irq(&gcwq->lock); 2527 spin_unlock_irq(&pool->lock);
2605 } 2528 }
2606 2529
2607 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) 2530 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2608 complete(&wq->first_flusher->done); 2531 complete(&wq->first_flusher->done);
2609 2532
2610 return wait; 2533 return wait;
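
/*
 * A userspace sketch of the "bias by one" counting that
 * flush_workqueue_prep_pwqs() above relies on: nr_pwqs_to_flush starts at 1
 * so the completion cannot fire while pwqs are still being scanned, and the
 * initial reference is dropped once the scan is finished.  Plain ints stand
 * in for the atomics and the completion, so this model is single-threaded
 * by design.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_PWQS 4

static int nr_pwqs_to_flush;
static bool flush_done;

static void flush_complete(void)
{
	flush_done = true;
}

static void put_flush_ref(void)
{
	if (--nr_pwqs_to_flush == 0)
		flush_complete();
}

int main(void)
{
	int in_flight[NR_PWQS] = { 2, 0, 1, 0 };	/* works at the flush color */
	int busy = 0;

	nr_pwqs_to_flush = 1;		/* bias: hold the scan's own reference */

	for (int i = 0; i < NR_PWQS; i++)
		if (in_flight[i]) {
			nr_pwqs_to_flush++;	/* this pwq owes a notification */
			busy++;
		}

	put_flush_ref();		/* scan finished, drop the bias */
	printf("after scan: %d busy pwqs, flush_done=%d\n", busy, flush_done);

	/* each busy pwq reports in as its last in-flight work retires */
	for (int i = 0; i < NR_PWQS; i++)
		if (in_flight[i])
			put_flush_ref();

	printf("after retirement: flush_done=%d\n", flush_done);
	return 0;
}
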
@@ -2655,7 +2578,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2655 2578
2656 wq->first_flusher = &this_flusher; 2579 wq->first_flusher = &this_flusher;
2657 2580
2658 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, 2581 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2659 wq->work_color)) { 2582 wq->work_color)) {
2660 /* nothing to flush, done */ 2583 /* nothing to flush, done */
2661 wq->flush_color = next_color; 2584 wq->flush_color = next_color;
@@ -2666,7 +2589,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2666 /* wait in queue */ 2589 /* wait in queue */
2667 BUG_ON(wq->flush_color == this_flusher.flush_color); 2590 BUG_ON(wq->flush_color == this_flusher.flush_color);
2668 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2591 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2669 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2592 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2670 } 2593 }
2671 } else { 2594 } else {
2672 /* 2595 /*
@@ -2733,7 +2656,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2733 2656
2734 list_splice_tail_init(&wq->flusher_overflow, 2657 list_splice_tail_init(&wq->flusher_overflow,
2735 &wq->flusher_queue); 2658 &wq->flusher_queue);
2736 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2659 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2737 } 2660 }
2738 2661
2739 if (list_empty(&wq->flusher_queue)) { 2662 if (list_empty(&wq->flusher_queue)) {
@@ -2743,7 +2666,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2743 2666
2744 /* 2667 /*
2745 * Need to flush more colors. Make the next flusher 2668 * Need to flush more colors. Make the next flusher
2746 * the new first flusher and arm cwqs. 2669 * the new first flusher and arm pwqs.
2747 */ 2670 */
2748 BUG_ON(wq->flush_color == wq->work_color); 2671 BUG_ON(wq->flush_color == wq->work_color);
2749 BUG_ON(wq->flush_color != next->flush_color); 2672 BUG_ON(wq->flush_color != next->flush_color);
@@ -2751,7 +2674,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2751 list_del_init(&next->list); 2674 list_del_init(&next->list);
2752 wq->first_flusher = next; 2675 wq->first_flusher = next;
2753 2676
2754 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) 2677 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2755 break; 2678 break;
2756 2679
2757 /* 2680 /*
@@ -2794,13 +2717,13 @@ void drain_workqueue(struct workqueue_struct *wq)
2794reflush: 2717reflush:
2795 flush_workqueue(wq); 2718 flush_workqueue(wq);
2796 2719
2797 for_each_cwq_cpu(cpu, wq) { 2720 for_each_pwq_cpu(cpu, wq) {
2798 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2721 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2799 bool drained; 2722 bool drained;
2800 2723
2801 spin_lock_irq(&cwq->pool->gcwq->lock); 2724 spin_lock_irq(&pwq->pool->lock);
2802 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2725 drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
2803 spin_unlock_irq(&cwq->pool->gcwq->lock); 2726 spin_unlock_irq(&pwq->pool->lock);
2804 2727
2805 if (drained) 2728 if (drained)
2806 continue; 2729 continue;
@@ -2822,34 +2745,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);
2822static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) 2745static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2823{ 2746{
2824 struct worker *worker = NULL; 2747 struct worker *worker = NULL;
2825 struct global_cwq *gcwq; 2748 struct worker_pool *pool;
2826 struct cpu_workqueue_struct *cwq; 2749 struct pool_workqueue *pwq;
2827 2750
2828 might_sleep(); 2751 might_sleep();
2829 gcwq = get_work_gcwq(work); 2752 pool = get_work_pool(work);
2830 if (!gcwq) 2753 if (!pool)
2831 return false; 2754 return false;
2832 2755
2833 spin_lock_irq(&gcwq->lock); 2756 spin_lock_irq(&pool->lock);
2834 if (!list_empty(&work->entry)) { 2757 /* see the comment in try_to_grab_pending() with the same code */
2835 /* 2758 pwq = get_work_pwq(work);
2836 * See the comment near try_to_grab_pending()->smp_rmb(). 2759 if (pwq) {
2837 * If it was re-queued to a different gcwq under us, we 2760 if (unlikely(pwq->pool != pool))
2838 * are not going to wait.
2839 */
2840 smp_rmb();
2841 cwq = get_work_cwq(work);
2842 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2843 goto already_gone; 2761 goto already_gone;
2844 } else { 2762 } else {
2845 worker = find_worker_executing_work(gcwq, work); 2763 worker = find_worker_executing_work(pool, work);
2846 if (!worker) 2764 if (!worker)
2847 goto already_gone; 2765 goto already_gone;
2848 cwq = worker->current_cwq; 2766 pwq = worker->current_pwq;
2849 } 2767 }
2850 2768
2851 insert_wq_barrier(cwq, barr, work, worker); 2769 insert_wq_barrier(pwq, barr, work, worker);
2852 spin_unlock_irq(&gcwq->lock); 2770 spin_unlock_irq(&pool->lock);
2853 2771
2854 /* 2772 /*
2855 * If @max_active is 1 or rescuer is in use, flushing another work 2773 * If @max_active is 1 or rescuer is in use, flushing another work
@@ -2857,15 +2775,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2857 * flusher is not running on the same workqueue by verifying write 2775 * flusher is not running on the same workqueue by verifying write
2858 * access. 2776 * access.
2859 */ 2777 */
2860 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) 2778 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
2861 lock_map_acquire(&cwq->wq->lockdep_map); 2779 lock_map_acquire(&pwq->wq->lockdep_map);
2862 else 2780 else
2863 lock_map_acquire_read(&cwq->wq->lockdep_map); 2781 lock_map_acquire_read(&pwq->wq->lockdep_map);
2864 lock_map_release(&cwq->wq->lockdep_map); 2782 lock_map_release(&pwq->wq->lockdep_map);
2865 2783
2866 return true; 2784 return true;
2867already_gone: 2785already_gone:
2868 spin_unlock_irq(&gcwq->lock); 2786 spin_unlock_irq(&pool->lock);
2869 return false; 2787 return false;
2870} 2788}
2871 2789
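/*
 * A stripped-down userspace illustration of the barrier idea behind
 * start_flush_work()/insert_wq_barrier() above: to wait for a work item,
 * queue a barrier right behind it and wait for the barrier to run.  A plain
 * array queue replaces the kernel lists and there is no concurrency here,
 * only the ordering argument; all names are invented for the sketch.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_work { void (*func)(struct toy_work *); };

static struct toy_work *queue[16];
static int head, tail;

static void toy_queue_work(struct toy_work *work)
{
	queue[tail++] = work;
}

static void run_queue(void)
{
	while (head < tail) {
		struct toy_work *work = queue[head++];

		work->func(work);
	}
}

struct toy_barrier { struct toy_work work; bool done; };

static void barrier_func(struct toy_work *work)
{
	/* container_of(), by hand: the barrier's work is its first member */
	((struct toy_barrier *)work)->done = true;
}

static void payload(struct toy_work *work)
{
	printf("payload ran\n");
}

int main(void)
{
	struct toy_work target = { payload };
	struct toy_barrier barr = { { barrier_func }, false };

	toy_queue_work(&target);
	toy_queue_work(&barr.work);	/* "flush": barrier follows the target */

	run_queue();			/* stands in for the worker thread */
	printf("barrier done: %d => target has finished\n", barr.done);
	return 0;
}
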
@@ -2961,8 +2879,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
2961{ 2879{
2962 local_irq_disable(); 2880 local_irq_disable();
2963 if (del_timer_sync(&dwork->timer)) 2881 if (del_timer_sync(&dwork->timer))
2964 __queue_work(dwork->cpu, 2882 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
2965 get_work_cwq(&dwork->work)->wq, &dwork->work);
2966 local_irq_enable(); 2883 local_irq_enable();
2967 return flush_work(&dwork->work); 2884 return flush_work(&dwork->work);
2968} 2885}
@@ -2992,7 +2909,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)
2992 if (unlikely(ret < 0)) 2909 if (unlikely(ret < 0))
2993 return false; 2910 return false;
2994 2911
2995 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); 2912 set_work_pool_and_clear_pending(&dwork->work,
2913 get_work_pool_id(&dwork->work));
2996 local_irq_restore(flags); 2914 local_irq_restore(flags);
2997 return ret; 2915 return ret;
2998} 2916}
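
flush_delayed_work() now takes the target workqueue from dwork->wq instead of digging it out of the cwq, and cancel_delayed_work() records the pool ID rather than a CPU when it clears the pending bit. The caller-side pattern is unchanged; a sketch with a hypothetical periodic handler:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_work, poll_fn);        /* hypothetical */

static void poll_fn(struct work_struct *work)
{
        /* ... sample some state, then re-arm ... */
        schedule_delayed_work(&poll_work, HZ);
}

static void poll_start(void)
{
        schedule_delayed_work(&poll_work, HZ);
}

static void poll_stop(void)
{
        /*
         * cancel_delayed_work() only stops an instance that has not started;
         * the _sync variant also waits for a running one to finish.
         */
        cancel_delayed_work_sync(&poll_work);
}
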
@@ -3171,46 +3089,46 @@ int keventd_up(void)
3171 return system_wq != NULL; 3089 return system_wq != NULL;
3172} 3090}
3173 3091
3174static int alloc_cwqs(struct workqueue_struct *wq) 3092static int alloc_pwqs(struct workqueue_struct *wq)
3175{ 3093{
3176 /* 3094 /*
3177 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3095 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
3178 * Make sure that the alignment isn't lower than that of 3096 * Make sure that the alignment isn't lower than that of
3179 * unsigned long long. 3097 * unsigned long long.
3180 */ 3098 */
3181 const size_t size = sizeof(struct cpu_workqueue_struct); 3099 const size_t size = sizeof(struct pool_workqueue);
3182 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3100 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3183 __alignof__(unsigned long long)); 3101 __alignof__(unsigned long long));
3184 3102
3185 if (!(wq->flags & WQ_UNBOUND)) 3103 if (!(wq->flags & WQ_UNBOUND))
3186 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 3104 wq->pool_wq.pcpu = __alloc_percpu(size, align);
3187 else { 3105 else {
3188 void *ptr; 3106 void *ptr;
3189 3107
3190 /* 3108 /*
3191 * Allocate enough room to align cwq and put an extra 3109 * Allocate enough room to align pwq and put an extra
3192 * pointer at the end pointing back to the originally 3110 * pointer at the end pointing back to the originally
3193 * allocated pointer which will be used for free. 3111 * allocated pointer which will be used for free.
3194 */ 3112 */
3195 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3113 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
3196 if (ptr) { 3114 if (ptr) {
3197 wq->cpu_wq.single = PTR_ALIGN(ptr, align); 3115 wq->pool_wq.single = PTR_ALIGN(ptr, align);
3198 *(void **)(wq->cpu_wq.single + 1) = ptr; 3116 *(void **)(wq->pool_wq.single + 1) = ptr;
3199 } 3117 }
3200 } 3118 }
3201 3119
3202 /* just in case, make sure it's actually aligned */ 3120 /* just in case, make sure it's actually aligned */
3203 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 3121 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
3204 return wq->cpu_wq.v ? 0 : -ENOMEM; 3122 return wq->pool_wq.v ? 0 : -ENOMEM;
3205} 3123}
3206 3124
3207static void free_cwqs(struct workqueue_struct *wq) 3125static void free_pwqs(struct workqueue_struct *wq)
3208{ 3126{
3209 if (!(wq->flags & WQ_UNBOUND)) 3127 if (!(wq->flags & WQ_UNBOUND))
3210 free_percpu(wq->cpu_wq.pcpu); 3128 free_percpu(wq->pool_wq.pcpu);
3211 else if (wq->cpu_wq.single) { 3129 else if (wq->pool_wq.single) {
3212 /* the pointer to free is stored right after the cwq */ 3130 /* the pointer to free is stored right after the pwq */
3213 kfree(*(void **)(wq->cpu_wq.single + 1)); 3131 kfree(*(void **)(wq->pool_wq.single + 1));
3214 } 3132 }
3215} 3133}
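
alloc_pwqs() keeps the old allocation trick under a new name: for unbound workqueues it over-allocates by align + sizeof(void *), aligns the object with PTR_ALIGN(), and stores the original kzalloc() pointer immediately after the object so free_pwqs() can free it. The same technique in a stand-alone, userspace-style C sketch (names invented, and @align assumed to be a power of two):

#include <stdint.h>
#include <stdlib.h>

#define ALIGN_UP(p, a)  ((void *)(((uintptr_t)(p) + (a) - 1) & ~((uintptr_t)(a) - 1)))

/* allocate one zeroed object of @size aligned to @align, remembering the raw pointer */
static void *aligned_alloc_with_backptr(size_t size, size_t align)
{
        void *raw = calloc(1, size + align + sizeof(void *));
        void *obj;

        if (!raw)
                return NULL;
        obj = ALIGN_UP(raw, align);
        *(void **)((char *)obj + size) = raw;   /* back pointer lives right after the object */
        return obj;
}

static void aligned_free_with_backptr(void *obj, size_t size)
{
        if (obj)
                free(*(void **)((char *)obj + size));
}
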
3216 3134
@@ -3264,27 +3182,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3264 wq->flags = flags; 3182 wq->flags = flags;
3265 wq->saved_max_active = max_active; 3183 wq->saved_max_active = max_active;
3266 mutex_init(&wq->flush_mutex); 3184 mutex_init(&wq->flush_mutex);
3267 atomic_set(&wq->nr_cwqs_to_flush, 0); 3185 atomic_set(&wq->nr_pwqs_to_flush, 0);
3268 INIT_LIST_HEAD(&wq->flusher_queue); 3186 INIT_LIST_HEAD(&wq->flusher_queue);
3269 INIT_LIST_HEAD(&wq->flusher_overflow); 3187 INIT_LIST_HEAD(&wq->flusher_overflow);
3270 3188
3271 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3189 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3272 INIT_LIST_HEAD(&wq->list); 3190 INIT_LIST_HEAD(&wq->list);
3273 3191
3274 if (alloc_cwqs(wq) < 0) 3192 if (alloc_pwqs(wq) < 0)
3275 goto err; 3193 goto err;
3276 3194
3277 for_each_cwq_cpu(cpu, wq) { 3195 for_each_pwq_cpu(cpu, wq) {
3278 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3196 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3279 struct global_cwq *gcwq = get_gcwq(cpu); 3197
3280 int pool_idx = (bool)(flags & WQ_HIGHPRI); 3198 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3281 3199 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3282 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3200 pwq->wq = wq;
3283 cwq->pool = &gcwq->pools[pool_idx]; 3201 pwq->flush_color = -1;
3284 cwq->wq = wq; 3202 pwq->max_active = max_active;
3285 cwq->flush_color = -1; 3203 INIT_LIST_HEAD(&pwq->delayed_works);
3286 cwq->max_active = max_active;
3287 INIT_LIST_HEAD(&cwq->delayed_works);
3288 } 3204 }
3289 3205
3290 if (flags & WQ_RESCUER) { 3206 if (flags & WQ_RESCUER) {
@@ -3297,7 +3213,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3297 if (!rescuer) 3213 if (!rescuer)
3298 goto err; 3214 goto err;
3299 3215
3300 rescuer->task = kthread_create(rescuer_thread, wq, "%s", 3216 rescuer->rescue_wq = wq;
3217 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3301 wq->name); 3218 wq->name);
3302 if (IS_ERR(rescuer->task)) 3219 if (IS_ERR(rescuer->task))
3303 goto err; 3220 goto err;
@@ -3314,8 +3231,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3314 spin_lock(&workqueue_lock); 3231 spin_lock(&workqueue_lock);
3315 3232
3316 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 3233 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3317 for_each_cwq_cpu(cpu, wq) 3234 for_each_pwq_cpu(cpu, wq)
3318 get_cwq(cpu, wq)->max_active = 0; 3235 get_pwq(cpu, wq)->max_active = 0;
3319 3236
3320 list_add(&wq->list, &workqueues); 3237 list_add(&wq->list, &workqueues);
3321 3238
@@ -3324,7 +3241,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3324 return wq; 3241 return wq;
3325err: 3242err:
3326 if (wq) { 3243 if (wq) {
3327 free_cwqs(wq); 3244 free_pwqs(wq);
3328 free_mayday_mask(wq->mayday_mask); 3245 free_mayday_mask(wq->mayday_mask);
3329 kfree(wq->rescuer); 3246 kfree(wq->rescuer);
3330 kfree(wq); 3247 kfree(wq);
@@ -3355,14 +3272,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
3355 spin_unlock(&workqueue_lock); 3272 spin_unlock(&workqueue_lock);
3356 3273
3357 /* sanity check */ 3274 /* sanity check */
3358 for_each_cwq_cpu(cpu, wq) { 3275 for_each_pwq_cpu(cpu, wq) {
3359 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3276 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3360 int i; 3277 int i;
3361 3278
3362 for (i = 0; i < WORK_NR_COLORS; i++) 3279 for (i = 0; i < WORK_NR_COLORS; i++)
3363 BUG_ON(cwq->nr_in_flight[i]); 3280 BUG_ON(pwq->nr_in_flight[i]);
3364 BUG_ON(cwq->nr_active); 3281 BUG_ON(pwq->nr_active);
3365 BUG_ON(!list_empty(&cwq->delayed_works)); 3282 BUG_ON(!list_empty(&pwq->delayed_works));
3366 } 3283 }
3367 3284
3368 if (wq->flags & WQ_RESCUER) { 3285 if (wq->flags & WQ_RESCUER) {
@@ -3371,29 +3288,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
3371 kfree(wq->rescuer); 3288 kfree(wq->rescuer);
3372 } 3289 }
3373 3290
3374 free_cwqs(wq); 3291 free_pwqs(wq);
3375 kfree(wq); 3292 kfree(wq);
3376} 3293}
3377EXPORT_SYMBOL_GPL(destroy_workqueue); 3294EXPORT_SYMBOL_GPL(destroy_workqueue);
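
__alloc_workqueue_key() now attaches each pwq to a standard worker pool chosen by WQ_HIGHPRI, and destroy_workqueue() asserts that every pwq has drained before freeing. Drivers reach this code through the alloc_workqueue() wrapper; a hypothetical setup/teardown sketch, not taken from this patch:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;        /* hypothetical */

static int demo_setup(void)
{
        /* one in-flight item, freezable across suspend, backed by a rescuer */
        demo_wq = alloc_workqueue("demo_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, 1);
        if (!demo_wq)
                return -ENOMEM;
        return 0;
}

static void demo_teardown(void)
{
        /* all work queued on demo_wq must be finished or cancelled first */
        destroy_workqueue(demo_wq);
}
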
3378 3295
3379/** 3296/**
3380 * cwq_set_max_active - adjust max_active of a cwq 3297 * pwq_set_max_active - adjust max_active of a pwq
3381 * @cwq: target cpu_workqueue_struct 3298 * @pwq: target pool_workqueue
3382 * @max_active: new max_active value. 3299 * @max_active: new max_active value.
3383 * 3300 *
3384 * Set @cwq->max_active to @max_active and activate delayed works if 3301 * Set @pwq->max_active to @max_active and activate delayed works if
3385 * increased. 3302 * increased.
3386 * 3303 *
3387 * CONTEXT: 3304 * CONTEXT:
3388 * spin_lock_irq(gcwq->lock). 3305 * spin_lock_irq(pool->lock).
3389 */ 3306 */
3390static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) 3307static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3391{ 3308{
3392 cwq->max_active = max_active; 3309 pwq->max_active = max_active;
3393 3310
3394 while (!list_empty(&cwq->delayed_works) && 3311 while (!list_empty(&pwq->delayed_works) &&
3395 cwq->nr_active < cwq->max_active) 3312 pwq->nr_active < pwq->max_active)
3396 cwq_activate_first_delayed(cwq); 3313 pwq_activate_first_delayed(pwq);
3397} 3314}
3398 3315
3399/** 3316/**
@@ -3416,16 +3333,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3416 3333
3417 wq->saved_max_active = max_active; 3334 wq->saved_max_active = max_active;
3418 3335
3419 for_each_cwq_cpu(cpu, wq) { 3336 for_each_pwq_cpu(cpu, wq) {
3420 struct global_cwq *gcwq = get_gcwq(cpu); 3337 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3338 struct worker_pool *pool = pwq->pool;
3421 3339
3422 spin_lock_irq(&gcwq->lock); 3340 spin_lock_irq(&pool->lock);
3423 3341
3424 if (!(wq->flags & WQ_FREEZABLE) || 3342 if (!(wq->flags & WQ_FREEZABLE) ||
3425 !(gcwq->flags & GCWQ_FREEZING)) 3343 !(pool->flags & POOL_FREEZING))
3426 cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); 3344 pwq_set_max_active(pwq, max_active);
3427 3345
3428 spin_unlock_irq(&gcwq->lock); 3346 spin_unlock_irq(&pool->lock);
3429 } 3347 }
3430 3348
3431 spin_unlock(&workqueue_lock); 3349 spin_unlock(&workqueue_lock);
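
workqueue_set_max_active() now locks each pwq's pool individually and leaves freezable workqueues alone while their pool has POOL_FREEZING set, but its external behaviour is the same: raising the limit immediately releases delayed works via pwq_set_max_active(). A one-line caller sketch (function name and thresholds hypothetical):

#include <linux/workqueue.h>

/* hypothetical: widen or narrow the concurrency of an existing workqueue */
static void demo_tune_concurrency(struct workqueue_struct *wq, bool busy_period)
{
        workqueue_set_max_active(wq, busy_period ? 16 : 1);
}
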
@@ -3446,57 +3364,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3446 */ 3364 */
3447bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 3365bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3448{ 3366{
3449 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3367 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3450 3368
3451 return !list_empty(&cwq->delayed_works); 3369 return !list_empty(&pwq->delayed_works);
3452} 3370}
3453EXPORT_SYMBOL_GPL(workqueue_congested); 3371EXPORT_SYMBOL_GPL(workqueue_congested);
3454 3372
3455/** 3373/**
3456 * work_cpu - return the last known associated cpu for @work
3457 * @work: the work of interest
3458 *
3459 * RETURNS:
3460 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
3461 */
3462unsigned int work_cpu(struct work_struct *work)
3463{
3464 struct global_cwq *gcwq = get_work_gcwq(work);
3465
3466 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3467}
3468EXPORT_SYMBOL_GPL(work_cpu);
3469
3470/**
3471 * work_busy - test whether a work is currently pending or running 3374 * work_busy - test whether a work is currently pending or running
3472 * @work: the work to be tested 3375 * @work: the work to be tested
3473 * 3376 *
3474 * Test whether @work is currently pending or running. There is no 3377 * Test whether @work is currently pending or running. There is no
3475 * synchronization around this function and the test result is 3378 * synchronization around this function and the test result is
3476 * unreliable and only useful as advisory hints or for debugging. 3379 * unreliable and only useful as advisory hints or for debugging.
3477 * Especially for reentrant wqs, the pending state might hide the
3478 * running state.
3479 * 3380 *
3480 * RETURNS: 3381 * RETURNS:
3481 * OR'd bitmask of WORK_BUSY_* bits. 3382 * OR'd bitmask of WORK_BUSY_* bits.
3482 */ 3383 */
3483unsigned int work_busy(struct work_struct *work) 3384unsigned int work_busy(struct work_struct *work)
3484{ 3385{
3485 struct global_cwq *gcwq = get_work_gcwq(work); 3386 struct worker_pool *pool = get_work_pool(work);
3486 unsigned long flags; 3387 unsigned long flags;
3487 unsigned int ret = 0; 3388 unsigned int ret = 0;
3488 3389
3489 if (!gcwq)
3490 return 0;
3491
3492 spin_lock_irqsave(&gcwq->lock, flags);
3493
3494 if (work_pending(work)) 3390 if (work_pending(work))
3495 ret |= WORK_BUSY_PENDING; 3391 ret |= WORK_BUSY_PENDING;
3496 if (find_worker_executing_work(gcwq, work))
3497 ret |= WORK_BUSY_RUNNING;
3498 3392
3499 spin_unlock_irqrestore(&gcwq->lock, flags); 3393 if (pool) {
3394 spin_lock_irqsave(&pool->lock, flags);
3395 if (find_worker_executing_work(pool, work))
3396 ret |= WORK_BUSY_RUNNING;
3397 spin_unlock_irqrestore(&pool->lock, flags);
3398 }
3500 3399
3501 return ret; 3400 return ret;
3502} 3401}
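
work_busy() no longer requires that the work was ever queued: the PENDING bit is read from the work item itself, and RUNNING is only checked when get_work_pool() finds a pool. The result stays advisory, e.g. for debug output (helper hypothetical):

#include <linux/kernel.h>
#include <linux/workqueue.h>

/* hypothetical debug helper: print an unsynchronized snapshot of a work item's state */
static void demo_report_work_state(struct work_struct *work)
{
        unsigned int busy = work_busy(work);

        pr_info("work %p:%s%s%s\n", work,
                (busy & WORK_BUSY_PENDING) ? " pending" : "",
                (busy & WORK_BUSY_RUNNING) ? " running" : "",
                busy ? "" : " idle");
}
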
@@ -3506,65 +3405,48 @@ EXPORT_SYMBOL_GPL(work_busy);
3506 * CPU hotplug. 3405 * CPU hotplug.
3507 * 3406 *
3508 * There are two challenges in supporting CPU hotplug. Firstly, there 3407 * There are two challenges in supporting CPU hotplug. Firstly, there
3509 * are a lot of assumptions on strong associations among work, cwq and 3408 * are a lot of assumptions on strong associations among work, pwq and
3510 * gcwq which make migrating pending and scheduled works very 3409 * pool which make migrating pending and scheduled works very
3511 * difficult to implement without impacting hot paths. Secondly, 3410 * difficult to implement without impacting hot paths. Secondly,
3512 * gcwqs serve mix of short, long and very long running works making 3411 * worker pools serve mix of short, long and very long running works making
3513 * blocked draining impractical. 3412 * blocked draining impractical.
3514 * 3413 *
3515 * This is solved by allowing a gcwq to be disassociated from the CPU 3414 * This is solved by allowing the pools to be disassociated from the CPU
3516 * running as an unbound one and allowing it to be reattached later if the 3415 * running as an unbound one and allowing it to be reattached later if the
3517 * cpu comes back online. 3416 * cpu comes back online.
3518 */ 3417 */
3519 3418
3520/* claim manager positions of all pools */ 3419static void wq_unbind_fn(struct work_struct *work)
3521static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
3522{ 3420{
3523 struct worker_pool *pool; 3421 int cpu = smp_processor_id();
3524
3525 for_each_worker_pool(pool, gcwq)
3526 mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
3527 spin_lock_irq(&gcwq->lock);
3528}
3529
3530/* release manager positions */
3531static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
3532{
3533 struct worker_pool *pool;
3534
3535 spin_unlock_irq(&gcwq->lock);
3536 for_each_worker_pool(pool, gcwq)
3537 mutex_unlock(&pool->assoc_mutex);
3538}
3539
3540static void gcwq_unbind_fn(struct work_struct *work)
3541{
3542 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3543 struct worker_pool *pool; 3422 struct worker_pool *pool;
3544 struct worker *worker; 3423 struct worker *worker;
3545 struct hlist_node *pos;
3546 int i; 3424 int i;
3547 3425
3548 BUG_ON(gcwq->cpu != smp_processor_id()); 3426 for_each_std_worker_pool(pool, cpu) {
3427 BUG_ON(cpu != smp_processor_id());
3549 3428
3550 gcwq_claim_assoc_and_lock(gcwq); 3429 mutex_lock(&pool->assoc_mutex);
3430 spin_lock_irq(&pool->lock);
3551 3431
3552 /* 3432 /*
3553 * We've claimed all manager positions. Make all workers unbound 3433 * We've claimed all manager positions. Make all workers
3554 * and set DISASSOCIATED. Before this, all workers except for the 3434 * unbound and set DISASSOCIATED. Before this, all workers
3555 * ones which are still executing works from before the last CPU 3435 * except for the ones which are still executing works from
3556 * down must be on the cpu. After this, they may become diasporas. 3436 * before the last CPU down must be on the cpu. After
3557 */ 3437 * this, they may become diasporas.
3558 for_each_worker_pool(pool, gcwq) 3438 */
3559 list_for_each_entry(worker, &pool->idle_list, entry) 3439 list_for_each_entry(worker, &pool->idle_list, entry)
3560 worker->flags |= WORKER_UNBOUND; 3440 worker->flags |= WORKER_UNBOUND;
3561 3441
3562 for_each_busy_worker(worker, i, pos, gcwq) 3442 for_each_busy_worker(worker, i, pool)
3563 worker->flags |= WORKER_UNBOUND; 3443 worker->flags |= WORKER_UNBOUND;
3564 3444
3565 gcwq->flags |= GCWQ_DISASSOCIATED; 3445 pool->flags |= POOL_DISASSOCIATED;
3566 3446
3567 gcwq_release_assoc_and_unlock(gcwq); 3447 spin_unlock_irq(&pool->lock);
3448 mutex_unlock(&pool->assoc_mutex);
3449 }
3568 3450
3569 /* 3451 /*
3570 * Call schedule() so that we cross rq->lock and thus can guarantee 3452 * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3576,16 +3458,16 @@ static void gcwq_unbind_fn(struct work_struct *work)
3576 /* 3458 /*
3577 * Sched callbacks are disabled now. Zap nr_running. After this, 3459 * Sched callbacks are disabled now. Zap nr_running. After this,
3578 * nr_running stays zero and need_more_worker() and keep_working() 3460 * nr_running stays zero and need_more_worker() and keep_working()
3579 * are always true as long as the worklist is not empty. @gcwq now 3461 * are always true as long as the worklist is not empty. Pools on
3580 * behaves as unbound (in terms of concurrency management) gcwq 3462 * @cpu now behave as unbound (in terms of concurrency management)
3581 * which is served by workers tied to the CPU. 3463 * pools which are served by workers tied to the CPU.
3582 * 3464 *
3583 * On return from this function, the current worker would trigger 3465 * On return from this function, the current worker would trigger
3584 * unbound chain execution of pending work items if other workers 3466 * unbound chain execution of pending work items if other workers
3585 * didn't already. 3467 * didn't already.
3586 */ 3468 */
3587 for_each_worker_pool(pool, gcwq) 3469 for_each_std_worker_pool(pool, cpu)
3588 atomic_set(get_pool_nr_running(pool), 0); 3470 atomic_set(&pool->nr_running, 0);
3589} 3471}
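
wq_unbind_fn() is what lets per-CPU work items survive a CPU going offline: every worker of the dying CPU's standard pools is flagged WORKER_UNBOUND and the pools are marked POOL_DISASSOCIATED, so pending items keep executing, just without the CPU affinity guarantee. A sketch of the caller-visible side of that guarantee (work item hypothetical):

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void demo_percpu_fn(struct work_struct *work)
{
        /*
         * Normally runs on the CPU it was queued on; once that CPU's pools
         * are DISASSOCIATED it may run anywhere.
         */
        pr_info("running on CPU %d\n", raw_smp_processor_id());
}
static DECLARE_WORK(demo_percpu_work, demo_percpu_fn);

static void demo_kick(int cpu)
{
        queue_work_on(cpu, system_wq, &demo_percpu_work);
}
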
3590 3472
3591/* 3473/*
@@ -3597,12 +3479,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3597 void *hcpu) 3479 void *hcpu)
3598{ 3480{
3599 unsigned int cpu = (unsigned long)hcpu; 3481 unsigned int cpu = (unsigned long)hcpu;
3600 struct global_cwq *gcwq = get_gcwq(cpu);
3601 struct worker_pool *pool; 3482 struct worker_pool *pool;
3602 3483
3603 switch (action & ~CPU_TASKS_FROZEN) { 3484 switch (action & ~CPU_TASKS_FROZEN) {
3604 case CPU_UP_PREPARE: 3485 case CPU_UP_PREPARE:
3605 for_each_worker_pool(pool, gcwq) { 3486 for_each_std_worker_pool(pool, cpu) {
3606 struct worker *worker; 3487 struct worker *worker;
3607 3488
3608 if (pool->nr_workers) 3489 if (pool->nr_workers)
@@ -3612,18 +3493,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3612 if (!worker) 3493 if (!worker)
3613 return NOTIFY_BAD; 3494 return NOTIFY_BAD;
3614 3495
3615 spin_lock_irq(&gcwq->lock); 3496 spin_lock_irq(&pool->lock);
3616 start_worker(worker); 3497 start_worker(worker);
3617 spin_unlock_irq(&gcwq->lock); 3498 spin_unlock_irq(&pool->lock);
3618 } 3499 }
3619 break; 3500 break;
3620 3501
3621 case CPU_DOWN_FAILED: 3502 case CPU_DOWN_FAILED:
3622 case CPU_ONLINE: 3503 case CPU_ONLINE:
3623 gcwq_claim_assoc_and_lock(gcwq); 3504 for_each_std_worker_pool(pool, cpu) {
3624 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3505 mutex_lock(&pool->assoc_mutex);
3625 rebind_workers(gcwq); 3506 spin_lock_irq(&pool->lock);
3626 gcwq_release_assoc_and_unlock(gcwq); 3507
3508 pool->flags &= ~POOL_DISASSOCIATED;
3509 rebind_workers(pool);
3510
3511 spin_unlock_irq(&pool->lock);
3512 mutex_unlock(&pool->assoc_mutex);
3513 }
3627 break; 3514 break;
3628 } 3515 }
3629 return NOTIFY_OK; 3516 return NOTIFY_OK;
@@ -3643,7 +3530,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3643 switch (action & ~CPU_TASKS_FROZEN) { 3530 switch (action & ~CPU_TASKS_FROZEN) {
3644 case CPU_DOWN_PREPARE: 3531 case CPU_DOWN_PREPARE:
3645 /* unbinding should happen on the local CPU */ 3532 /* unbinding should happen on the local CPU */
3646 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); 3533 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3647 queue_work_on(cpu, system_highpri_wq, &unbind_work); 3534 queue_work_on(cpu, system_highpri_wq, &unbind_work);
3648 flush_work(&unbind_work); 3535 flush_work(&unbind_work);
3649 break; 3536 break;
@@ -3696,10 +3583,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3696 * 3583 *
3697 * Start freezing workqueues. After this function returns, all freezable 3584 * Start freezing workqueues. After this function returns, all freezable
3698 * workqueues will queue new works to their frozen_works list instead of 3585 * workqueues will queue new works to their frozen_works list instead of
3699 * gcwq->worklist. 3586 * pool->worklist.
3700 * 3587 *
3701 * CONTEXT: 3588 * CONTEXT:
3702 * Grabs and releases workqueue_lock and gcwq->lock's. 3589 * Grabs and releases workqueue_lock and pool->lock's.
3703 */ 3590 */
3704void freeze_workqueues_begin(void) 3591void freeze_workqueues_begin(void)
3705{ 3592{
@@ -3710,23 +3597,26 @@ void freeze_workqueues_begin(void)
3710 BUG_ON(workqueue_freezing); 3597 BUG_ON(workqueue_freezing);
3711 workqueue_freezing = true; 3598 workqueue_freezing = true;
3712 3599
3713 for_each_gcwq_cpu(cpu) { 3600 for_each_wq_cpu(cpu) {
3714 struct global_cwq *gcwq = get_gcwq(cpu); 3601 struct worker_pool *pool;
3715 struct workqueue_struct *wq; 3602 struct workqueue_struct *wq;
3716 3603
3717 spin_lock_irq(&gcwq->lock); 3604 for_each_std_worker_pool(pool, cpu) {
3605 spin_lock_irq(&pool->lock);
3718 3606
3719 BUG_ON(gcwq->flags & GCWQ_FREEZING); 3607 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3720 gcwq->flags |= GCWQ_FREEZING; 3608 pool->flags |= POOL_FREEZING;
3721 3609
3722 list_for_each_entry(wq, &workqueues, list) { 3610 list_for_each_entry(wq, &workqueues, list) {
3723 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3611 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3724 3612
3725 if (cwq && wq->flags & WQ_FREEZABLE) 3613 if (pwq && pwq->pool == pool &&
3726 cwq->max_active = 0; 3614 (wq->flags & WQ_FREEZABLE))
3727 } 3615 pwq->max_active = 0;
3616 }
3728 3617
3729 spin_unlock_irq(&gcwq->lock); 3618 spin_unlock_irq(&pool->lock);
3619 }
3730 } 3620 }
3731 3621
3732 spin_unlock(&workqueue_lock); 3622 spin_unlock(&workqueue_lock);
@@ -3754,20 +3644,20 @@ bool freeze_workqueues_busy(void)
3754 3644
3755 BUG_ON(!workqueue_freezing); 3645 BUG_ON(!workqueue_freezing);
3756 3646
3757 for_each_gcwq_cpu(cpu) { 3647 for_each_wq_cpu(cpu) {
3758 struct workqueue_struct *wq; 3648 struct workqueue_struct *wq;
3759 /* 3649 /*
3760 * nr_active is monotonically decreasing. It's safe 3650 * nr_active is monotonically decreasing. It's safe
3761 * to peek without lock. 3651 * to peek without lock.
3762 */ 3652 */
3763 list_for_each_entry(wq, &workqueues, list) { 3653 list_for_each_entry(wq, &workqueues, list) {
3764 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3654 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3765 3655
3766 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3656 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3767 continue; 3657 continue;
3768 3658
3769 BUG_ON(cwq->nr_active < 0); 3659 BUG_ON(pwq->nr_active < 0);
3770 if (cwq->nr_active) { 3660 if (pwq->nr_active) {
3771 busy = true; 3661 busy = true;
3772 goto out_unlock; 3662 goto out_unlock;
3773 } 3663 }
@@ -3782,10 +3672,10 @@ out_unlock:
3782 * thaw_workqueues - thaw workqueues 3672 * thaw_workqueues - thaw workqueues
3783 * 3673 *
3784 * Thaw workqueues. Normal queueing is restored and all collected 3674 * Thaw workqueues. Normal queueing is restored and all collected
3785 * frozen works are transferred to their respective gcwq worklists. 3675 * frozen works are transferred to their respective pool worklists.
3786 * 3676 *
3787 * CONTEXT: 3677 * CONTEXT:
3788 * Grabs and releases workqueue_lock and gcwq->lock's. 3678 * Grabs and releases workqueue_lock and pool->lock's.
3789 */ 3679 */
3790void thaw_workqueues(void) 3680void thaw_workqueues(void)
3791{ 3681{
@@ -3796,30 +3686,31 @@ void thaw_workqueues(void)
3796 if (!workqueue_freezing) 3686 if (!workqueue_freezing)
3797 goto out_unlock; 3687 goto out_unlock;
3798 3688
3799 for_each_gcwq_cpu(cpu) { 3689 for_each_wq_cpu(cpu) {
3800 struct global_cwq *gcwq = get_gcwq(cpu);
3801 struct worker_pool *pool; 3690 struct worker_pool *pool;
3802 struct workqueue_struct *wq; 3691 struct workqueue_struct *wq;
3803 3692
3804 spin_lock_irq(&gcwq->lock); 3693 for_each_std_worker_pool(pool, cpu) {
3694 spin_lock_irq(&pool->lock);
3805 3695
3806 BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); 3696 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
3807 gcwq->flags &= ~GCWQ_FREEZING; 3697 pool->flags &= ~POOL_FREEZING;
3808 3698
3809 list_for_each_entry(wq, &workqueues, list) { 3699 list_for_each_entry(wq, &workqueues, list) {
3810 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3700 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3811 3701
3812 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3702 if (!pwq || pwq->pool != pool ||
3813 continue; 3703 !(wq->flags & WQ_FREEZABLE))
3704 continue;
3814 3705
3815 /* restore max_active and repopulate worklist */ 3706 /* restore max_active and repopulate worklist */
3816 cwq_set_max_active(cwq, wq->saved_max_active); 3707 pwq_set_max_active(pwq, wq->saved_max_active);
3817 } 3708 }
3818 3709
3819 for_each_worker_pool(pool, gcwq)
3820 wake_up_worker(pool); 3710 wake_up_worker(pool);
3821 3711
3822 spin_unlock_irq(&gcwq->lock); 3712 spin_unlock_irq(&pool->lock);
3713 }
3823 } 3714 }
3824 3715
3825 workqueue_freezing = false; 3716 workqueue_freezing = false;
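
freeze_workqueues_begin(), freeze_workqueues_busy() and thaw_workqueues() keep their roles; they now just walk the standard pools of each CPU and toggle POOL_FREEZING instead of a gcwq flag. A simplified sketch of how a freezer loop might drive them, loosely modeled on the suspend path and not the actual kernel code:

#include <linux/delay.h>
#include <linux/workqueue.h>

static int demo_freeze_wqs(void)
{
        freeze_workqueues_begin();              /* new work goes to the delayed lists */

        while (freeze_workqueues_busy())        /* a freezable pwq still has active work */
                msleep(10);

        return 0;
}

static void demo_thaw_wqs(void)
{
        thaw_workqueues();                      /* restore max_active and wake the pools */
}
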
@@ -3831,60 +3722,56 @@ out_unlock:
3831static int __init init_workqueues(void) 3722static int __init init_workqueues(void)
3832{ 3723{
3833 unsigned int cpu; 3724 unsigned int cpu;
3834 int i;
3835 3725
3836 /* make sure we have enough bits for OFFQ CPU number */ 3726 /* make sure we have enough bits for OFFQ pool ID */
3837 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < 3727 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3838 WORK_CPU_LAST); 3728 WORK_CPU_END * NR_STD_WORKER_POOLS);
3839 3729
3840 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 3730 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3841 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 3731 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3842 3732
3843 /* initialize gcwqs */ 3733 /* initialize CPU pools */
3844 for_each_gcwq_cpu(cpu) { 3734 for_each_wq_cpu(cpu) {
3845 struct global_cwq *gcwq = get_gcwq(cpu);
3846 struct worker_pool *pool; 3735 struct worker_pool *pool;
3847 3736
3848 spin_lock_init(&gcwq->lock); 3737 for_each_std_worker_pool(pool, cpu) {
3849 gcwq->cpu = cpu; 3738 spin_lock_init(&pool->lock);
3850 gcwq->flags |= GCWQ_DISASSOCIATED; 3739 pool->cpu = cpu;
3851 3740 pool->flags |= POOL_DISASSOCIATED;
3852 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3853 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3854
3855 for_each_worker_pool(pool, gcwq) {
3856 pool->gcwq = gcwq;
3857 INIT_LIST_HEAD(&pool->worklist); 3741 INIT_LIST_HEAD(&pool->worklist);
3858 INIT_LIST_HEAD(&pool->idle_list); 3742 INIT_LIST_HEAD(&pool->idle_list);
3743 hash_init(pool->busy_hash);
3859 3744
3860 init_timer_deferrable(&pool->idle_timer); 3745 init_timer_deferrable(&pool->idle_timer);
3861 pool->idle_timer.function = idle_worker_timeout; 3746 pool->idle_timer.function = idle_worker_timeout;
3862 pool->idle_timer.data = (unsigned long)pool; 3747 pool->idle_timer.data = (unsigned long)pool;
3863 3748
3864 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, 3749 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3865 (unsigned long)pool); 3750 (unsigned long)pool);
3866 3751
3867 mutex_init(&pool->assoc_mutex); 3752 mutex_init(&pool->assoc_mutex);
3868 ida_init(&pool->worker_ida); 3753 ida_init(&pool->worker_ida);
3754
3755 /* alloc pool ID */
3756 BUG_ON(worker_pool_assign_id(pool));
3869 } 3757 }
3870 } 3758 }
3871 3759
3872 /* create the initial worker */ 3760 /* create the initial worker */
3873 for_each_online_gcwq_cpu(cpu) { 3761 for_each_online_wq_cpu(cpu) {
3874 struct global_cwq *gcwq = get_gcwq(cpu);
3875 struct worker_pool *pool; 3762 struct worker_pool *pool;
3876 3763
3877 if (cpu != WORK_CPU_UNBOUND) 3764 for_each_std_worker_pool(pool, cpu) {
3878 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3879
3880 for_each_worker_pool(pool, gcwq) {
3881 struct worker *worker; 3765 struct worker *worker;
3882 3766
3767 if (cpu != WORK_CPU_UNBOUND)
3768 pool->flags &= ~POOL_DISASSOCIATED;
3769
3883 worker = create_worker(pool); 3770 worker = create_worker(pool);
3884 BUG_ON(!worker); 3771 BUG_ON(!worker);
3885 spin_lock_irq(&gcwq->lock); 3772 spin_lock_irq(&pool->lock);
3886 start_worker(worker); 3773 start_worker(worker);
3887 spin_unlock_irq(&gcwq->lock); 3774 spin_unlock_irq(&pool->lock);
3888 } 3775 }
3889 } 3776 }
3890 3777
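
The BUILD_BUG_ON at the top of init_workqueues() above guarantees that the pool-ID field stored in work->data above WORK_OFFQ_POOL_SHIFT is wide enough to number every standard pool of every possible CPU. The same compile-time-check idiom in isolation (constants hypothetical, not the kernel's values):

#include <linux/bug.h>
#include <linux/kernel.h>

#define DEMO_ID_SHIFT   5       /* hypothetical: low 5 bits reserved for flags */
#define DEMO_MAX_IDS    1024    /* hypothetical: number of IDs that must fit */

static inline void demo_check_id_space(void)
{
        /* fails the build if the bits above the shift cannot hold DEMO_MAX_IDS values */
        BUILD_BUG_ON((1UL << (BITS_PER_LONG - DEMO_ID_SHIFT)) < DEMO_MAX_IDS);
}
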
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000000..07650264ec15
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
1/*
2 * kernel/workqueue_internal.h
3 *
4 * Workqueue internal header file. Only to be included by workqueue and
5 * core kernel subsystems.
6 */
7#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
8#define _KERNEL_WORKQUEUE_INTERNAL_H
9
10#include <linux/workqueue.h>
11#include <linux/kthread.h>
12
13struct worker_pool;
14
15/*
16 * The poor guys doing the actual heavy lifting. All on-duty workers are
17 * either serving the manager role, on idle list or on busy hash. For
18 * details on the locking annotation (L, I, X...), refer to workqueue.c.
19 *
20 * Only to be used in workqueue and async.
21 */
22struct worker {
23 /* on idle list while idle, on busy hash table while busy */
24 union {
25 struct list_head entry; /* L: while idle */
26 struct hlist_node hentry; /* L: while busy */
27 };
28
29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 struct list_head scheduled; /* L: scheduled works */
33 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */
39
40 /* for rebinding worker to CPU */
41 struct work_struct rebind_work; /* L: for busy worker */
42
43 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
45};
46
47/**
48 * current_wq_worker - return struct worker if %current is a workqueue worker
49 */
50static inline struct worker *current_wq_worker(void)
51{
52 if (current->flags & PF_WQ_WORKER)
53 return kthread_data(current);
54 return NULL;
55}
56
57/*
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c.
60 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task,
63 unsigned int cpu);
64
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
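
current_wq_worker() in the new header works because each worker task is created with its struct worker as the kthread data and is flagged PF_WQ_WORKER, so kthread_data() hands the pointer back. The same kthread_data() pattern on its own, with hypothetical names:

#include <linux/kthread.h>
#include <linux/sched.h>

struct demo_ctx {                       /* hypothetical per-thread context */
        int id;
};

static int demo_thread_fn(void *data)
{
        /* kthread_data(current) returns the pointer passed to kthread_run() below */
        struct demo_ctx *ctx = kthread_data(current);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        __set_current_state(TASK_RUNNING);
        return ctx->id;
}

static struct task_struct *demo_spawn(struct demo_ctx *ctx)
{
        return kthread_run(demo_thread_fn, ctx, "demo/%d", ctx->id);
}
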
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc79..000000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);