aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2012-08-21 05:27:00 -0400
committerIngo Molnar <mingo@kernel.org>2012-08-21 05:27:00 -0400
commitbcada3d4b8c96b8792c2306f363992ca5ab9da42 (patch)
treee420679a5db6ea4e1694eef57f9abb6acac8d4d3 /kernel
parent26198c21d1b286a084fe5d514a30bc7e6c712a34 (diff)
parent000078bc3ee69efb1124b8478c7527389a826074 (diff)
Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo: * Fix include order for bison/flex-generated C files, from Ben Hutchings * Build fixes and documentation corrections from David Ahern * Group parsing support, from Jiri Olsa * UI/gtk refactorings and improvements from Namhyung Kim * NULL deref fix for perf script, from Namhyung Kim * Assorted cleanups from Robert Richter * Let O= makes handle relative paths, from Steven Rostedt * perf script python fixes, from Feng Tang. * Improve 'perf lock' error message when the needed tracepoints are not present, from David Ahern. * Initial bash completion support, from Frederic Weisbecker * Allow building without libelf, from Namhyung Kim. * Support DWARF CFI based unwind to have callchains when %bp based unwinding is not possible, from Jiri Olsa. * Symbol resolution fixes, while fixing support PPC64 files with an .opt ELF section was the end goal, several fixes for code that handles all architectures and cleanups are included, from Cody Schafer. * Add a description for the JIT interface, from Andi Kleen. * Assorted fixes for Documentation and build in 32 bit, from Robert Richter * Add support for non-tracepoint events in perf script python, from Feng Tang * Cache the libtraceevent event_format associated to each evsel early, so that we avoid relookups, i.e. calling pevent_find_event repeatedly when processing tracepoint events. [ This is to reduce the surface contact with libtraceevents and make clear what is that the perf tools needs from that lib: so far parsing the common and per event fields. ] Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c76
-rw-r--r--kernel/audit.c51
-rw-r--r--kernel/audit_tree.c10
-rw-r--r--kernel/audit_watch.c25
-rw-r--r--kernel/cgroup.c53
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/cpuset.c130
-rw-r--r--kernel/debug/kdb/kdb_debugger.c4
-rw-r--r--kernel/debug/kdb/kdb_io.c11
-rw-r--r--kernel/debug/kdb/kdb_main.c15
-rw-r--r--kernel/events/callchain.c35
-rw-r--r--kernel/events/core.c244
-rw-r--r--kernel/events/internal.h85
-rw-r--r--kernel/events/ring_buffer.c10
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/fork.c48
-rw-r--r--kernel/futex.c17
-rw-r--r--kernel/irq/handle.c7
-rw-r--r--kernel/irq/irqdomain.c370
-rw-r--r--kernel/irq/manage.c38
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c37
-rw-r--r--kernel/kthread.c88
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/hibernate.c42
-rw-r--r--kernel/power/main.c45
-rw-r--r--kernel/power/power.h3
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/power/swap.c82
-rw-r--r--kernel/power/wakelock.c7
-rw-r--r--kernel/printk.c191
-rw-r--r--kernel/resource.c37
-rw-r--r--kernel/sched/core.c96
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/fair.c142
-rw-r--r--kernel/sched/sched.h23
-rw-r--r--kernel/signal.c15
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c57
-rw-r--r--kernel/sysctl.c69
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/task_work.c94
-rw-r--r--kernel/taskstats.c5
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/timekeeping.c376
-rw-r--r--kernel/trace/trace.c7
-rw-r--r--kernel/trace/trace_event_perf.c2
-rw-r--r--kernel/trace/trace_functions.c36
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/watchdog.c21
-rw-r--r--kernel/workqueue.c1144
55 files changed, 2399 insertions, 1516 deletions
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bbe..9d3118384858 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1;
62#define MAX_WORK 32768 62#define MAX_WORK 32768
63 63
64static LIST_HEAD(async_pending); 64static LIST_HEAD(async_pending);
65static LIST_HEAD(async_running); 65static ASYNC_DOMAIN(async_running);
66static LIST_HEAD(async_domains);
66static DEFINE_SPINLOCK(async_lock); 67static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
67 69
68struct async_entry { 70struct async_entry {
69 struct list_head list; 71 struct list_head list;
@@ -71,7 +73,7 @@ struct async_entry {
71 async_cookie_t cookie; 73 async_cookie_t cookie;
72 async_func_ptr *func; 74 async_func_ptr *func;
73 void *data; 75 void *data;
74 struct list_head *running; 76 struct async_domain *running;
75}; 77};
76 78
77static DECLARE_WAIT_QUEUE_HEAD(async_done); 79static DECLARE_WAIT_QUEUE_HEAD(async_done);
@@ -82,13 +84,12 @@ static atomic_t entry_count;
82/* 84/*
83 * MUST be called with the lock held! 85 * MUST be called with the lock held!
84 */ 86 */
85static async_cookie_t __lowest_in_progress(struct list_head *running) 87static async_cookie_t __lowest_in_progress(struct async_domain *running)
86{ 88{
87 struct async_entry *entry; 89 struct async_entry *entry;
88 90
89 if (!list_empty(running)) { 91 if (!list_empty(&running->domain)) {
90 entry = list_first_entry(running, 92 entry = list_first_entry(&running->domain, typeof(*entry), list);
91 struct async_entry, list);
92 return entry->cookie; 93 return entry->cookie;
93 } 94 }
94 95
@@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running)
99 return next_cookie; /* "infinity" value */ 100 return next_cookie; /* "infinity" value */
100} 101}
101 102
102static async_cookie_t lowest_in_progress(struct list_head *running) 103static async_cookie_t lowest_in_progress(struct async_domain *running)
103{ 104{
104 unsigned long flags; 105 unsigned long flags;
105 async_cookie_t ret; 106 async_cookie_t ret;
@@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
119 container_of(work, struct async_entry, work); 120 container_of(work, struct async_entry, work);
120 unsigned long flags; 121 unsigned long flags;
121 ktime_t uninitialized_var(calltime), delta, rettime; 122 ktime_t uninitialized_var(calltime), delta, rettime;
123 struct async_domain *running = entry->running;
122 124
123 /* 1) move self to the running queue */ 125 /* 1) move self to the running queue */
124 spin_lock_irqsave(&async_lock, flags); 126 spin_lock_irqsave(&async_lock, flags);
125 list_move_tail(&entry->list, entry->running); 127 list_move_tail(&entry->list, &running->domain);
126 spin_unlock_irqrestore(&async_lock, flags); 128 spin_unlock_irqrestore(&async_lock, flags);
127 129
128 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
@@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work)
145 /* 3) remove self from the running queue */ 147 /* 3) remove self from the running queue */
146 spin_lock_irqsave(&async_lock, flags); 148 spin_lock_irqsave(&async_lock, flags);
147 list_del(&entry->list); 149 list_del(&entry->list);
150 if (running->registered && --running->count == 0)
151 list_del_init(&running->node);
148 152
149 /* 4) free the entry */ 153 /* 4) free the entry */
150 kfree(entry); 154 kfree(entry);
@@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work)
156 wake_up(&async_done); 160 wake_up(&async_done);
157} 161}
158 162
159static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) 163static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
160{ 164{
161 struct async_entry *entry; 165 struct async_entry *entry;
162 unsigned long flags; 166 unsigned long flags;
@@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
187 spin_lock_irqsave(&async_lock, flags); 191 spin_lock_irqsave(&async_lock, flags);
188 newcookie = entry->cookie = next_cookie++; 192 newcookie = entry->cookie = next_cookie++;
189 list_add_tail(&entry->list, &async_pending); 193 list_add_tail(&entry->list, &async_pending);
194 if (running->registered && running->count++ == 0)
195 list_add_tail(&running->node, &async_domains);
190 atomic_inc(&entry_count); 196 atomic_inc(&entry_count);
191 spin_unlock_irqrestore(&async_lock, flags); 197 spin_unlock_irqrestore(&async_lock, flags);
192 198
@@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
223 * Note: This function may be called from atomic or non-atomic contexts. 229 * Note: This function may be called from atomic or non-atomic contexts.
224 */ 230 */
225async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 231async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
226 struct list_head *running) 232 struct async_domain *running)
227{ 233{
228 return __async_schedule(ptr, data, running); 234 return __async_schedule(ptr, data, running);
229} 235}
@@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
236 */ 242 */
237void async_synchronize_full(void) 243void async_synchronize_full(void)
238{ 244{
245 mutex_lock(&async_register_mutex);
239 do { 246 do {
240 async_synchronize_cookie(next_cookie); 247 struct async_domain *domain = NULL;
241 } while (!list_empty(&async_running) || !list_empty(&async_pending)); 248
249 spin_lock_irq(&async_lock);
250 if (!list_empty(&async_domains))
251 domain = list_first_entry(&async_domains, typeof(*domain), node);
252 spin_unlock_irq(&async_lock);
253
254 async_synchronize_cookie_domain(next_cookie, domain);
255 } while (!list_empty(&async_domains));
256 mutex_unlock(&async_register_mutex);
242} 257}
243EXPORT_SYMBOL_GPL(async_synchronize_full); 258EXPORT_SYMBOL_GPL(async_synchronize_full);
244 259
245/** 260/**
261 * async_unregister_domain - ensure no more anonymous waiters on this domain
262 * @domain: idle domain to flush out of any async_synchronize_full instances
263 *
264 * async_synchronize_{cookie|full}_domain() are not flushed since callers
265 * of these routines should know the lifetime of @domain
266 *
267 * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
268 */
269void async_unregister_domain(struct async_domain *domain)
270{
271 mutex_lock(&async_register_mutex);
272 spin_lock_irq(&async_lock);
273 WARN_ON(!domain->registered || !list_empty(&domain->node) ||
274 !list_empty(&domain->domain));
275 domain->registered = 0;
276 spin_unlock_irq(&async_lock);
277 mutex_unlock(&async_register_mutex);
278}
279EXPORT_SYMBOL_GPL(async_unregister_domain);
280
281/**
246 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 282 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
247 * @list: running list to synchronize on 283 * @domain: running list to synchronize on
248 * 284 *
249 * This function waits until all asynchronous function calls for the 285 * This function waits until all asynchronous function calls for the
250 * synchronization domain specified by the running list @list have been done. 286 * synchronization domain specified by the running list @domain have been done.
251 */ 287 */
252void async_synchronize_full_domain(struct list_head *list) 288void async_synchronize_full_domain(struct async_domain *domain)
253{ 289{
254 async_synchronize_cookie_domain(next_cookie, list); 290 async_synchronize_cookie_domain(next_cookie, domain);
255} 291}
256EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 292EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
257 293
@@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
261 * @running: running list to synchronize on 297 * @running: running list to synchronize on
262 * 298 *
263 * This function waits until all asynchronous function calls for the 299 * This function waits until all asynchronous function calls for the
264 * synchronization domain specified by the running list @list submitted 300 * synchronization domain specified by running list @running submitted
265 * prior to @cookie have been done. 301 * prior to @cookie have been done.
266 */ 302 */
267void async_synchronize_cookie_domain(async_cookie_t cookie, 303void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
268 struct list_head *running)
269{ 304{
270 ktime_t uninitialized_var(starttime), delta, endtime; 305 ktime_t uninitialized_var(starttime), delta, endtime;
271 306
307 if (!running)
308 return;
309
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 310 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 311 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 312 starttime = ktime_get();
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c61416b..ea3b7b6191c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
384static void audit_printk_skb(struct sk_buff *skb) 384static void audit_printk_skb(struct sk_buff *skb)
385{ 385{
386 struct nlmsghdr *nlh = nlmsg_hdr(skb); 386 struct nlmsghdr *nlh = nlmsg_hdr(skb);
387 char *data = NLMSG_DATA(nlh); 387 char *data = nlmsg_data(nlh);
388 388
389 if (nlh->nlmsg_type != AUDIT_EOE) { 389 if (nlh->nlmsg_type != AUDIT_EOE) {
390 if (printk_ratelimit()) 390 if (printk_ratelimit())
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
516 if (!skb) 516 if (!skb)
517 return NULL; 517 return NULL;
518 518
519 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); 519 nlh = nlmsg_put(skb, pid, seq, t, size, flags);
520 data = NLMSG_DATA(nlh); 520 if (!nlh)
521 goto out_kfree_skb;
522 data = nlmsg_data(nlh);
521 memcpy(data, payload, size); 523 memcpy(data, payload, size);
522 return skb; 524 return skb;
523 525
524nlmsg_failure: /* Used by NLMSG_NEW */ 526out_kfree_skb:
525 if (skb) 527 kfree_skb(skb);
526 kfree_skb(skb);
527 return NULL; 528 return NULL;
528} 529}
529 530
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
680 sessionid = audit_get_sessionid(current); 681 sessionid = audit_get_sessionid(current);
681 security_task_getsecid(current, &sid); 682 security_task_getsecid(current, &sid);
682 seq = nlh->nlmsg_seq; 683 seq = nlh->nlmsg_seq;
683 data = NLMSG_DATA(nlh); 684 data = nlmsg_data(nlh);
684 685
685 switch (msg_type) { 686 switch (msg_type) {
686 case AUDIT_GET: 687 case AUDIT_GET:
@@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb)
961static int __init audit_init(void) 962static int __init audit_init(void)
962{ 963{
963 int i; 964 int i;
965 struct netlink_kernel_cfg cfg = {
966 .input = audit_receive,
967 };
964 968
965 if (audit_initialized == AUDIT_DISABLED) 969 if (audit_initialized == AUDIT_DISABLED)
966 return 0; 970 return 0;
967 971
968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 972 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
969 audit_default ? "enabled" : "disabled"); 973 audit_default ? "enabled" : "disabled");
970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, 974 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
971 audit_receive, NULL, THIS_MODULE); 975 THIS_MODULE, &cfg);
972 if (!audit_sock) 976 if (!audit_sock)
973 audit_panic("cannot initialize netlink socket"); 977 audit_panic("cannot initialize netlink socket");
974 else 978 else
@@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1060 1064
1061 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); 1065 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1062 if (!ab->skb) 1066 if (!ab->skb)
1063 goto nlmsg_failure; 1067 goto err;
1064 1068
1065 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); 1069 nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
1070 if (!nlh)
1071 goto out_kfree_skb;
1066 1072
1067 return ab; 1073 return ab;
1068 1074
1069nlmsg_failure: /* Used by NLMSG_NEW */ 1075out_kfree_skb:
1070 kfree_skb(ab->skb); 1076 kfree_skb(ab->skb);
1071 ab->skb = NULL; 1077 ab->skb = NULL;
1072err: 1078err:
@@ -1450,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key)
1450} 1456}
1451 1457
1452/** 1458/**
1459 * audit_log_link_denied - report a link restriction denial
1460 * @operation: specific link opreation
1461 * @link: the path that triggered the restriction
1462 */
1463void audit_log_link_denied(const char *operation, struct path *link)
1464{
1465 struct audit_buffer *ab;
1466
1467 ab = audit_log_start(current->audit_context, GFP_KERNEL,
1468 AUDIT_ANOM_LINK);
1469 audit_log_format(ab, "op=%s action=denied", operation);
1470 audit_log_format(ab, " pid=%d comm=", current->pid);
1471 audit_log_untrustedstring(ab, current->comm);
1472 audit_log_d_path(ab, " path=", link);
1473 audit_log_format(ab, " dev=");
1474 audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
1475 audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
1476 audit_log_end(ab);
1477}
1478
1479/**
1453 * audit_log_end - end one audit record 1480 * audit_log_end - end one audit record
1454 * @ab: the audit_buffer 1481 * @ab: the audit_buffer
1455 * 1482 *
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e7..3a5ca582ba1e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -595,7 +595,7 @@ void audit_trim_trees(void)
595 595
596 root_mnt = collect_mounts(&path); 596 root_mnt = collect_mounts(&path);
597 path_put(&path); 597 path_put(&path);
598 if (!root_mnt) 598 if (IS_ERR(root_mnt))
599 goto skip_it; 599 goto skip_it;
600 600
601 spin_lock(&hash_lock); 601 spin_lock(&hash_lock);
@@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
669 goto Err; 669 goto Err;
670 mnt = collect_mounts(&path); 670 mnt = collect_mounts(&path);
671 path_put(&path); 671 path_put(&path);
672 if (!mnt) { 672 if (IS_ERR(mnt)) {
673 err = -ENOMEM; 673 err = PTR_ERR(mnt);
674 goto Err; 674 goto Err;
675 } 675 }
676 676
@@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
719 return err; 719 return err;
720 tagged = collect_mounts(&path2); 720 tagged = collect_mounts(&path2);
721 path_put(&path2); 721 path_put(&path2);
722 if (!tagged) 722 if (IS_ERR(tagged))
723 return -ENOMEM; 723 return PTR_ERR(tagged);
724 724
725 err = kern_path(old, 0, &path1); 725 err = kern_path(old, 0, &path1);
726 if (err) { 726 if (err) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d9..3823281401b5 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(struct audit_watch *watch, struct path *parent) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata nd; 358 struct dentry *d = kern_path_locked(watch->path, parent);
359 struct dentry *d; 359 if (IS_ERR(d))
360 int err;
361
362 err = kern_path_parent(watch->path, &nd);
363 if (err)
364 return err;
365
366 if (nd.last_type != LAST_NORM) {
367 path_put(&nd.path);
368 return -EINVAL;
369 }
370
371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 if (IS_ERR(d)) {
374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 path_put(&nd.path);
376 return PTR_ERR(d); 360 return PTR_ERR(d);
377 } 361 mutex_unlock(&parent->dentry->d_inode->i_mutex);
378 if (d->d_inode) { 362 if (d->d_inode) {
379 /* update watch filter fields */ 363 /* update watch filter fields */
380 watch->dev = d->d_inode->i_sb->s_dev; 364 watch->dev = d->d_inode->i_sb->s_dev;
381 watch->ino = d->d_inode->i_ino; 365 watch->ino = d->d_inode->i_ino;
382 } 366 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
384
385 *parent = nd.path;
386 dput(d); 367 dput(d);
387 return 0; 368 return 0;
388} 369}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b303dfc7dce0..79818507e444 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
822 */ 822 */
823 823
824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
827static int cgroup_populate_dir(struct cgroup *cgrp); 827static int cgroup_populate_dir(struct cgroup *cgrp);
828static const struct inode_operations cgroup_dir_inode_operations; 828static const struct inode_operations cgroup_dir_inode_operations;
@@ -954,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
954 954
955 dget(d); 955 dget(d);
956 d_delete(d); 956 d_delete(d);
957 simple_unlink(d->d_inode, d); 957 simple_unlink(cgrp->dentry->d_inode, d);
958 list_del_init(&cfe->node); 958 list_del_init(&cfe->node);
959 dput(d); 959 dput(d);
960 960
@@ -1068,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1068 BUG_ON(cgrp->subsys[i]); 1068 BUG_ON(cgrp->subsys[i]);
1069 BUG_ON(!dummytop->subsys[i]); 1069 BUG_ON(!dummytop->subsys[i]);
1070 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 1070 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
1071 mutex_lock(&ss->hierarchy_mutex);
1072 cgrp->subsys[i] = dummytop->subsys[i]; 1071 cgrp->subsys[i] = dummytop->subsys[i];
1073 cgrp->subsys[i]->cgroup = cgrp; 1072 cgrp->subsys[i]->cgroup = cgrp;
1074 list_move(&ss->sibling, &root->subsys_list); 1073 list_move(&ss->sibling, &root->subsys_list);
1075 ss->root = root; 1074 ss->root = root;
1076 if (ss->bind) 1075 if (ss->bind)
1077 ss->bind(cgrp); 1076 ss->bind(cgrp);
1078 mutex_unlock(&ss->hierarchy_mutex);
1079 /* refcount was already taken, and we're keeping it */ 1077 /* refcount was already taken, and we're keeping it */
1080 } else if (bit & removed_bits) { 1078 } else if (bit & removed_bits) {
1081 /* We're removing this subsystem */ 1079 /* We're removing this subsystem */
1082 BUG_ON(ss == NULL); 1080 BUG_ON(ss == NULL);
1083 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1081 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1084 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1082 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1085 mutex_lock(&ss->hierarchy_mutex);
1086 if (ss->bind) 1083 if (ss->bind)
1087 ss->bind(dummytop); 1084 ss->bind(dummytop);
1088 dummytop->subsys[i]->cgroup = dummytop; 1085 dummytop->subsys[i]->cgroup = dummytop;
1089 cgrp->subsys[i] = NULL; 1086 cgrp->subsys[i] = NULL;
1090 subsys[i]->root = &rootnode; 1087 subsys[i]->root = &rootnode;
1091 list_move(&ss->sibling, &rootnode.subsys_list); 1088 list_move(&ss->sibling, &rootnode.subsys_list);
1092 mutex_unlock(&ss->hierarchy_mutex);
1093 /* subsystem is now free - drop reference on module */ 1089 /* subsystem is now free - drop reference on module */
1094 module_put(ss->module); 1090 module_put(ss->module);
1095 } else if (bit & final_bits) { 1091 } else if (bit & final_bits) {
@@ -1587,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1587 opts.new_root = new_root; 1583 opts.new_root = new_root;
1588 1584
1589 /* Locate an existing or new sb for this hierarchy */ 1585 /* Locate an existing or new sb for this hierarchy */
1590 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); 1586 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1591 if (IS_ERR(sb)) { 1587 if (IS_ERR(sb)) {
1592 ret = PTR_ERR(sb); 1588 ret = PTR_ERR(sb);
1593 cgroup_drop_root(opts.new_root); 1589 cgroup_drop_root(opts.new_root);
@@ -2570,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2570 .rename = cgroup_rename, 2566 .rename = cgroup_rename,
2571}; 2567};
2572 2568
2573static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2574{ 2570{
2575 if (dentry->d_name.len > NAME_MAX) 2571 if (dentry->d_name.len > NAME_MAX)
2576 return ERR_PTR(-ENAMETOOLONG); 2572 return ERR_PTR(-ENAMETOOLONG);
@@ -3915,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3915 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 3911 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3916} 3912}
3917 3913
3918static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
3919{
3920 /* We need to take each hierarchy_mutex in a consistent order */
3921 int i;
3922
3923 /*
3924 * No worry about a race with rebind_subsystems that might mess up the
3925 * locking order, since both parties are under cgroup_mutex.
3926 */
3927 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3928 struct cgroup_subsys *ss = subsys[i];
3929 if (ss == NULL)
3930 continue;
3931 if (ss->root == root)
3932 mutex_lock(&ss->hierarchy_mutex);
3933 }
3934}
3935
3936static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3937{
3938 int i;
3939
3940 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3941 struct cgroup_subsys *ss = subsys[i];
3942 if (ss == NULL)
3943 continue;
3944 if (ss->root == root)
3945 mutex_unlock(&ss->hierarchy_mutex);
3946 }
3947}
3948
3949/* 3914/*
3950 * cgroup_create - create a cgroup 3915 * cgroup_create - create a cgroup
3951 * @parent: cgroup that will be parent of the new cgroup 3916 * @parent: cgroup that will be parent of the new cgroup
@@ -4006,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4006 ss->post_clone(cgrp); 3971 ss->post_clone(cgrp);
4007 } 3972 }
4008 3973
4009 cgroup_lock_hierarchy(root);
4010 list_add(&cgrp->sibling, &cgrp->parent->children); 3974 list_add(&cgrp->sibling, &cgrp->parent->children);
4011 cgroup_unlock_hierarchy(root);
4012 root->number_of_cgroups++; 3975 root->number_of_cgroups++;
4013 3976
4014 err = cgroup_create_dir(cgrp, dentry, mode); 3977 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -4035,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4035 3998
4036 err_remove: 3999 err_remove:
4037 4000
4038 cgroup_lock_hierarchy(root);
4039 list_del(&cgrp->sibling); 4001 list_del(&cgrp->sibling);
4040 cgroup_unlock_hierarchy(root);
4041 root->number_of_cgroups--; 4002 root->number_of_cgroups--;
4042 4003
4043 err_destroy: 4004 err_destroy:
@@ -4245,10 +4206,8 @@ again:
4245 list_del_init(&cgrp->release_list); 4206 list_del_init(&cgrp->release_list);
4246 raw_spin_unlock(&release_list_lock); 4207 raw_spin_unlock(&release_list_lock);
4247 4208
4248 cgroup_lock_hierarchy(cgrp->root);
4249 /* delete this cgroup from parent->children */ 4209 /* delete this cgroup from parent->children */
4250 list_del_init(&cgrp->sibling); 4210 list_del_init(&cgrp->sibling);
4251 cgroup_unlock_hierarchy(cgrp->root);
4252 4211
4253 list_del_init(&cgrp->allcg_node); 4212 list_del_init(&cgrp->allcg_node);
4254 4213
@@ -4322,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4322 * need to invoke fork callbacks here. */ 4281 * need to invoke fork callbacks here. */
4323 BUG_ON(!list_empty(&init_task.tasks)); 4282 BUG_ON(!list_empty(&init_task.tasks));
4324 4283
4325 mutex_init(&ss->hierarchy_mutex);
4326 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4327 ss->active = 1; 4284 ss->active = 1;
4328 4285
4329 /* this function shouldn't be used with modular subsystems, since they 4286 /* this function shouldn't be used with modular subsystems, since they
@@ -4450,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4450 } 4407 }
4451 write_unlock(&css_set_lock); 4408 write_unlock(&css_set_lock);
4452 4409
4453 mutex_init(&ss->hierarchy_mutex);
4454 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4455 ss->active = 1; 4410 ss->active = 1;
4456 4411
4457 /* success! */ 4412 /* success! */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
416 416
417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
418 mutex_lock(&zonelists_mutex); 418 mutex_lock(&zonelists_mutex);
419 build_all_zonelists(NULL); 419 build_all_zonelists(NULL, NULL);
420 mutex_unlock(&zonelists_mutex); 420 mutex_unlock(&zonelists_mutex);
421 } 421 }
422#endif 422#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 147 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 148} cpuset_flagbits_t;
149 149
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
150/* convenient tests for these bits */ 156/* convenient tests for these bits */
151static inline int is_cpu_exclusive(const struct cpuset *cs) 157static inline int is_cpu_exclusive(const struct cpuset *cs)
152{ 158{
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1990} 1996}
1991 1997
1992/* 1998/*
1993 * Walk the specified cpuset subtree and look for empty cpusets. 1999 * Helper function to traverse cpusets.
1994 * The tasks of such cpuset must be moved to a parent cpuset. 2000 * It can be used to walk the cpuset tree from top to bottom, completing
2001 * one layer before dropping down to the next (thus always processing a
2002 * node before any of its children).
2003 */
2004static struct cpuset *cpuset_next(struct list_head *queue)
2005{
2006 struct cpuset *cp;
2007 struct cpuset *child; /* scans child cpusets of cp */
2008 struct cgroup *cont;
2009
2010 if (list_empty(queue))
2011 return NULL;
2012
2013 cp = list_first_entry(queue, struct cpuset, stack_list);
2014 list_del(queue->next);
2015 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2016 child = cgroup_cs(cont);
2017 list_add_tail(&child->stack_list, queue);
2018 }
2019
2020 return cp;
2021}
2022
2023
2024/*
2025 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
2026 * online/offline) and update the cpusets accordingly.
2027 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
2028 * cpuset must be moved to a parent cpuset.
1995 * 2029 *
1996 * Called with cgroup_mutex held. We take callback_mutex to modify 2030 * Called with cgroup_mutex held. We take callback_mutex to modify
1997 * cpus_allowed and mems_allowed. 2031 * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2000 * before dropping down to the next. It always processes a node before 2034 * before dropping down to the next. It always processes a node before
2001 * any of its children. 2035 * any of its children.
2002 * 2036 *
2003 * For now, since we lack memory hot unplug, we'll never see a cpuset 2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
2004 * that has tasks along with an empty 'mems'. But if we did see such 2038 * if all present pages from a node are offlined.
2005 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
2006 */ 2039 */
2007static void scan_for_empty_cpusets(struct cpuset *root) 2040static void
2041scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2008{ 2042{
2009 LIST_HEAD(queue); 2043 LIST_HEAD(queue);
2010 struct cpuset *cp; /* scans cpusets being updated */ 2044 struct cpuset *cp; /* scans cpusets being updated */
2011 struct cpuset *child; /* scans child cpusets of cp */
2012 struct cgroup *cont;
2013 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2045 static nodemask_t oldmems; /* protected by cgroup_mutex */
2014 2046
2015 list_add_tail((struct list_head *)&root->stack_list, &queue); 2047 list_add_tail((struct list_head *)&root->stack_list, &queue);
2016 2048
2017 while (!list_empty(&queue)) { 2049 switch (event) {
2018 cp = list_first_entry(&queue, struct cpuset, stack_list); 2050 case CPUSET_CPU_OFFLINE:
2019 list_del(queue.next); 2051 while ((cp = cpuset_next(&queue)) != NULL) {
2020 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2052
2021 child = cgroup_cs(cont); 2053 /* Continue past cpusets with all cpus online */
2022 list_add_tail(&child->stack_list, &queue); 2054 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
2055 continue;
2056
2057 /* Remove offline cpus from this cpuset. */
2058 mutex_lock(&callback_mutex);
2059 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2060 cpu_active_mask);
2061 mutex_unlock(&callback_mutex);
2062
2063 /* Move tasks from the empty cpuset to a parent */
2064 if (cpumask_empty(cp->cpus_allowed))
2065 remove_tasks_in_empty_cpuset(cp);
2066 else
2067 update_tasks_cpumask(cp, NULL);
2023 } 2068 }
2069 break;
2024 2070
2025 /* Continue past cpusets with all cpus, mems online */ 2071 case CPUSET_MEM_OFFLINE:
2026 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && 2072 while ((cp = cpuset_next(&queue)) != NULL) {
2027 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2028 continue;
2029 2073
2030 oldmems = cp->mems_allowed; 2074 /* Continue past cpusets with all mems online */
2075 if (nodes_subset(cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY]))
2077 continue;
2031 2078
2032 /* Remove offline cpus and mems from this cpuset. */ 2079 oldmems = cp->mems_allowed;
2033 mutex_lock(&callback_mutex); 2080
2034 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2081 /* Remove offline mems from this cpuset. */
2035 cpu_active_mask); 2082 mutex_lock(&callback_mutex);
2036 nodes_and(cp->mems_allowed, cp->mems_allowed, 2083 nodes_and(cp->mems_allowed, cp->mems_allowed,
2037 node_states[N_HIGH_MEMORY]); 2084 node_states[N_HIGH_MEMORY]);
2038 mutex_unlock(&callback_mutex); 2085 mutex_unlock(&callback_mutex);
2039 2086
2040 /* Move tasks from the empty cpuset to a parent */ 2087 /* Move tasks from the empty cpuset to a parent */
2041 if (cpumask_empty(cp->cpus_allowed) || 2088 if (nodes_empty(cp->mems_allowed))
2042 nodes_empty(cp->mems_allowed)) 2089 remove_tasks_in_empty_cpuset(cp);
2043 remove_tasks_in_empty_cpuset(cp); 2090 else
2044 else { 2091 update_tasks_nodemask(cp, &oldmems, NULL);
2045 update_tasks_cpumask(cp, NULL);
2046 update_tasks_nodemask(cp, &oldmems, NULL);
2047 } 2092 }
2048 } 2093 }
2049} 2094}
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2054 * (of no affect) on systems that are actively using CPU hotplug 2099 * (of no affect) on systems that are actively using CPU hotplug
2055 * but making no active use of cpusets. 2100 * but making no active use of cpusets.
2056 * 2101 *
2102 * The only exception to this is suspend/resume, where we don't
2103 * modify cpusets at all.
2104 *
2057 * This routine ensures that top_cpuset.cpus_allowed tracks 2105 * This routine ensures that top_cpuset.cpus_allowed tracks
2058 * cpu_active_mask on each CPU hotplug (cpuhp) event. 2106 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2059 * 2107 *
2060 * Called within get_online_cpus(). Needs to call cgroup_lock() 2108 * Called within get_online_cpus(). Needs to call cgroup_lock()
2061 * before calling generate_sched_domains(). 2109 * before calling generate_sched_domains().
2110 *
2111 * @cpu_online: Indicates whether this is a CPU online event (true) or
2112 * a CPU offline event (false).
2062 */ 2113 */
2063void cpuset_update_active_cpus(void) 2114void cpuset_update_active_cpus(bool cpu_online)
2064{ 2115{
2065 struct sched_domain_attr *attr; 2116 struct sched_domain_attr *attr;
2066 cpumask_var_t *doms; 2117 cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
2070 mutex_lock(&callback_mutex); 2121 mutex_lock(&callback_mutex);
2071 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2122 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2072 mutex_unlock(&callback_mutex); 2123 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2124
2125 if (!cpu_online)
2126 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
2127
2074 ndoms = generate_sched_domains(&doms, &attr); 2128 ndoms = generate_sched_domains(&doms, &attr);
2075 cgroup_unlock(); 2129 cgroup_unlock();
2076 2130
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
2082/* 2136/*
2083 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
2084 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2085 * See also the previous routine cpuset_track_online_cpus(). 2139 * See cpuset_update_active_cpus() for CPU hotplug handling.
2086 */ 2140 */
2087static int cpuset_track_online_nodes(struct notifier_block *self, 2141static int cpuset_track_online_nodes(struct notifier_block *self,
2088 unsigned long action, void *arg) 2142 unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2101 case MEM_OFFLINE: 2155 case MEM_OFFLINE:
2102 /* 2156 /*
2103 * needn't update top_cpuset.mems_allowed explicitly because 2157 * needn't update top_cpuset.mems_allowed explicitly because
2104 * scan_for_empty_cpusets() will update it. 2158 * scan_cpusets_upon_hotplug() will update it.
2105 */ 2159 */
2106 scan_for_empty_cpusets(&top_cpuset); 2160 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2107 break; 2161 break;
2108 default: 2162 default:
2109 break; 2163 break;
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff17..be7b33b73d30 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/hardirq.h>
15#include "kdb_private.h" 16#include "kdb_private.h"
16#include "../debug_core.h" 17#include "../debug_core.h"
17 18
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks)
52 if (atomic_read(&kgdb_setting_breakpoint)) 53 if (atomic_read(&kgdb_setting_breakpoint))
53 reason = KDB_REASON_KEYBOARD; 54 reason = KDB_REASON_KEYBOARD;
54 55
56 if (in_nmi())
57 reason = KDB_REASON_NMI;
58
55 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { 59 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
56 if ((bp->bp_enabled) && (bp->bp_addr == addr)) { 60 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
57 reason = KDB_REASON_BREAK; 61 reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..0a69d2adc4f3 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit:
715 /* check for having reached the LINES number of printed lines */ 715 /* check for having reached the LINES number of printed lines */
716 if (kdb_nextline == linecount) { 716 if (kdb_nextline == linecount) {
717 char buf1[16] = ""; 717 char buf1[16] = "";
718#if defined(CONFIG_SMP)
719 char buf2[32];
720#endif
721 718
722 /* Watch out for recursion here. Any routine that calls 719 /* Watch out for recursion here. Any routine that calls
723 * kdb_printf will come back through here. And kdb_read 720 * kdb_printf will come back through here. And kdb_read
@@ -732,14 +729,6 @@ kdb_printit:
732 if (moreprompt == NULL) 729 if (moreprompt == NULL)
733 moreprompt = "more> "; 730 moreprompt = "more> ";
734 731
735#if defined(CONFIG_SMP)
736 if (strchr(moreprompt, '%')) {
737 sprintf(buf2, moreprompt, get_cpu());
738 put_cpu();
739 moreprompt = buf2;
740 }
741#endif
742
743 kdb_input_flush(); 732 kdb_input_flush();
744 c = console_drivers; 733 c = console_drivers;
745 734
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1f91413edb87..31df1706b9a9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
139static char *__env[] = { 139static char *__env[] = {
140#if defined(CONFIG_SMP) 140#if defined(CONFIG_SMP)
141 "PROMPT=[%d]kdb> ", 141 "PROMPT=[%d]kdb> ",
142 "MOREPROMPT=[%d]more> ",
143#else 142#else
144 "PROMPT=kdb> ", 143 "PROMPT=kdb> ",
145 "MOREPROMPT=more> ",
146#endif 144#endif
145 "MOREPROMPT=more> ",
147 "RADIX=16", 146 "RADIX=16",
148 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
@@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1236 *cmdbuf = '\0'; 1235 *cmdbuf = '\0';
1237 *(cmd_hist[cmd_head]) = '\0'; 1236 *(cmd_hist[cmd_head]) = '\0';
1238 1237
1239 if (KDB_FLAG(ONLY_DO_DUMP)) {
1240 /* kdb is off but a catastrophic error requires a dump.
1241 * Take the dump and reboot.
1242 * Turn on logging so the kdb output appears in the log
1243 * buffer in the dump.
1244 */
1245 const char *setargs[] = { "set", "LOGGING", "1" };
1246 kdb_set(2, setargs);
1247 kdb_reboot(0, NULL);
1248 /*NOTREACHED*/
1249 }
1250
1251do_full_getstr: 1238do_full_getstr:
1252#if defined(CONFIG_SMP) 1239#if defined(CONFIG_SMP)
1253 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), 1240 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f399..c77206184b8b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,11 +153,17 @@ put_callchain_entry(int rctx)
153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx); 153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
154} 154}
155 155
156struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) 156struct perf_callchain_entry *
157perf_callchain(struct perf_event *event, struct pt_regs *regs)
157{ 158{
158 int rctx; 159 int rctx;
159 struct perf_callchain_entry *entry; 160 struct perf_callchain_entry *entry;
160 161
162 int kernel = !event->attr.exclude_callchain_kernel;
163 int user = !event->attr.exclude_callchain_user;
164
165 if (!kernel && !user)
166 return NULL;
161 167
162 entry = get_callchain_entry(&rctx); 168 entry = get_callchain_entry(&rctx);
163 if (rctx == -1) 169 if (rctx == -1)
@@ -168,18 +174,29 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
168 174
169 entry->nr = 0; 175 entry->nr = 0;
170 176
171 if (!user_mode(regs)) { 177 if (kernel && !user_mode(regs)) {
172 perf_callchain_store(entry, PERF_CONTEXT_KERNEL); 178 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
173 perf_callchain_kernel(entry, regs); 179 perf_callchain_kernel(entry, regs);
174 if (current->mm)
175 regs = task_pt_regs(current);
176 else
177 regs = NULL;
178 } 180 }
179 181
180 if (regs) { 182 if (user) {
181 perf_callchain_store(entry, PERF_CONTEXT_USER); 183 if (!user_mode(regs)) {
182 perf_callchain_user(entry, regs); 184 if (current->mm)
185 regs = task_pt_regs(current);
186 else
187 regs = NULL;
188 }
189
190 if (regs) {
191 /*
192 * Disallow cross-task user callchains.
193 */
194 if (event->ctx->task && event->ctx->task != current)
195 goto exit_put;
196
197 perf_callchain_store(entry, PERF_CONTEXT_USER);
198 perf_callchain_user(entry, regs);
199 }
183 } 200 }
184 201
185exit_put: 202exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f1cf0edeb39a..2ba890450d15 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,7 @@
36#include <linux/perf_event.h> 36#include <linux/perf_event.h>
37#include <linux/ftrace_event.h> 37#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 38#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h>
39 40
40#include "internal.h" 41#include "internal.h"
41 42
@@ -3756,6 +3757,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3756} 3757}
3757EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3758EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3758 3759
3760static void
3761perf_output_sample_regs(struct perf_output_handle *handle,
3762 struct pt_regs *regs, u64 mask)
3763{
3764 int bit;
3765
3766 for_each_set_bit(bit, (const unsigned long *) &mask,
3767 sizeof(mask) * BITS_PER_BYTE) {
3768 u64 val;
3769
3770 val = perf_reg_value(regs, bit);
3771 perf_output_put(handle, val);
3772 }
3773}
3774
3775static void perf_sample_regs_user(struct perf_regs_user *regs_user,
3776 struct pt_regs *regs)
3777{
3778 if (!user_mode(regs)) {
3779 if (current->mm)
3780 regs = task_pt_regs(current);
3781 else
3782 regs = NULL;
3783 }
3784
3785 if (regs) {
3786 regs_user->regs = regs;
3787 regs_user->abi = perf_reg_abi(current);
3788 }
3789}
3790
3791/*
3792 * Get remaining task size from user stack pointer.
3793 *
3794 * It'd be better to take stack vma map and limit this more
3795 * precisly, but there's no way to get it safely under interrupt,
3796 * so using TASK_SIZE as limit.
3797 */
3798static u64 perf_ustack_task_size(struct pt_regs *regs)
3799{
3800 unsigned long addr = perf_user_stack_pointer(regs);
3801
3802 if (!addr || addr >= TASK_SIZE)
3803 return 0;
3804
3805 return TASK_SIZE - addr;
3806}
3807
3808static u16
3809perf_sample_ustack_size(u16 stack_size, u16 header_size,
3810 struct pt_regs *regs)
3811{
3812 u64 task_size;
3813
3814 /* No regs, no stack pointer, no dump. */
3815 if (!regs)
3816 return 0;
3817
3818 /*
3819 * Check if we fit in with the requested stack size into the:
3820 * - TASK_SIZE
3821 * If we don't, we limit the size to the TASK_SIZE.
3822 *
3823 * - remaining sample size
3824 * If we don't, we customize the stack size to
3825 * fit in to the remaining sample size.
3826 */
3827
3828 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
3829 stack_size = min(stack_size, (u16) task_size);
3830
3831 /* Current header size plus static size and dynamic size. */
3832 header_size += 2 * sizeof(u64);
3833
3834 /* Do we fit in with the current stack dump size? */
3835 if ((u16) (header_size + stack_size) < header_size) {
3836 /*
3837 * If we overflow the maximum size for the sample,
3838 * we customize the stack dump size to fit in.
3839 */
3840 stack_size = USHRT_MAX - header_size - sizeof(u64);
3841 stack_size = round_up(stack_size, sizeof(u64));
3842 }
3843
3844 return stack_size;
3845}
3846
3847static void
3848perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
3849 struct pt_regs *regs)
3850{
3851 /* Case of a kernel thread, nothing to dump */
3852 if (!regs) {
3853 u64 size = 0;
3854 perf_output_put(handle, size);
3855 } else {
3856 unsigned long sp;
3857 unsigned int rem;
3858 u64 dyn_size;
3859
3860 /*
3861 * We dump:
3862 * static size
3863 * - the size requested by user or the best one we can fit
3864 * in to the sample max size
3865 * data
3866 * - user stack dump data
3867 * dynamic size
3868 * - the actual dumped size
3869 */
3870
3871 /* Static size. */
3872 perf_output_put(handle, dump_size);
3873
3874 /* Data. */
3875 sp = perf_user_stack_pointer(regs);
3876 rem = __output_copy_user(handle, (void *) sp, dump_size);
3877 dyn_size = dump_size - rem;
3878
3879 perf_output_skip(handle, rem);
3880
3881 /* Dynamic size. */
3882 perf_output_put(handle, dyn_size);
3883 }
3884}
3885
3759static void __perf_event_header__init_id(struct perf_event_header *header, 3886static void __perf_event_header__init_id(struct perf_event_header *header,
3760 struct perf_sample_data *data, 3887 struct perf_sample_data *data,
3761 struct perf_event *event) 3888 struct perf_event *event)
@@ -4016,6 +4143,28 @@ void perf_output_sample(struct perf_output_handle *handle,
4016 perf_output_put(handle, nr); 4143 perf_output_put(handle, nr);
4017 } 4144 }
4018 } 4145 }
4146
4147 if (sample_type & PERF_SAMPLE_REGS_USER) {
4148 u64 abi = data->regs_user.abi;
4149
4150 /*
4151 * If there are no regs to dump, notice it through
4152 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4153 */
4154 perf_output_put(handle, abi);
4155
4156 if (abi) {
4157 u64 mask = event->attr.sample_regs_user;
4158 perf_output_sample_regs(handle,
4159 data->regs_user.regs,
4160 mask);
4161 }
4162 }
4163
4164 if (sample_type & PERF_SAMPLE_STACK_USER)
4165 perf_output_sample_ustack(handle,
4166 data->stack_user_size,
4167 data->regs_user.regs);
4019} 4168}
4020 4169
4021void perf_prepare_sample(struct perf_event_header *header, 4170void perf_prepare_sample(struct perf_event_header *header,
@@ -4039,7 +4188,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4039 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4188 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4040 int size = 1; 4189 int size = 1;
4041 4190
4042 data->callchain = perf_callchain(regs); 4191 data->callchain = perf_callchain(event, regs);
4043 4192
4044 if (data->callchain) 4193 if (data->callchain)
4045 size += data->callchain->nr; 4194 size += data->callchain->nr;
@@ -4067,6 +4216,49 @@ void perf_prepare_sample(struct perf_event_header *header,
4067 } 4216 }
4068 header->size += size; 4217 header->size += size;
4069 } 4218 }
4219
4220 if (sample_type & PERF_SAMPLE_REGS_USER) {
4221 /* regs dump ABI info */
4222 int size = sizeof(u64);
4223
4224 perf_sample_regs_user(&data->regs_user, regs);
4225
4226 if (data->regs_user.regs) {
4227 u64 mask = event->attr.sample_regs_user;
4228 size += hweight64(mask) * sizeof(u64);
4229 }
4230
4231 header->size += size;
4232 }
4233
4234 if (sample_type & PERF_SAMPLE_STACK_USER) {
4235 /*
4236 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
4237 * processed as the last one or have additional check added
4238 * in case new sample type is added, because we could eat
4239 * up the rest of the sample size.
4240 */
4241 struct perf_regs_user *uregs = &data->regs_user;
4242 u16 stack_size = event->attr.sample_stack_user;
4243 u16 size = sizeof(u64);
4244
4245 if (!uregs->abi)
4246 perf_sample_regs_user(uregs, regs);
4247
4248 stack_size = perf_sample_ustack_size(stack_size, header->size,
4249 uregs->regs);
4250
4251 /*
4252 * If there is something to dump, add space for the dump
4253 * itself and for the field that tells the dynamic size,
4254 * which is how many have been actually dumped.
4255 */
4256 if (stack_size)
4257 size += sizeof(u64) + stack_size;
4258
4259 data->stack_user_size = stack_size;
4260 header->size += size;
4261 }
4070} 4262}
4071 4263
4072static void perf_event_output(struct perf_event *event, 4264static void perf_event_output(struct perf_event *event,
@@ -5209,7 +5401,8 @@ static int perf_tp_event_match(struct perf_event *event,
5209} 5401}
5210 5402
5211void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 5403void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5212 struct pt_regs *regs, struct hlist_head *head, int rctx) 5404 struct pt_regs *regs, struct hlist_head *head, int rctx,
5405 struct task_struct *task)
5213{ 5406{
5214 struct perf_sample_data data; 5407 struct perf_sample_data data;
5215 struct perf_event *event; 5408 struct perf_event *event;
@@ -5228,6 +5421,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5228 perf_swevent_event(event, count, &data, regs); 5421 perf_swevent_event(event, count, &data, regs);
5229 } 5422 }
5230 5423
5424 /*
5425 * If we got specified a target task, also iterate its context and
5426 * deliver this event there too.
5427 */
5428 if (task && task != current) {
5429 struct perf_event_context *ctx;
5430 struct trace_entry *entry = record;
5431
5432 rcu_read_lock();
5433 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5434 if (!ctx)
5435 goto unlock;
5436
5437 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5438 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5439 continue;
5440 if (event->attr.config != entry->type)
5441 continue;
5442 if (perf_tp_event_match(event, &data, regs))
5443 perf_swevent_event(event, count, &data, regs);
5444 }
5445unlock:
5446 rcu_read_unlock();
5447 }
5448
5231 perf_swevent_put_recursion_context(rctx); 5449 perf_swevent_put_recursion_context(rctx);
5232} 5450}
5233EXPORT_SYMBOL_GPL(perf_tp_event); 5451EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -6116,6 +6334,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6116 attr->branch_sample_type = mask; 6334 attr->branch_sample_type = mask;
6117 } 6335 }
6118 } 6336 }
6337
6338 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
6339 ret = perf_reg_validate(attr->sample_regs_user);
6340 if (ret)
6341 return ret;
6342 }
6343
6344 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
6345 if (!arch_perf_have_user_stack_dump())
6346 return -ENOSYS;
6347
6348 /*
6349 * We have __u32 type for the size, but so far
6350 * we can only use __u16 as maximum due to the
6351 * __u16 sample size limit.
6352 */
6353 if (attr->sample_stack_user >= USHRT_MAX)
6354 ret = -EINVAL;
6355 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
6356 ret = -EINVAL;
6357 }
6358
6119out: 6359out:
6120 return ret; 6360 return ret;
6121 6361
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..d56a64c99a8b 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -2,6 +2,7 @@
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h> 4#include <linux/hardirq.h>
5#include <linux/uaccess.h>
5 6
6/* Buffer handling */ 7/* Buffer handling */
7 8
@@ -76,32 +77,56 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 77 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
77} 78}
78 79
79static inline void 80#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
80__output_copy(struct perf_output_handle *handle, 81static inline unsigned int \
81 const void *buf, unsigned int len) 82func_name(struct perf_output_handle *handle, \
83 const void *buf, unsigned int len) \
84{ \
85 unsigned long size, written; \
86 \
87 do { \
88 size = min_t(unsigned long, handle->size, len); \
89 \
90 written = memcpy_func(handle->addr, buf, size); \
91 \
92 len -= written; \
93 handle->addr += written; \
94 buf += written; \
95 handle->size -= written; \
96 if (!handle->size) { \
97 struct ring_buffer *rb = handle->rb; \
98 \
99 handle->page++; \
100 handle->page &= rb->nr_pages - 1; \
101 handle->addr = rb->data_pages[handle->page]; \
102 handle->size = PAGE_SIZE << page_order(rb); \
103 } \
104 } while (len && written == size); \
105 \
106 return len; \
107}
108
109static inline int memcpy_common(void *dst, const void *src, size_t n)
82{ 110{
83 do { 111 memcpy(dst, src, n);
84 unsigned long size = min_t(unsigned long, handle->size, len); 112 return n;
85
86 memcpy(handle->addr, buf, size);
87
88 len -= size;
89 handle->addr += size;
90 buf += size;
91 handle->size -= size;
92 if (!handle->size) {
93 struct ring_buffer *rb = handle->rb;
94
95 handle->page++;
96 handle->page &= rb->nr_pages - 1;
97 handle->addr = rb->data_pages[handle->page];
98 handle->size = PAGE_SIZE << page_order(rb);
99 }
100 } while (len);
101} 113}
102 114
115DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
116
117#define MEMCPY_SKIP(dst, src, n) (n)
118
119DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
120
121#ifndef arch_perf_out_copy_user
122#define arch_perf_out_copy_user __copy_from_user_inatomic
123#endif
124
125DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
126
103/* Callchain handling */ 127/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); 128extern struct perf_callchain_entry *
129perf_callchain(struct perf_event *event, struct pt_regs *regs);
105extern int get_callchain_buffers(void); 130extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void); 131extern void put_callchain_buffers(void);
107 132
@@ -133,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
133 recursion[rctx]--; 158 recursion[rctx]--;
134} 159}
135 160
161#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
162static inline bool arch_perf_have_user_stack_dump(void)
163{
164 return true;
165}
166
167#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
168#else
169static inline bool arch_perf_have_user_stack_dump(void)
170{
171 return false;
172}
173
174#define perf_user_stack_pointer(regs) 0
175#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
176
136#endif /* _KERNEL_EVENTS_INTERNAL_H */ 177#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6ddaba43fb7a..23cb34ff3973 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -182,10 +182,16 @@ out:
182 return -ENOSPC; 182 return -ENOSPC;
183} 183}
184 184
185void perf_output_copy(struct perf_output_handle *handle, 185unsigned int perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len) 186 const void *buf, unsigned int len)
187{ 187{
188 __output_copy(handle, buf, len); 188 return __output_copy(handle, buf, len);
189}
190
191unsigned int perf_output_skip(struct perf_output_handle *handle,
192 unsigned int len)
193{
194 return __output_skip(handle, NULL, len);
189} 195}
190 196
191void perf_output_end(struct perf_output_handle *handle) 197void perf_output_end(struct perf_output_handle *handle)
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f59cc334516..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
483 rcu_read_unlock(); 483 rcu_read_unlock();
484 for (;;) { 484 for (;;) {
485 unsigned long set; 485 unsigned long set;
486 i = j * __NFDBITS; 486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds) 487 if (i >= fdt->max_fds)
488 break; 488 break;
489 set = fdt->open_fds[j++]; 489 set = fdt->open_fds[j++];
@@ -953,14 +953,11 @@ void do_exit(long code)
953 exit_signals(tsk); /* sets PF_EXITING */ 953 exit_signals(tsk); /* sets PF_EXITING */
954 /* 954 /*
955 * tsk->flags are checked in the futex code to protect against 955 * tsk->flags are checked in the futex code to protect against
956 * an exiting task cleaning up the robust pi futexes, and in 956 * an exiting task cleaning up the robust pi futexes.
957 * task_work_add() to avoid the race with exit_task_work().
958 */ 957 */
959 smp_mb(); 958 smp_mb();
960 raw_spin_unlock_wait(&tsk->pi_lock); 959 raw_spin_unlock_wait(&tsk->pi_lock);
961 960
962 exit_task_work(tsk);
963
964 if (unlikely(in_atomic())) 961 if (unlikely(in_atomic()))
965 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 962 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
966 current->comm, task_pid_nr(current), 963 current->comm, task_pid_nr(current),
@@ -995,6 +992,7 @@ void do_exit(long code)
995 exit_shm(tsk); 992 exit_shm(tsk);
996 exit_files(tsk); 993 exit_files(tsk);
997 exit_fs(tsk); 994 exit_fs(tsk);
995 exit_task_work(tsk);
998 check_stack_usage(); 996 check_stack_usage();
999 exit_thread(); 997 exit_thread();
1000 998
diff --git a/kernel/fork.c b/kernel/fork.c
index f00e319d8376..3bd2280d79f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
114 return total; 114 return total;
115} 115}
116 116
117void __weak arch_release_task_struct(struct task_struct *tsk)
118{
119}
120
117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
118static struct kmem_cache *task_struct_cachep; 122static struct kmem_cache *task_struct_cachep;
119 123
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); 126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123} 127}
124 128
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk) 129static inline void free_task_struct(struct task_struct *tsk)
128{ 130{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk); 131 kmem_cache_free(task_struct_cachep, tsk);
131} 132}
132#endif 133#endif
133 134
135void __weak arch_release_thread_info(struct thread_info *ti)
136{
137}
138
134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136 140
137/* 141/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
150 154
151static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
152{ 156{
153 arch_release_thread_info(ti);
154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
155} 158}
156# else 159# else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164 167
165static void free_thread_info(struct thread_info *ti) 168static void free_thread_info(struct thread_info *ti)
166{ 169{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti); 170 kmem_cache_free(thread_info_cache, ti);
169} 171}
170 172
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
205void free_task(struct task_struct *tsk) 207void free_task(struct task_struct *tsk)
206{ 208{
207 account_kernel_stack(tsk->stack, -1); 209 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
208 free_thread_info(tsk->stack); 211 free_thread_info(tsk->stack);
209 rt_mutex_debug_task_free(tsk); 212 rt_mutex_debug_task_free(tsk);
210 ftrace_graph_exit_task(tsk); 213 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk); 214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
212 free_task_struct(tsk); 216 free_task_struct(tsk);
213} 217}
214EXPORT_SYMBOL(free_task); 218EXPORT_SYMBOL(free_task);
@@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
298 return NULL; 302 return NULL;
299 303
300 ti = alloc_thread_info_node(tsk, node); 304 ti = alloc_thread_info_node(tsk, node);
301 if (!ti) { 305 if (!ti)
302 free_task_struct(tsk); 306 goto free_tsk;
303 return NULL;
304 }
305 307
306 err = arch_dup_task_struct(tsk, orig); 308 err = arch_dup_task_struct(tsk, orig);
309 if (err)
310 goto free_ti;
307 311
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
312 tsk->stack = ti; 312 tsk->stack = ti;
313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317 313
314 setup_thread_stack(tsk, orig);
318 clear_user_return_notifier(tsk); 315 clear_user_return_notifier(tsk);
319 clear_tsk_need_resched(tsk); 316 clear_tsk_need_resched(tsk);
320 stackend = end_of_stack(tsk); 317 stackend = end_of_stack(tsk);
@@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
338 335
339 return tsk; 336 return tsk;
340 337
341out: 338free_ti:
342 free_thread_info(ti); 339 free_thread_info(ti);
340free_tsk:
343 free_task_struct(tsk); 341 free_task_struct(tsk);
344 return NULL; 342 return NULL;
345} 343}
@@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 struct file *file; 381 struct file *file;
384 382
385 if (mpnt->vm_flags & VM_DONTCOPY) { 383 if (mpnt->vm_flags & VM_DONTCOPY) {
386 long pages = vma_pages(mpnt);
387 mm->total_vm -= pages;
388 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 384 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
389 -pages); 385 -vma_pages(mpnt));
390 continue; 386 continue;
391 } 387 }
392 charge = 0; 388 charge = 0;
393 if (mpnt->vm_flags & VM_ACCOUNT) { 389 if (mpnt->vm_flags & VM_ACCOUNT) {
394 unsigned long len; 390 unsigned long len = vma_pages(mpnt);
395 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 391
396 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 392 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
397 goto fail_nomem; 393 goto fail_nomem;
398 charge = len; 394 charge = len;
@@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#ifdef CONFIG_DEBUG_MUTEXES 1306#ifdef CONFIG_DEBUG_MUTEXES
1311 p->blocked_on = NULL; /* not blocked yet */ 1307 p->blocked_on = NULL; /* not blocked yet */
1312#endif 1308#endif
1313#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1309#ifdef CONFIG_MEMCG
1314 p->memcg_batch.do_batch = 0; 1310 p->memcg_batch.do_batch = 0;
1315 p->memcg_batch.memcg = NULL; 1311 p->memcg_batch.memcg = NULL;
1316#endif 1312#endif
@@ -1420,7 +1416,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1420 */ 1416 */
1421 p->group_leader = p; 1417 p->group_leader = p;
1422 INIT_LIST_HEAD(&p->thread_group); 1418 INIT_LIST_HEAD(&p->thread_group);
1423 INIT_HLIST_HEAD(&p->task_works); 1419 p->task_works = NULL;
1424 1420
1425 /* Now that the task is set up, run cgroup callbacks if 1421 /* Now that the task is set up, run cgroup callbacks if
1426 * necessary. We need to run them before the task is visible 1422 * necessary. We need to run them before the task is visible
diff --git a/kernel/futex.c b/kernel/futex.c
index e2b0fb9a0b3b..3717e7b306e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2231 * @uaddr2: the pi futex we will take prior to returning to user-space 2231 * @uaddr2: the pi futex we will take prior to returning to user-space
2232 * 2232 *
2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2234 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and 2234 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
2235 * complete the acquisition of the rt_mutex prior to returning to userspace. 2235 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2236 * This ensures the rt_mutex maintains an owner when it has waiters; without 2236 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
2237 * one, the pi logic wouldn't know which task to boost/deboost, if there was a 2237 * without one, the pi logic would not know which task to boost/deboost, if
2238 * need to. 2238 * there was a need to.
2239 * 2239 *
2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2241 * via the following: 2241 * via the following:
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2272 struct futex_q q = futex_q_init; 2272 struct futex_q q = futex_q_init;
2273 int res, ret; 2273 int res, ret;
2274 2274
2275 if (uaddr == uaddr2)
2276 return -EINVAL;
2277
2275 if (!bitset) 2278 if (!bitset)
2276 return -EINVAL; 2279 return -EINVAL;
2277 2280
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2343 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2346 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2344 * the pi_state. 2347 * the pi_state.
2345 */ 2348 */
2346 WARN_ON(!&q.pi_state); 2349 WARN_ON(!q.pi_state);
2347 pi_mutex = &q.pi_state->pi_mutex; 2350 pi_mutex = &q.pi_state->pi_mutex;
2348 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2351 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2349 debug_rt_mutex_free_waiter(&rt_waiter); 2352 debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2370 * fault, unlock the rt_mutex and return the fault to userspace. 2373 * fault, unlock the rt_mutex and return the fault to userspace.
2371 */ 2374 */
2372 if (ret == -EFAULT) { 2375 if (ret == -EFAULT) {
2373 if (rt_mutex_owner(pi_mutex) == current) 2376 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2374 rt_mutex_unlock(pi_mutex); 2377 rt_mutex_unlock(pi_mutex);
2375 } else if (ret == -EINTR) { 2378 } else if (ret == -EINTR) {
2376 /* 2379 /*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bdb180325551..131ca176b497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 134{
135 irqreturn_t retval = IRQ_NONE; 135 irqreturn_t retval = IRQ_NONE;
136 unsigned int random = 0, irq = desc->irq_data.irq; 136 unsigned int flags = 0, irq = desc->irq_data.irq;
137 137
138 do { 138 do {
139 irqreturn_t res; 139 irqreturn_t res;
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 161
162 /* Fall through to add to randomness */ 162 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 163 case IRQ_HANDLED:
164 random |= action->flags; 164 flags |= action->flags;
165 break; 165 break;
166 166
167 default: 167 default:
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 172 action = action->next;
173 } while (action); 173 } while (action);
174 174
175 if (random & IRQF_SAMPLE_RANDOM) 175 add_interrupt_randomness(irq, flags);
176 add_interrupt_randomness(irq);
177 176
178 if (!noirqdebug) 177 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 178 note_interrupt(irq, desc, retval);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 41c1564103f1..49a77727db42 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
10#include <linux/mutex.h> 10#include <linux/mutex.h>
11#include <linux/of.h> 11#include <linux/of.h>
12#include <linux/of_address.h> 12#include <linux/of_address.h>
13#include <linux/topology.h>
13#include <linux/seq_file.h> 14#include <linux/seq_file.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
45{ 46{
46 struct irq_domain *domain; 47 struct irq_domain *domain;
47 48
48 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL,
50 of_node_to_nid(of_node));
49 if (WARN_ON(!domain)) 51 if (WARN_ON(!domain))
50 return NULL; 52 return NULL;
51 53
@@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
138} 140}
139 141
140/** 142/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain
147 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer
149 *
150 * Allocates a legacy irq_domain if irq_base is positive or a linear
151 * domain otherwise.
152 *
153 * This is intended to implement the expected behaviour for most
154 * interrupt controllers which is that a linear mapping should
155 * normally be used unless the system requires a legacy mapping in
156 * order to support supplying interrupt numbers during non-DT
157 * registration of devices.
158 */
159struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
160 unsigned int size,
161 unsigned int first_irq,
162 const struct irq_domain_ops *ops,
163 void *host_data)
164{
165 if (first_irq > 0)
166 return irq_domain_add_legacy(of_node, size, first_irq, 0,
167 ops, host_data);
168 else
169 return irq_domain_add_linear(of_node, size, ops, host_data);
170}
171
172/**
141 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 173 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
142 * @of_node: pointer to interrupt controller's device tree node. 174 * @of_node: pointer to interrupt controller's device tree node.
143 * @size: total number of irqs in legacy mapping 175 * @size: total number of irqs in legacy mapping
@@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
203 * one can then use irq_create_mapping() to 235 * one can then use irq_create_mapping() to
204 * explicitly change them 236 * explicitly change them
205 */ 237 */
206 ops->map(domain, irq, hwirq); 238 if (ops->map)
239 ops->map(domain, irq, hwirq);
207 240
208 /* Clear norequest flags */ 241 /* Clear norequest flags */
209 irq_clear_status_flags(irq, IRQ_NOREQUEST); 242 irq_clear_status_flags(irq, IRQ_NOREQUEST);
@@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
215EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 248EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
216 249
217/** 250/**
218 * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. 251 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
219 * @of_node: pointer to interrupt controller's device tree node. 252 * @of_node: pointer to interrupt controller's device tree node.
220 * @size: Number of interrupts in the domain. 253 * @size: Number of interrupts in the domain.
221 * @ops: map/unmap domain callbacks 254 * @ops: map/unmap domain callbacks
@@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
229 struct irq_domain *domain; 262 struct irq_domain *domain;
230 unsigned int *revmap; 263 unsigned int *revmap;
231 264
232 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); 265 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
266 of_node_to_nid(of_node));
233 if (WARN_ON(!revmap)) 267 if (WARN_ON(!revmap))
234 return NULL; 268 return NULL;
235 269
@@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain)
330} 364}
331EXPORT_SYMBOL_GPL(irq_set_default_host); 365EXPORT_SYMBOL_GPL(irq_set_default_host);
332 366
333static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, 367static void irq_domain_disassociate_many(struct irq_domain *domain,
334 irq_hw_number_t hwirq) 368 unsigned int irq_base, int count)
335{ 369{
336 struct irq_data *irq_data = irq_get_irq_data(virq); 370 /*
371 * disassociate in reverse order;
372 * not strictly necessary, but nice for unwinding
373 */
374 while (count--) {
375 int irq = irq_base + count;
376 struct irq_data *irq_data = irq_get_irq_data(irq);
377 irq_hw_number_t hwirq = irq_data->hwirq;
378
379 if (WARN_ON(!irq_data || irq_data->domain != domain))
380 continue;
381
382 irq_set_status_flags(irq, IRQ_NOREQUEST);
383
384 /* remove chip and handler */
385 irq_set_chip_and_handler(irq, NULL, NULL);
386
387 /* Make sure it's completed */
388 synchronize_irq(irq);
389
390 /* Tell the PIC about it */
391 if (domain->ops->unmap)
392 domain->ops->unmap(domain, irq);
393 smp_mb();
337 394
338 irq_data->hwirq = hwirq;
339 irq_data->domain = domain;
340 if (domain->ops->map(domain, virq, hwirq)) {
341 pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq);
342 irq_data->domain = NULL; 395 irq_data->domain = NULL;
343 irq_data->hwirq = 0; 396 irq_data->hwirq = 0;
344 return -1; 397
398 /* Clear reverse map */
399 switch(domain->revmap_type) {
400 case IRQ_DOMAIN_MAP_LINEAR:
401 if (hwirq < domain->revmap_data.linear.size)
402 domain->revmap_data.linear.revmap[hwirq] = 0;
403 break;
404 case IRQ_DOMAIN_MAP_TREE:
405 mutex_lock(&revmap_trees_mutex);
406 radix_tree_delete(&domain->revmap_data.tree, hwirq);
407 mutex_unlock(&revmap_trees_mutex);
408 break;
409 }
345 } 410 }
411}
412
413int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
414 irq_hw_number_t hwirq_base, int count)
415{
416 unsigned int virq = irq_base;
417 irq_hw_number_t hwirq = hwirq_base;
418 int i, ret;
419
420 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
421 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
422
423 for (i = 0; i < count; i++) {
424 struct irq_data *irq_data = irq_get_irq_data(virq + i);
425
426 if (WARN(!irq_data, "error: irq_desc not allocated; "
427 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
428 return -EINVAL;
429 if (WARN(irq_data->domain, "error: irq_desc already associated; "
430 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
431 return -EINVAL;
432 };
433
434 for (i = 0; i < count; i++, virq++, hwirq++) {
435 struct irq_data *irq_data = irq_get_irq_data(virq);
436
437 irq_data->hwirq = hwirq;
438 irq_data->domain = domain;
439 if (domain->ops->map) {
440 ret = domain->ops->map(domain, virq, hwirq);
441 if (ret != 0) {
442 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
443 virq, hwirq, ret);
444 WARN_ON(1);
445 irq_data->domain = NULL;
446 irq_data->hwirq = 0;
447 goto err_unmap;
448 }
449 }
346 450
347 irq_clear_status_flags(virq, IRQ_NOREQUEST); 451 switch (domain->revmap_type) {
452 case IRQ_DOMAIN_MAP_LINEAR:
453 if (hwirq < domain->revmap_data.linear.size)
454 domain->revmap_data.linear.revmap[hwirq] = virq;
455 break;
456 case IRQ_DOMAIN_MAP_TREE:
457 mutex_lock(&revmap_trees_mutex);
458 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
459 mutex_unlock(&revmap_trees_mutex);
460 break;
461 }
462
463 irq_clear_status_flags(virq, IRQ_NOREQUEST);
464 }
348 465
349 return 0; 466 return 0;
467
468 err_unmap:
469 irq_domain_disassociate_many(domain, irq_base, i);
470 return -EINVAL;
350} 471}
472EXPORT_SYMBOL_GPL(irq_domain_associate_many);
351 473
352/** 474/**
353 * irq_create_direct_mapping() - Allocate an irq for direct mapping 475 * irq_create_direct_mapping() - Allocate an irq for direct mapping
@@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
364 if (domain == NULL) 486 if (domain == NULL)
365 domain = irq_default_domain; 487 domain = irq_default_domain;
366 488
367 BUG_ON(domain == NULL); 489 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
368 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); 490 return 0;
369 491
370 virq = irq_alloc_desc_from(1, 0); 492 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
371 if (!virq) { 493 if (!virq) {
372 pr_debug("create_direct virq allocation failed\n"); 494 pr_debug("create_direct virq allocation failed\n");
373 return 0; 495 return 0;
@@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
380 } 502 }
381 pr_debug("create_direct obtained virq %d\n", virq); 503 pr_debug("create_direct obtained virq %d\n", virq);
382 504
383 if (irq_setup_virq(domain, virq, virq)) { 505 if (irq_domain_associate(domain, virq, virq)) {
384 irq_free_desc(virq); 506 irq_free_desc(virq);
385 return 0; 507 return 0;
386 } 508 }
@@ -433,27 +555,64 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
433 hint = hwirq % nr_irqs; 555 hint = hwirq % nr_irqs;
434 if (hint == 0) 556 if (hint == 0)
435 hint++; 557 hint++;
436 virq = irq_alloc_desc_from(hint, 0); 558 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
437 if (virq <= 0) 559 if (virq <= 0)
438 virq = irq_alloc_desc_from(1, 0); 560 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
439 if (virq <= 0) { 561 if (virq <= 0) {
440 pr_debug("-> virq allocation failed\n"); 562 pr_debug("-> virq allocation failed\n");
441 return 0; 563 return 0;
442 } 564 }
443 565
444 if (irq_setup_virq(domain, virq, hwirq)) { 566 if (irq_domain_associate(domain, virq, hwirq)) {
445 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) 567 irq_free_desc(virq);
446 irq_free_desc(virq);
447 return 0; 568 return 0;
448 } 569 }
449 570
450 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", 571 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
451 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); 572 hwirq, of_node_full_name(domain->of_node), virq);
452 573
453 return virq; 574 return virq;
454} 575}
455EXPORT_SYMBOL_GPL(irq_create_mapping); 576EXPORT_SYMBOL_GPL(irq_create_mapping);
456 577
578/**
579 * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
580 * @domain: domain owning the interrupt range
581 * @irq_base: beginning of linux IRQ range
582 * @hwirq_base: beginning of hardware IRQ range
583 * @count: Number of interrupts to map
584 *
585 * This routine is used for allocating and mapping a range of hardware
586 * irqs to linux irqs where the linux irq numbers are at pre-defined
587 * locations. For use by controllers that already have static mappings
588 * to insert in to the domain.
589 *
590 * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
591 * domain insertion.
592 *
593 * 0 is returned upon success, while any failure to establish a static
594 * mapping is treated as an error.
595 */
596int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
597 irq_hw_number_t hwirq_base, int count)
598{
599 int ret;
600
601 ret = irq_alloc_descs(irq_base, irq_base, count,
602 of_node_to_nid(domain->of_node));
603 if (unlikely(ret < 0))
604 return ret;
605
606 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count);
607 if (unlikely(ret < 0)) {
608 irq_free_descs(irq_base, count);
609 return ret;
610 }
611
612 return 0;
613}
614EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
615
457unsigned int irq_create_of_mapping(struct device_node *controller, 616unsigned int irq_create_of_mapping(struct device_node *controller,
458 const u32 *intspec, unsigned int intsize) 617 const u32 *intspec, unsigned int intsize)
459{ 618{
@@ -477,7 +636,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
477 return intspec[0]; 636 return intspec[0];
478#endif 637#endif
479 pr_warning("no irq domain found for %s !\n", 638 pr_warning("no irq domain found for %s !\n",
480 controller->full_name); 639 of_node_full_name(controller));
481 return 0; 640 return 0;
482 } 641 }
483 642
@@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq)
511{ 670{
512 struct irq_data *irq_data = irq_get_irq_data(virq); 671 struct irq_data *irq_data = irq_get_irq_data(virq);
513 struct irq_domain *domain; 672 struct irq_domain *domain;
514 irq_hw_number_t hwirq;
515 673
516 if (!virq || !irq_data) 674 if (!virq || !irq_data)
517 return; 675 return;
@@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq)
524 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 682 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
525 return; 683 return;
526 684
527 irq_set_status_flags(virq, IRQ_NOREQUEST); 685 irq_domain_disassociate_many(domain, virq, 1);
528
529 /* remove chip and handler */
530 irq_set_chip_and_handler(virq, NULL, NULL);
531
532 /* Make sure it's completed */
533 synchronize_irq(virq);
534
535 /* Tell the PIC about it */
536 if (domain->ops->unmap)
537 domain->ops->unmap(domain, virq);
538 smp_mb();
539
540 /* Clear reverse map */
541 hwirq = irq_data->hwirq;
542 switch(domain->revmap_type) {
543 case IRQ_DOMAIN_MAP_LINEAR:
544 if (hwirq < domain->revmap_data.linear.size)
545 domain->revmap_data.linear.revmap[hwirq] = 0;
546 break;
547 case IRQ_DOMAIN_MAP_TREE:
548 mutex_lock(&revmap_trees_mutex);
549 radix_tree_delete(&domain->revmap_data.tree, hwirq);
550 mutex_unlock(&revmap_trees_mutex);
551 break;
552 }
553
554 irq_free_desc(virq); 686 irq_free_desc(virq);
555} 687}
556EXPORT_SYMBOL_GPL(irq_dispose_mapping); 688EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping);
559 * irq_find_mapping() - Find a linux irq from an hw irq number. 691 * irq_find_mapping() - Find a linux irq from an hw irq number.
560 * @domain: domain owning this hardware interrupt 692 * @domain: domain owning this hardware interrupt
561 * @hwirq: hardware irq number in that domain space 693 * @hwirq: hardware irq number in that domain space
562 *
563 * This is a slow path, for use by generic code. It's expected that an
564 * irq controller implementation directly calls the appropriate low level
565 * mapping function.
566 */ 694 */
567unsigned int irq_find_mapping(struct irq_domain *domain, 695unsigned int irq_find_mapping(struct irq_domain *domain,
568 irq_hw_number_t hwirq) 696 irq_hw_number_t hwirq)
569{ 697{
570 unsigned int i; 698 struct irq_data *data;
571 unsigned int hint = hwirq % nr_irqs;
572 699
573 /* Look for default domain if nececssary */ 700 /* Look for default domain if nececssary */
574 if (domain == NULL) 701 if (domain == NULL)
@@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
576 if (domain == NULL) 703 if (domain == NULL)
577 return 0; 704 return 0;
578 705
579 /* legacy -> bail early */ 706 switch (domain->revmap_type) {
580 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 707 case IRQ_DOMAIN_MAP_LEGACY:
581 return irq_domain_legacy_revmap(domain, hwirq); 708 return irq_domain_legacy_revmap(domain, hwirq);
582 709 case IRQ_DOMAIN_MAP_LINEAR:
583 /* Slow path does a linear search of the map */ 710 return irq_linear_revmap(domain, hwirq);
584 if (hint == 0) 711 case IRQ_DOMAIN_MAP_TREE:
585 hint = 1; 712 rcu_read_lock();
586 i = hint; 713 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
587 do { 714 rcu_read_unlock();
588 struct irq_data *data = irq_get_irq_data(i); 715 if (data)
716 return data->irq;
717 break;
718 case IRQ_DOMAIN_MAP_NOMAP:
719 data = irq_get_irq_data(hwirq);
589 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 720 if (data && (data->domain == domain) && (data->hwirq == hwirq))
590 return i; 721 return hwirq;
591 i++; 722 break;
592 if (i >= nr_irqs) 723 }
593 i = 1; 724
594 } while(i != hint);
595 return 0; 725 return 0;
596} 726}
597EXPORT_SYMBOL_GPL(irq_find_mapping); 727EXPORT_SYMBOL_GPL(irq_find_mapping);
598 728
599/** 729/**
600 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
601 * @domain: domain owning this hardware interrupt
602 * @hwirq: hardware irq number in that domain space
603 *
604 * This is a fast path, for use by irq controller code that uses radix tree
605 * revmaps
606 */
607unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
608 irq_hw_number_t hwirq)
609{
610 struct irq_data *irq_data;
611
612 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
613 return irq_find_mapping(domain, hwirq);
614
615 /*
616 * Freeing an irq can delete nodes along the path to
617 * do the lookup via call_rcu.
618 */
619 rcu_read_lock();
620 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
621 rcu_read_unlock();
622
623 /*
624 * If found in radix tree, then fine.
625 * Else fallback to linear lookup - this should not happen in practice
626 * as it means that we failed to insert the node in the radix tree.
627 */
628 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
629}
630EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup);
631
632/**
633 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
634 * @domain: domain owning this hardware interrupt
635 * @virq: linux irq number
636 * @hwirq: hardware irq number in that domain space
637 *
638 * This is for use by irq controllers that use a radix tree reverse
639 * mapping for fast lookup.
640 */
641void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
642 irq_hw_number_t hwirq)
643{
644 struct irq_data *irq_data = irq_get_irq_data(virq);
645
646 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
647 return;
648
649 if (virq) {
650 mutex_lock(&revmap_trees_mutex);
651 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
652 mutex_unlock(&revmap_trees_mutex);
653 }
654}
655EXPORT_SYMBOL_GPL(irq_radix_revmap_insert);
656
657/**
658 * irq_linear_revmap() - Find a linux irq from a hw irq number. 730 * irq_linear_revmap() - Find a linux irq from a hw irq number.
659 * @domain: domain owning this hardware interrupt 731 * @domain: domain owning this hardware interrupt
660 * @hwirq: hardware irq number in that domain space 732 * @hwirq: hardware irq number in that domain space
661 * 733 *
662 * This is a fast path, for use by irq controller code that uses linear 734 * This is a fast path that can be called directly by irq controller code to
663 * revmaps. It does fallback to the slow path if the revmap doesn't exist 735 * save a handful of instructions.
664 * yet and will create the revmap entry with appropriate locking
665 */ 736 */
666unsigned int irq_linear_revmap(struct irq_domain *domain, 737unsigned int irq_linear_revmap(struct irq_domain *domain,
667 irq_hw_number_t hwirq) 738 irq_hw_number_t hwirq)
668{ 739{
669 unsigned int *revmap; 740 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
670
671 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
672 return irq_find_mapping(domain, hwirq);
673 741
674 /* Check revmap bounds */ 742 /* Check revmap bounds; complain if exceeded */
675 if (unlikely(hwirq >= domain->revmap_data.linear.size)) 743 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
676 return irq_find_mapping(domain, hwirq); 744 return 0;
677
678 /* Check if revmap was allocated */
679 revmap = domain->revmap_data.linear.revmap;
680 if (unlikely(revmap == NULL))
681 return irq_find_mapping(domain, hwirq);
682
683 /* Fill up revmap with slow path if no mapping found */
684 if (unlikely(!revmap[hwirq]))
685 revmap[hwirq] = irq_find_mapping(domain, hwirq);
686 745
687 return revmap[hwirq]; 746 return domain->revmap_data.linear.revmap[hwirq];
688} 747}
689EXPORT_SYMBOL_GPL(irq_linear_revmap); 748EXPORT_SYMBOL_GPL(irq_linear_revmap);
690 749
@@ -725,8 +784,8 @@ static int virq_debug_show(struct seq_file *m, void *private)
725 data = irq_desc_get_chip_data(desc); 784 data = irq_desc_get_chip_data(desc);
726 seq_printf(m, data ? "0x%p " : " %p ", data); 785 seq_printf(m, data ? "0x%p " : " %p ", data);
727 786
728 if (desc->irq_data.domain && desc->irq_data.domain->of_node) 787 if (desc->irq_data.domain)
729 p = desc->irq_data.domain->of_node->full_name; 788 p = of_node_full_name(desc->irq_data.domain->of_node);
730 else 789 else
731 p = none; 790 p = none;
732 seq_printf(m, "%s\n", p); 791 seq_printf(m, "%s\n", p);
@@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void)
761__initcall(irq_debugfs_init); 820__initcall(irq_debugfs_init);
762#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ 821#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
763 822
764static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
765 irq_hw_number_t hwirq)
766{
767 return 0;
768}
769
770/** 823/**
771 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings 824 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
772 * 825 *
@@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
829EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); 882EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
830 883
831const struct irq_domain_ops irq_domain_simple_ops = { 884const struct irq_domain_ops irq_domain_simple_ops = {
832 .map = irq_domain_simple_map,
833 .xlate = irq_domain_xlate_onetwocell, 885 .xlate = irq_domain_xlate_onetwocell,
834}; 886};
835EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 887EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8c548232ba39..4c69326aa773 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc)
781 wake_up(&desc->wait_for_threads); 781 wake_up(&desc->wait_for_threads);
782} 782}
783 783
784static void irq_thread_dtor(struct task_work *unused) 784static void irq_thread_dtor(struct callback_head *unused)
785{ 785{
786 struct task_struct *tsk = current; 786 struct task_struct *tsk = current;
787 struct irq_desc *desc; 787 struct irq_desc *desc;
@@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused)
813 */ 813 */
814static int irq_thread(void *data) 814static int irq_thread(void *data)
815{ 815{
816 struct task_work on_exit_work; 816 struct callback_head on_exit_work;
817 static const struct sched_param param = { 817 static const struct sched_param param = {
818 .sched_priority = MAX_USER_RT_PRIO/2, 818 .sched_priority = MAX_USER_RT_PRIO/2,
819 }; 819 };
@@ -830,7 +830,7 @@ static int irq_thread(void *data)
830 830
831 sched_setscheduler(current, SCHED_FIFO, &param); 831 sched_setscheduler(current, SCHED_FIFO, &param);
832 832
833 init_task_work(&on_exit_work, irq_thread_dtor, NULL); 833 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 834 task_work_add(current, &on_exit_work, false);
835 835
836 while (!irq_wait_for_interrupt(action)) { 836 while (!irq_wait_for_interrupt(action)) {
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
893 return -ENOSYS; 893 return -ENOSYS;
894 if (!try_module_get(desc->owner)) 894 if (!try_module_get(desc->owner))
895 return -ENODEV; 895 return -ENODEV;
896 /*
897 * Some drivers like serial.c use request_irq() heavily,
898 * so we have to be careful not to interfere with a
899 * running system.
900 */
901 if (new->flags & IRQF_SAMPLE_RANDOM) {
902 /*
903 * This function might sleep, we want to call it first,
904 * outside of the atomic block.
905 * Yes, this might clear the entropy pool if the wrong
906 * driver is attempted to be loaded, without actually
907 * installing a new handler, but is this really a problem,
908 * only the sysadmin is able to do this.
909 */
910 rand_initialize_irq(irq);
911 }
912 896
913 /* 897 /*
914 * Check whether the interrupt nests into another interrupt 898 * Check whether the interrupt nests into another interrupt
@@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
960 } 944 }
961 945
962 /* 946 /*
947 * Drivers are often written to work w/o knowledge about the
948 * underlying irq chip implementation, so a request for a
949 * threaded irq without a primary hard irq context handler
950 * requires the ONESHOT flag to be set. Some irq chips like
951 * MSI based interrupts are per se one shot safe. Check the
952 * chip flags, so we can avoid the unmask dance at the end of
953 * the threaded handler for those.
954 */
955 if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
956 new->flags &= ~IRQF_ONESHOT;
957
958 /*
963 * The following block of code has to be executed atomically 959 * The following block of code has to be executed atomically
964 */ 960 */
965 raw_spin_lock_irqsave(&desc->lock, flags); 961 raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1033 */ 1029 */
1034 new->thread_mask = 1 << ffz(thread_mask); 1030 new->thread_mask = 1 << ffz(thread_mask);
1035 1031
1036 } else if (new->handler == irq_default_primary_handler) { 1032 } else if (new->handler == irq_default_primary_handler &&
1033 !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
1037 /* 1034 /*
1038 * The interrupt was requested with handler = NULL, so 1035 * The interrupt was requested with handler = NULL, so
1039 * we use the default primary handler for it. But it 1036 * we use the default primary handler for it. But it
@@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq);
1354 * Flags: 1351 * Flags:
1355 * 1352 *
1356 * IRQF_SHARED Interrupt is shared 1353 * IRQF_SHARED Interrupt is shared
1357 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1358 * IRQF_TRIGGER_* Specify active edge(s) or level 1354 * IRQF_TRIGGER_* Specify active edge(s) or level
1359 * 1355 *
1360 */ 1356 */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6aeb..0668d58d6413 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
1424 1424
1425void crash_save_vmcoreinfo(void) 1425void crash_save_vmcoreinfo(void)
1426{ 1426{
1427 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1427 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1428 update_vmcoreinfo_note(); 1428 update_vmcoreinfo_note();
1429} 1429}
1430 1430
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d77..6f99aead66c6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
45 45
46static struct workqueue_struct *khelper_wq; 46static struct workqueue_struct *khelper_wq;
47 47
48/*
49 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
50 * locking to protect this global - it is private to the singleton khelper
51 * thread and should only ever be modified by that thread.
52 */
53static const struct task_struct *kmod_thread_locker;
54
48#define CAP_BSET (void *)1 55#define CAP_BSET (void *)1
49#define CAP_PI (void *)2 56#define CAP_PI (void *)2
50 57
@@ -221,6 +228,13 @@ fail:
221 return 0; 228 return 0;
222} 229}
223 230
231static int call_helper(void *data)
232{
233 /* Worker thread started blocking khelper thread. */
234 kmod_thread_locker = current;
235 return ____call_usermodehelper(data);
236}
237
224static void call_usermodehelper_freeinfo(struct subprocess_info *info) 238static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 239{
226 if (info->cleanup) 240 if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
295 if (wait == UMH_WAIT_PROC) 309 if (wait == UMH_WAIT_PROC)
296 pid = kernel_thread(wait_for_helper, sub_info, 310 pid = kernel_thread(wait_for_helper, sub_info,
297 CLONE_FS | CLONE_FILES | SIGCHLD); 311 CLONE_FS | CLONE_FILES | SIGCHLD);
298 else 312 else {
299 pid = kernel_thread(____call_usermodehelper, sub_info, 313 pid = kernel_thread(call_helper, sub_info,
300 CLONE_VFORK | SIGCHLD); 314 CLONE_VFORK | SIGCHLD);
315 /* Worker thread stopped blocking khelper thread. */
316 kmod_thread_locker = NULL;
317 }
301 318
302 switch (wait) { 319 switch (wait) {
303 case UMH_NO_WAIT: 320 case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
548 retval = -EBUSY; 565 retval = -EBUSY;
549 goto out; 566 goto out;
550 } 567 }
568 /*
569 * Worker thread must not wait for khelper thread at below
570 * wait_for_completion() if the thread was created with CLONE_VFORK
571 * flag, for khelper thread is already waiting for the thread at
572 * wait_for_completion() in do_fork().
573 */
574 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
575 retval = -EBUSY;
576 goto out;
577 }
551 578
552 sub_info->complete = &done; 579 sub_info->complete = &done;
553 sub_info->wait = wait; 580 sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
577 return retval; 604 return retval;
578} 605}
579 606
607/*
608 * call_usermodehelper_fns() will not run the caller-provided cleanup function
609 * if a memory allocation failure is experienced. So the caller might need to
610 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
611 * the necessaary cleanup within the caller.
612 */
580int call_usermodehelper_fns( 613int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait, 614 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new), 615 int (*init)(struct subprocess_info *info, struct cred *new),
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
360 struct kthread_work, node); 360 struct kthread_work, node);
361 list_del_init(&work->node); 361 list_del_init(&work->node);
362 } 362 }
363 worker->current_work = work;
363 spin_unlock_irq(&worker->lock); 364 spin_unlock_irq(&worker->lock);
364 365
365 if (work) { 366 if (work) {
366 __set_current_state(TASK_RUNNING); 367 __set_current_state(TASK_RUNNING);
367 work->func(work); 368 work->func(work);
368 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
369 work->done_seq = work->queue_seq;
370 smp_mb(); /* mb worker-b1 paired with flush-b0 */
371 if (atomic_read(&work->flushing))
372 wake_up_all(&work->done);
373 } else if (!freezing(current)) 369 } else if (!freezing(current))
374 schedule(); 370 schedule();
375 371
@@ -378,6 +374,19 @@ repeat:
378} 374}
379EXPORT_SYMBOL_GPL(kthread_worker_fn); 375EXPORT_SYMBOL_GPL(kthread_worker_fn);
380 376
377/* insert @work before @pos in @worker */
378static void insert_kthread_work(struct kthread_worker *worker,
379 struct kthread_work *work,
380 struct list_head *pos)
381{
382 lockdep_assert_held(&worker->lock);
383
384 list_add_tail(&work->node, pos);
385 work->worker = worker;
386 if (likely(worker->task))
387 wake_up_process(worker->task);
388}
389
381/** 390/**
382 * queue_kthread_work - queue a kthread_work 391 * queue_kthread_work - queue a kthread_work
383 * @worker: target kthread_worker 392 * @worker: target kthread_worker
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
395 404
396 spin_lock_irqsave(&worker->lock, flags); 405 spin_lock_irqsave(&worker->lock, flags);
397 if (list_empty(&work->node)) { 406 if (list_empty(&work->node)) {
398 list_add_tail(&work->node, &worker->work_list); 407 insert_kthread_work(worker, work, &worker->work_list);
399 work->queue_seq++;
400 if (likely(worker->task))
401 wake_up_process(worker->task);
402 ret = true; 408 ret = true;
403 } 409 }
404 spin_unlock_irqrestore(&worker->lock, flags); 410 spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
406} 412}
407EXPORT_SYMBOL_GPL(queue_kthread_work); 413EXPORT_SYMBOL_GPL(queue_kthread_work);
408 414
415struct kthread_flush_work {
416 struct kthread_work work;
417 struct completion done;
418};
419
420static void kthread_flush_work_fn(struct kthread_work *work)
421{
422 struct kthread_flush_work *fwork =
423 container_of(work, struct kthread_flush_work, work);
424 complete(&fwork->done);
425}
426
409/** 427/**
410 * flush_kthread_work - flush a kthread_work 428 * flush_kthread_work - flush a kthread_work
411 * @work: work to flush 429 * @work: work to flush
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
414 */ 432 */
415void flush_kthread_work(struct kthread_work *work) 433void flush_kthread_work(struct kthread_work *work)
416{ 434{
417 int seq = work->queue_seq; 435 struct kthread_flush_work fwork = {
418 436 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
419 atomic_inc(&work->flushing); 437 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
438 };
439 struct kthread_worker *worker;
440 bool noop = false;
420 441
421 /* 442retry:
422 * mb flush-b0 paired with worker-b1, to make sure either 443 worker = work->worker;
423 * worker sees the above increment or we see done_seq update. 444 if (!worker)
424 */ 445 return;
425 smp_mb__after_atomic_inc();
426 446
427 /* A - B <= 0 tests whether B is in front of A regardless of overflow */ 447 spin_lock_irq(&worker->lock);
428 wait_event(work->done, seq - work->done_seq <= 0); 448 if (work->worker != worker) {
429 atomic_dec(&work->flushing); 449 spin_unlock_irq(&worker->lock);
450 goto retry;
451 }
430 452
431 /* 453 if (!list_empty(&work->node))
432 * rmb flush-b1 paired with worker-b0, to make sure our caller 454 insert_kthread_work(worker, &fwork.work, work->node.next);
433 * sees every change made by work->func(). 455 else if (worker->current_work == work)
434 */ 456 insert_kthread_work(worker, &fwork.work, worker->work_list.next);
435 smp_mb__after_atomic_dec(); 457 else
436} 458 noop = true;
437EXPORT_SYMBOL_GPL(flush_kthread_work);
438 459
439struct kthread_flush_work { 460 spin_unlock_irq(&worker->lock);
440 struct kthread_work work;
441 struct completion done;
442};
443 461
444static void kthread_flush_work_fn(struct kthread_work *work) 462 if (!noop)
445{ 463 wait_for_completion(&fwork.done);
446 struct kthread_flush_work *fwork =
447 container_of(work, struct kthread_flush_work, work);
448 complete(&fwork->done);
449} 464}
465EXPORT_SYMBOL_GPL(flush_kthread_work);
450 466
451/** 467/**
452 * flush_kthread_worker - flush all current works on a kthread_worker 468 * flush_kthread_worker - flush all current works on a kthread_worker
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6dd..e1b2822fff97 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
75 int state = 0; 75 int state = 0;
76 76
77 /* 77 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
78 * It's possible to come here directly from a panic-assertion and 86 * It's possible to come here directly from a panic-assertion and
79 * not have preempt disabled. Some functions called from here want 87 * not have preempt disabled. Some functions called from here want
80 * preempt to be disabled. No point enabling it later though... 88 * preempt to be disabled. No point enabling it later though...
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8f9b4eb974e0..a70518c9d82f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -175,7 +175,7 @@ config PM_TEST_SUSPEND
175 You probably want to have your system's RTC driver statically 175 You probably want to have your system's RTC driver statically
176 linked, ensuring that it's available when this test runs. 176 linked, ensuring that it's available when this test runs.
177 177
178config CAN_PM_TRACE 178config PM_SLEEP_DEBUG
179 def_bool y 179 def_bool y
180 depends on PM_DEBUG && PM_SLEEP 180 depends on PM_DEBUG && PM_SLEEP
181 181
@@ -196,7 +196,7 @@ config PM_TRACE
196 196
197config PM_TRACE_RTC 197config PM_TRACE_RTC
198 bool "Suspend/resume event tracing" 198 bool "Suspend/resume event tracing"
199 depends on CAN_PM_TRACE 199 depends on PM_SLEEP_DEBUG
200 depends on X86 200 depends on X86
201 select PM_TRACE 201 select PM_TRACE
202 ---help--- 202 ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 238025f5472e..b26f5f1e773e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
8 * 9 *
9 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
10 */ 11 */
@@ -45,6 +46,9 @@ enum {
45 HIBERNATION_PLATFORM, 46 HIBERNATION_PLATFORM,
46 HIBERNATION_SHUTDOWN, 47 HIBERNATION_SHUTDOWN,
47 HIBERNATION_REBOOT, 48 HIBERNATION_REBOOT,
49#ifdef CONFIG_SUSPEND
50 HIBERNATION_SUSPEND,
51#endif
48 /* keep last */ 52 /* keep last */
49 __HIBERNATION_AFTER_LAST 53 __HIBERNATION_AFTER_LAST
50}; 54};
@@ -353,6 +357,7 @@ int hibernation_snapshot(int platform_mode)
353 } 357 }
354 358
355 suspend_console(); 359 suspend_console();
360 ftrace_stop();
356 pm_restrict_gfp_mask(); 361 pm_restrict_gfp_mask();
357 362
358 error = dpm_suspend(PMSG_FREEZE); 363 error = dpm_suspend(PMSG_FREEZE);
@@ -378,6 +383,7 @@ int hibernation_snapshot(int platform_mode)
378 if (error || !in_suspend) 383 if (error || !in_suspend)
379 pm_restore_gfp_mask(); 384 pm_restore_gfp_mask();
380 385
386 ftrace_start();
381 resume_console(); 387 resume_console();
382 dpm_complete(msg); 388 dpm_complete(msg);
383 389
@@ -480,6 +486,7 @@ int hibernation_restore(int platform_mode)
480 486
481 pm_prepare_console(); 487 pm_prepare_console();
482 suspend_console(); 488 suspend_console();
489 ftrace_stop();
483 pm_restrict_gfp_mask(); 490 pm_restrict_gfp_mask();
484 error = dpm_suspend_start(PMSG_QUIESCE); 491 error = dpm_suspend_start(PMSG_QUIESCE);
485 if (!error) { 492 if (!error) {
@@ -487,6 +494,7 @@ int hibernation_restore(int platform_mode)
487 dpm_resume_end(PMSG_RECOVER); 494 dpm_resume_end(PMSG_RECOVER);
488 } 495 }
489 pm_restore_gfp_mask(); 496 pm_restore_gfp_mask();
497 ftrace_start();
490 resume_console(); 498 resume_console();
491 pm_restore_console(); 499 pm_restore_console();
492 return error; 500 return error;
@@ -513,6 +521,7 @@ int hibernation_platform_enter(void)
513 521
514 entering_platform_hibernation = true; 522 entering_platform_hibernation = true;
515 suspend_console(); 523 suspend_console();
524 ftrace_stop();
516 error = dpm_suspend_start(PMSG_HIBERNATE); 525 error = dpm_suspend_start(PMSG_HIBERNATE);
517 if (error) { 526 if (error) {
518 if (hibernation_ops->recover) 527 if (hibernation_ops->recover)
@@ -556,6 +565,7 @@ int hibernation_platform_enter(void)
556 Resume_devices: 565 Resume_devices:
557 entering_platform_hibernation = false; 566 entering_platform_hibernation = false;
558 dpm_resume_end(PMSG_RESTORE); 567 dpm_resume_end(PMSG_RESTORE);
568 ftrace_start();
559 resume_console(); 569 resume_console();
560 570
561 Close: 571 Close:
@@ -573,6 +583,10 @@ int hibernation_platform_enter(void)
573 */ 583 */
574static void power_down(void) 584static void power_down(void)
575{ 585{
586#ifdef CONFIG_SUSPEND
587 int error;
588#endif
589
576 switch (hibernation_mode) { 590 switch (hibernation_mode) {
577 case HIBERNATION_REBOOT: 591 case HIBERNATION_REBOOT:
578 kernel_restart(NULL); 592 kernel_restart(NULL);
@@ -582,6 +596,25 @@ static void power_down(void)
582 case HIBERNATION_SHUTDOWN: 596 case HIBERNATION_SHUTDOWN:
583 kernel_power_off(); 597 kernel_power_off();
584 break; 598 break;
599#ifdef CONFIG_SUSPEND
600 case HIBERNATION_SUSPEND:
601 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
602 if (error) {
603 if (hibernation_ops)
604 hibernation_mode = HIBERNATION_PLATFORM;
605 else
606 hibernation_mode = HIBERNATION_SHUTDOWN;
607 power_down();
608 }
609 /*
610 * Restore swap signature.
611 */
612 error = swsusp_unmark();
613 if (error)
614 printk(KERN_ERR "PM: Swap will be unusable! "
615 "Try swapon -a.\n");
616 return;
617#endif
585 } 618 }
586 kernel_halt(); 619 kernel_halt();
587 /* 620 /*
@@ -819,6 +852,9 @@ static const char * const hibernation_modes[] = {
819 [HIBERNATION_PLATFORM] = "platform", 852 [HIBERNATION_PLATFORM] = "platform",
820 [HIBERNATION_SHUTDOWN] = "shutdown", 853 [HIBERNATION_SHUTDOWN] = "shutdown",
821 [HIBERNATION_REBOOT] = "reboot", 854 [HIBERNATION_REBOOT] = "reboot",
855#ifdef CONFIG_SUSPEND
856 [HIBERNATION_SUSPEND] = "suspend",
857#endif
822}; 858};
823 859
824/* 860/*
@@ -859,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
859 switch (i) { 895 switch (i) {
860 case HIBERNATION_SHUTDOWN: 896 case HIBERNATION_SHUTDOWN:
861 case HIBERNATION_REBOOT: 897 case HIBERNATION_REBOOT:
898#ifdef CONFIG_SUSPEND
899 case HIBERNATION_SUSPEND:
900#endif
862 break; 901 break;
863 case HIBERNATION_PLATFORM: 902 case HIBERNATION_PLATFORM:
864 if (hibernation_ops) 903 if (hibernation_ops)
@@ -899,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
899 switch (mode) { 938 switch (mode) {
900 case HIBERNATION_SHUTDOWN: 939 case HIBERNATION_SHUTDOWN:
901 case HIBERNATION_REBOOT: 940 case HIBERNATION_REBOOT:
941#ifdef CONFIG_SUSPEND
942 case HIBERNATION_SUSPEND:
943#endif
902 hibernation_mode = mode; 944 hibernation_mode = mode;
903 break; 945 break;
904 case HIBERNATION_PLATFORM: 946 case HIBERNATION_PLATFORM:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 428f8a034e96..f458238109cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init);
235 235
236#endif /* CONFIG_PM_SLEEP */ 236#endif /* CONFIG_PM_SLEEP */
237 237
238#ifdef CONFIG_PM_SLEEP_DEBUG
239/*
240 * pm_print_times: print time taken by devices to suspend and resume.
241 *
242 * show() returns whether printing of suspend and resume times is enabled.
243 * store() accepts 0 or 1. 0 disables printing and 1 enables it.
244 */
245bool pm_print_times_enabled;
246
247static ssize_t pm_print_times_show(struct kobject *kobj,
248 struct kobj_attribute *attr, char *buf)
249{
250 return sprintf(buf, "%d\n", pm_print_times_enabled);
251}
252
253static ssize_t pm_print_times_store(struct kobject *kobj,
254 struct kobj_attribute *attr,
255 const char *buf, size_t n)
256{
257 unsigned long val;
258
259 if (kstrtoul(buf, 10, &val))
260 return -EINVAL;
261
262 if (val > 1)
263 return -EINVAL;
264
265 pm_print_times_enabled = !!val;
266 return n;
267}
268
269power_attr(pm_print_times);
270
271static inline void pm_print_times_init(void)
272{
273 pm_print_times_enabled = !!initcall_debug;
274}
275#else /* !CONFIG_PP_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */
278
238struct kobject *power_kobj; 279struct kobject *power_kobj;
239 280
240/** 281/**
@@ -531,6 +572,9 @@ static struct attribute * g[] = {
531#ifdef CONFIG_PM_DEBUG 572#ifdef CONFIG_PM_DEBUG
532 &pm_test_attr.attr, 573 &pm_test_attr.attr,
533#endif 574#endif
575#ifdef CONFIG_PM_SLEEP_DEBUG
576 &pm_print_times_attr.attr,
577#endif
534#endif 578#endif
535 NULL, 579 NULL,
536}; 580};
@@ -566,6 +610,7 @@ static int __init pm_init(void)
566 error = sysfs_create_group(power_kobj, &attr_group); 610 error = sysfs_create_group(power_kobj, &attr_group);
567 if (error) 611 if (error)
568 return error; 612 return error;
613 pm_print_times_init();
569 return pm_autosleep_init(); 614 return pm_autosleep_init();
570} 615}
571 616
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b0bd4beaebfe..7d4b7ffb3c1d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -156,6 +156,9 @@ extern void swsusp_free(void);
156extern int swsusp_read(unsigned int *flags_p); 156extern int swsusp_read(unsigned int *flags_p);
157extern int swsusp_write(unsigned int flags); 157extern int swsusp_write(unsigned int flags);
158extern void swsusp_close(fmode_t); 158extern void swsusp_close(fmode_t);
159#ifdef CONFIG_SUSPEND
160extern int swsusp_unmark(void);
161#endif
159 162
160/* kernel/power/block_io.c */ 163/* kernel/power/block_io.c */
161extern struct block_device *hib_resume_bdev; 164extern struct block_device *hib_resume_bdev;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 396d262b8fd0..1da39ea248fd 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -24,6 +24,7 @@
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/suspend.h> 25#include <linux/suspend.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <linux/ftrace.h>
27#include <trace/events/power.h> 28#include <trace/events/power.h>
28 29
29#include "power.h" 30#include "power.h"
@@ -177,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
177 arch_suspend_enable_irqs(); 178 arch_suspend_enable_irqs();
178 BUG_ON(irqs_disabled()); 179 BUG_ON(irqs_disabled());
179 180
181 /* Kick the lockup detector */
182 lockup_detector_bootcpu_resume();
183
180 Enable_cpus: 184 Enable_cpus:
181 enable_nonboot_cpus(); 185 enable_nonboot_cpus();
182 186
@@ -212,6 +216,7 @@ int suspend_devices_and_enter(suspend_state_t state)
212 goto Close; 216 goto Close;
213 } 217 }
214 suspend_console(); 218 suspend_console();
219 ftrace_stop();
215 suspend_test_start(); 220 suspend_test_start();
216 error = dpm_suspend_start(PMSG_SUSPEND); 221 error = dpm_suspend_start(PMSG_SUSPEND);
217 if (error) { 222 if (error) {
@@ -231,6 +236,7 @@ int suspend_devices_and_enter(suspend_state_t state)
231 suspend_test_start(); 236 suspend_test_start();
232 dpm_resume_end(PMSG_RESUME); 237 dpm_resume_end(PMSG_RESUME);
233 suspend_test_finish("resume devices"); 238 suspend_test_finish("resume devices");
239 ftrace_start();
234 resume_console(); 240 resume_console();
235 Close: 241 Close:
236 if (suspend_ops->end) 242 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11e22c068e8b..3c9d764eb0d8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle,
448 struct timeval start; 448 struct timeval start;
449 struct timeval stop; 449 struct timeval stop;
450 450
451 printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", 451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 452 nr_to_write);
453 m = nr_to_write / 100; 453 m = nr_to_write / 10;
454 if (!m) 454 if (!m)
455 m = 1; 455 m = 1;
456 nr_pages = 0; 456 nr_pages = 0;
@@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle,
464 if (ret) 464 if (ret)
465 break; 465 break;
466 if (!(nr_pages % m)) 466 if (!(nr_pages % m))
467 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 467 printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
468 nr_pages / m * 10);
468 nr_pages++; 469 nr_pages++;
469 } 470 }
470 err2 = hib_wait_on_bio_chain(&bio); 471 err2 = hib_wait_on_bio_chain(&bio);
@@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle,
472 if (!ret) 473 if (!ret)
473 ret = err2; 474 ret = err2;
474 if (!ret) 475 if (!ret)
475 printk(KERN_CONT "\b\b\b\bdone\n"); 476 printk(KERN_INFO "PM: Image saving done.\n");
476 else
477 printk(KERN_CONT "\n");
478 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
479 return ret; 478 return ret;
480} 479}
@@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
668 667
669 printk(KERN_INFO 668 printk(KERN_INFO
670 "PM: Using %u thread(s) for compression.\n" 669 "PM: Using %u thread(s) for compression.\n"
671 "PM: Compressing and saving image data (%u pages) ... ", 670 "PM: Compressing and saving image data (%u pages)...\n",
672 nr_threads, nr_to_write); 671 nr_threads, nr_to_write);
673 m = nr_to_write / 100; 672 m = nr_to_write / 10;
674 if (!m) 673 if (!m)
675 m = 1; 674 m = 1;
676 nr_pages = 0; 675 nr_pages = 0;
@@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle,
690 data_of(*snapshot), PAGE_SIZE); 689 data_of(*snapshot), PAGE_SIZE);
691 690
692 if (!(nr_pages % m)) 691 if (!(nr_pages % m))
693 printk(KERN_CONT "\b\b\b\b%3d%%", 692 printk(KERN_INFO
694 nr_pages / m); 693 "PM: Image saving progress: "
694 "%3d%%\n",
695 nr_pages / m * 10);
695 nr_pages++; 696 nr_pages++;
696 } 697 }
697 if (!off) 698 if (!off)
@@ -761,11 +762,8 @@ out_finish:
761 do_gettimeofday(&stop); 762 do_gettimeofday(&stop);
762 if (!ret) 763 if (!ret)
763 ret = err2; 764 ret = err2;
764 if (!ret) { 765 if (!ret)
765 printk(KERN_CONT "\b\b\b\bdone\n"); 766 printk(KERN_INFO "PM: Image saving done.\n");
766 } else {
767 printk(KERN_CONT "\n");
768 }
769 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
770out_clean: 768out_clean:
771 if (crc) { 769 if (crc) {
@@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle,
973 int err2; 971 int err2;
974 unsigned nr_pages; 972 unsigned nr_pages;
975 973
976 printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", 974 printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
977 nr_to_read); 975 nr_to_read);
978 m = nr_to_read / 100; 976 m = nr_to_read / 10;
979 if (!m) 977 if (!m)
980 m = 1; 978 m = 1;
981 nr_pages = 0; 979 nr_pages = 0;
@@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle,
993 if (ret) 991 if (ret)
994 break; 992 break;
995 if (!(nr_pages % m)) 993 if (!(nr_pages % m))
996 printk("\b\b\b\b%3d%%", nr_pages / m); 994 printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
995 nr_pages / m * 10);
997 nr_pages++; 996 nr_pages++;
998 } 997 }
999 err2 = hib_wait_on_bio_chain(&bio); 998 err2 = hib_wait_on_bio_chain(&bio);
@@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle,
1001 if (!ret) 1000 if (!ret)
1002 ret = err2; 1001 ret = err2;
1003 if (!ret) { 1002 if (!ret) {
1004 printk("\b\b\b\bdone\n"); 1003 printk(KERN_INFO "PM: Image loading done.\n");
1005 snapshot_write_finalize(snapshot); 1004 snapshot_write_finalize(snapshot);
1006 if (!snapshot_image_loaded(snapshot)) 1005 if (!snapshot_image_loaded(snapshot))
1007 ret = -ENODATA; 1006 ret = -ENODATA;
1008 } else 1007 }
1009 printk("\n");
1010 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1011 return ret; 1009 return ret;
1012} 1010}
@@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
1185 1183
1186 printk(KERN_INFO 1184 printk(KERN_INFO
1187 "PM: Using %u thread(s) for decompression.\n" 1185 "PM: Using %u thread(s) for decompression.\n"
1188 "PM: Loading and decompressing image data (%u pages) ... ", 1186 "PM: Loading and decompressing image data (%u pages)...\n",
1189 nr_threads, nr_to_read); 1187 nr_threads, nr_to_read);
1190 m = nr_to_read / 100; 1188 m = nr_to_read / 10;
1191 if (!m) 1189 if (!m)
1192 m = 1; 1190 m = 1;
1193 nr_pages = 0; 1191 nr_pages = 0;
@@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle,
1319 data[thr].unc + off, PAGE_SIZE); 1317 data[thr].unc + off, PAGE_SIZE);
1320 1318
1321 if (!(nr_pages % m)) 1319 if (!(nr_pages % m))
1322 printk("\b\b\b\b%3d%%", nr_pages / m); 1320 printk(KERN_INFO
1321 "PM: Image loading progress: "
1322 "%3d%%\n",
1323 nr_pages / m * 10);
1323 nr_pages++; 1324 nr_pages++;
1324 1325
1325 ret = snapshot_write_next(snapshot); 1326 ret = snapshot_write_next(snapshot);
@@ -1344,7 +1345,7 @@ out_finish:
1344 } 1345 }
1345 do_gettimeofday(&stop); 1346 do_gettimeofday(&stop);
1346 if (!ret) { 1347 if (!ret) {
1347 printk("\b\b\b\bdone\n"); 1348 printk(KERN_INFO "PM: Image loading done.\n");
1348 snapshot_write_finalize(snapshot); 1349 snapshot_write_finalize(snapshot);
1349 if (!snapshot_image_loaded(snapshot)) 1350 if (!snapshot_image_loaded(snapshot))
1350 ret = -ENODATA; 1351 ret = -ENODATA;
@@ -1357,8 +1358,7 @@ out_finish:
1357 } 1358 }
1358 } 1359 }
1359 } 1360 }
1360 } else 1361 }
1361 printk("\n");
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1363out_clean: 1363out_clean:
1364 for (i = 0; i < ring_size; i++) 1364 for (i = 0; i < ring_size; i++)
@@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode)
1472 blkdev_put(hib_resume_bdev, mode); 1472 blkdev_put(hib_resume_bdev, mode);
1473} 1473}
1474 1474
1475/**
1476 * swsusp_unmark - Unmark swsusp signature in the resume device
1477 */
1478
1479#ifdef CONFIG_SUSPEND
1480int swsusp_unmark(void)
1481{
1482 int error;
1483
1484 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
1485 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
1486 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
1487 error = hib_bio_write_page(swsusp_resume_block,
1488 swsusp_header, NULL);
1489 } else {
1490 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
1491 error = -ENODEV;
1492 }
1493
1494 /*
1495 * We just returned from suspend, we don't need the image any more.
1496 */
1497 free_all_swap_pages(root_swap);
1498
1499 return error;
1500}
1501#endif
1502
1475static int swsusp_header_init(void) 1503static int swsusp_header_init(void)
1476{ 1504{
1477 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); 1505 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index c8fba3380076..8f50de394d22 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -9,6 +9,7 @@
9 * manipulate wakelocks on Android. 9 * manipulate wakelocks on Android.
10 */ 10 */
11 11
12#include <linux/capability.h>
12#include <linux/ctype.h> 13#include <linux/ctype.h>
13#include <linux/device.h> 14#include <linux/device.h>
14#include <linux/err.h> 15#include <linux/err.h>
@@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf)
188 size_t len; 189 size_t len;
189 int ret = 0; 190 int ret = 0;
190 191
192 if (!capable(CAP_BLOCK_SUSPEND))
193 return -EPERM;
194
191 while (*str && !isspace(*str)) 195 while (*str && !isspace(*str))
192 str++; 196 str++;
193 197
@@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf)
231 size_t len; 235 size_t len;
232 int ret = 0; 236 int ret = 0;
233 237
238 if (!capable(CAP_BLOCK_SUSPEND))
239 return -EPERM;
240
234 len = strlen(buf); 241 len = strlen(buf);
235 if (!len) 242 if (!len)
236 return -EINVAL; 243 return -EINVAL;
diff --git a/kernel/printk.c b/kernel/printk.c
index ac4bc9e79465..6a76ab9d4476 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -216,6 +216,7 @@ struct log {
216 */ 216 */
217static DEFINE_RAW_SPINLOCK(logbuf_lock); 217static DEFINE_RAW_SPINLOCK(logbuf_lock);
218 218
219#ifdef CONFIG_PRINTK
219/* the next printk record to read by syslog(READ) or /proc/kmsg */ 220/* the next printk record to read by syslog(READ) or /proc/kmsg */
220static u64 syslog_seq; 221static u64 syslog_seq;
221static u32 syslog_idx; 222static u32 syslog_idx;
@@ -228,14 +229,19 @@ static u32 log_first_idx;
228 229
229/* index and sequence number of the next record to store in the buffer */ 230/* index and sequence number of the next record to store in the buffer */
230static u64 log_next_seq; 231static u64 log_next_seq;
231#ifdef CONFIG_PRINTK
232static u32 log_next_idx; 232static u32 log_next_idx;
233 233
234/* the next printk record to write to the console */
235static u64 console_seq;
236static u32 console_idx;
237static enum log_flags console_prev;
238
234/* the next printk record to read after the last 'clear' command */ 239/* the next printk record to read after the last 'clear' command */
235static u64 clear_seq; 240static u64 clear_seq;
236static u32 clear_idx; 241static u32 clear_idx;
237 242
238#define LOG_LINE_MAX 1024 243#define PREFIX_MAX 32
244#define LOG_LINE_MAX 1024 - PREFIX_MAX
239 245
240/* record buffer */ 246/* record buffer */
241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 247#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -360,6 +366,7 @@ static void log_store(int facility, int level,
360struct devkmsg_user { 366struct devkmsg_user {
361 u64 seq; 367 u64 seq;
362 u32 idx; 368 u32 idx;
369 enum log_flags prev;
363 struct mutex lock; 370 struct mutex lock;
364 char buf[8192]; 371 char buf[8192];
365}; 372};
@@ -382,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
382 389
383 line = buf; 390 line = buf;
384 for (i = 0; i < count; i++) { 391 for (i = 0; i < count; i++) {
385 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) 392 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
393 ret = -EFAULT;
386 goto out; 394 goto out;
395 }
387 line += iv[i].iov_len; 396 line += iv[i].iov_len;
388 } 397 }
389 398
@@ -425,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
425 struct log *msg; 434 struct log *msg;
426 u64 ts_usec; 435 u64 ts_usec;
427 size_t i; 436 size_t i;
437 char cont = '-';
428 size_t len; 438 size_t len;
429 ssize_t ret; 439 ssize_t ret;
430 440
@@ -462,8 +472,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
462 msg = log_from_idx(user->idx); 472 msg = log_from_idx(user->idx);
463 ts_usec = msg->ts_nsec; 473 ts_usec = msg->ts_nsec;
464 do_div(ts_usec, 1000); 474 do_div(ts_usec, 1000);
465 len = sprintf(user->buf, "%u,%llu,%llu;", 475
466 (msg->facility << 3) | msg->level, user->seq, ts_usec); 476 /*
477 * If we couldn't merge continuation line fragments during the print,
478 * export the stored flags to allow an optional external merge of the
479 * records. Merging the records isn't always neccessarily correct, like
480 * when we hit a race during printing. In most cases though, it produces
481 * better readable output. 'c' in the record flags mark the first
482 * fragment of a line, '+' the following.
483 */
484 if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
485 cont = 'c';
486 else if ((msg->flags & LOG_CONT) ||
487 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
488 cont = '+';
489
490 len = sprintf(user->buf, "%u,%llu,%llu,%c;",
491 (msg->facility << 3) | msg->level,
492 user->seq, ts_usec, cont);
493 user->prev = msg->flags;
467 494
468 /* escape non-printable characters */ 495 /* escape non-printable characters */
469 for (i = 0; i < msg->text_len; i++) { 496 for (i = 0; i < msg->text_len; i++) {
@@ -646,6 +673,15 @@ void log_buf_kexec_setup(void)
646 VMCOREINFO_SYMBOL(log_buf_len); 673 VMCOREINFO_SYMBOL(log_buf_len);
647 VMCOREINFO_SYMBOL(log_first_idx); 674 VMCOREINFO_SYMBOL(log_first_idx);
648 VMCOREINFO_SYMBOL(log_next_idx); 675 VMCOREINFO_SYMBOL(log_next_idx);
676 /*
677 * Export struct log size and field offsets. User space tools can
678 * parse it and detect any changes to structure down the line.
679 */
680 VMCOREINFO_STRUCT_SIZE(log);
681 VMCOREINFO_OFFSET(log, ts_nsec);
682 VMCOREINFO_OFFSET(log, len);
683 VMCOREINFO_OFFSET(log, text_len);
684 VMCOREINFO_OFFSET(log, dict_len);
649} 685}
650#endif 686#endif
651 687
@@ -876,7 +912,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
876 912
877 if (buf) { 913 if (buf) {
878 if (print_prefix(msg, syslog, NULL) + 914 if (print_prefix(msg, syslog, NULL) +
879 text_len + 1>= size - len) 915 text_len + 1 >= size - len)
880 break; 916 break;
881 917
882 if (prefix) 918 if (prefix)
@@ -907,7 +943,7 @@ static int syslog_print(char __user *buf, int size)
907 struct log *msg; 943 struct log *msg;
908 int len = 0; 944 int len = 0;
909 945
910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 946 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
911 if (!text) 947 if (!text)
912 return -ENOMEM; 948 return -ENOMEM;
913 949
@@ -930,7 +966,8 @@ static int syslog_print(char __user *buf, int size)
930 966
931 skip = syslog_partial; 967 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx); 968 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); 969 n = msg_print_text(msg, syslog_prev, true, text,
970 LOG_LINE_MAX + PREFIX_MAX);
934 if (n - syslog_partial <= size) { 971 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */ 972 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx); 973 syslog_idx = log_next(syslog_idx);
@@ -969,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
969 char *text; 1006 char *text;
970 int len = 0; 1007 int len = 0;
971 1008
972 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 1009 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
973 if (!text) 1010 if (!text)
974 return -ENOMEM; 1011 return -ENOMEM;
975 1012
@@ -1022,7 +1059,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1022 struct log *msg = log_from_idx(idx); 1059 struct log *msg = log_from_idx(idx);
1023 int textlen; 1060 int textlen;
1024 1061
1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); 1062 textlen = msg_print_text(msg, prev, true, text,
1063 LOG_LINE_MAX + PREFIX_MAX);
1026 if (textlen < 0) { 1064 if (textlen < 0) {
1027 len = textlen; 1065 len = textlen;
1028 break; 1066 break;
@@ -1349,20 +1387,36 @@ static struct cont {
1349 u64 ts_nsec; /* time of first print */ 1387 u64 ts_nsec; /* time of first print */
1350 u8 level; /* log level of first message */ 1388 u8 level; /* log level of first message */
1351 u8 facility; /* log level of first message */ 1389 u8 facility; /* log level of first message */
1390 enum log_flags flags; /* prefix, newline flags */
1352 bool flushed:1; /* buffer sealed and committed */ 1391 bool flushed:1; /* buffer sealed and committed */
1353} cont; 1392} cont;
1354 1393
1355static void cont_flush(void) 1394static void cont_flush(enum log_flags flags)
1356{ 1395{
1357 if (cont.flushed) 1396 if (cont.flushed)
1358 return; 1397 return;
1359 if (cont.len == 0) 1398 if (cont.len == 0)
1360 return; 1399 return;
1361 1400
1362 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, 1401 if (cont.cons) {
1363 NULL, 0, cont.buf, cont.len); 1402 /*
1364 1403 * If a fragment of this line was directly flushed to the
1365 cont.flushed = true; 1404 * console; wait for the console to pick up the rest of the
1405 * line. LOG_NOCONS suppresses a duplicated output.
1406 */
1407 log_store(cont.facility, cont.level, flags | LOG_NOCONS,
1408 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1409 cont.flags = flags;
1410 cont.flushed = true;
1411 } else {
1412 /*
1413 * If no fragment of this line ever reached the console,
1414 * just submit it to the store and free the buffer.
1415 */
1416 log_store(cont.facility, cont.level, flags, 0,
1417 NULL, 0, cont.buf, cont.len);
1418 cont.len = 0;
1419 }
1366} 1420}
1367 1421
1368static bool cont_add(int facility, int level, const char *text, size_t len) 1422static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1371,7 +1425,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1371 return false; 1425 return false;
1372 1426
1373 if (cont.len + len > sizeof(cont.buf)) { 1427 if (cont.len + len > sizeof(cont.buf)) {
1374 cont_flush(); 1428 /* the line gets too long, split it up in separate records */
1429 cont_flush(LOG_CONT);
1375 return false; 1430 return false;
1376 } 1431 }
1377 1432
@@ -1380,12 +1435,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1380 cont.level = level; 1435 cont.level = level;
1381 cont.owner = current; 1436 cont.owner = current;
1382 cont.ts_nsec = local_clock(); 1437 cont.ts_nsec = local_clock();
1438 cont.flags = 0;
1383 cont.cons = 0; 1439 cont.cons = 0;
1384 cont.flushed = false; 1440 cont.flushed = false;
1385 } 1441 }
1386 1442
1387 memcpy(cont.buf + cont.len, text, len); 1443 memcpy(cont.buf + cont.len, text, len);
1388 cont.len += len; 1444 cont.len += len;
1445
1446 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1447 cont_flush(LOG_CONT);
1448
1389 return true; 1449 return true;
1390} 1450}
1391 1451
@@ -1394,7 +1454,7 @@ static size_t cont_print_text(char *text, size_t size)
1394 size_t textlen = 0; 1454 size_t textlen = 0;
1395 size_t len; 1455 size_t len;
1396 1456
1397 if (cont.cons == 0) { 1457 if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
1398 textlen += print_time(cont.ts_nsec, text); 1458 textlen += print_time(cont.ts_nsec, text);
1399 size -= textlen; 1459 size -= textlen;
1400 } 1460 }
@@ -1409,7 +1469,8 @@ static size_t cont_print_text(char *text, size_t size)
1409 } 1469 }
1410 1470
1411 if (cont.flushed) { 1471 if (cont.flushed) {
1412 text[textlen++] = '\n'; 1472 if (cont.flags & LOG_NEWLINE)
1473 text[textlen++] = '\n';
1413 /* got everything, release buffer */ 1474 /* got everything, release buffer */
1414 cont.len = 0; 1475 cont.len = 0;
1415 } 1476 }
@@ -1481,17 +1542,23 @@ asmlinkage int vprintk_emit(int facility, int level,
1481 lflags |= LOG_NEWLINE; 1542 lflags |= LOG_NEWLINE;
1482 } 1543 }
1483 1544
1484 /* strip syslog prefix and extract log level or control flags */ 1545 /* strip kernel syslog prefix and extract log level or control flags */
1485 if (text[0] == '<' && text[1] && text[2] == '>') { 1546 if (facility == 0) {
1486 switch (text[1]) { 1547 int kern_level = printk_get_level(text);
1487 case '0' ... '7': 1548
1488 if (level == -1) 1549 if (kern_level) {
1489 level = text[1] - '0'; 1550 const char *end_of_header = printk_skip_level(text);
1490 case 'd': /* KERN_DEFAULT */ 1551 switch (kern_level) {
1491 lflags |= LOG_PREFIX; 1552 case '0' ... '7':
1492 case 'c': /* KERN_CONT */ 1553 if (level == -1)
1493 text += 3; 1554 level = kern_level - '0';
1494 text_len -= 3; 1555 case 'd': /* KERN_DEFAULT */
1556 lflags |= LOG_PREFIX;
1557 case 'c': /* KERN_CONT */
1558 break;
1559 }
1560 text_len -= end_of_header - text;
1561 text = (char *)end_of_header;
1495 } 1562 }
1496 } 1563 }
1497 1564
@@ -1507,7 +1574,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1507 * or another task also prints continuation lines. 1574 * or another task also prints continuation lines.
1508 */ 1575 */
1509 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) 1576 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1510 cont_flush(); 1577 cont_flush(LOG_NEWLINE);
1511 1578
1512 /* buffer line if possible, otherwise store it right away */ 1579 /* buffer line if possible, otherwise store it right away */
1513 if (!cont_add(facility, level, text, text_len)) 1580 if (!cont_add(facility, level, text, text_len))
@@ -1525,7 +1592,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1525 if (cont.len && cont.owner == current) { 1592 if (cont.len && cont.owner == current) {
1526 if (!(lflags & LOG_PREFIX)) 1593 if (!(lflags & LOG_PREFIX))
1527 stored = cont_add(facility, level, text, text_len); 1594 stored = cont_add(facility, level, text, text_len);
1528 cont_flush(); 1595 cont_flush(LOG_NEWLINE);
1529 } 1596 }
1530 1597
1531 if (!stored) 1598 if (!stored)
@@ -1616,9 +1683,20 @@ asmlinkage int printk(const char *fmt, ...)
1616} 1683}
1617EXPORT_SYMBOL(printk); 1684EXPORT_SYMBOL(printk);
1618 1685
1619#else 1686#else /* CONFIG_PRINTK */
1620 1687
1688#define LOG_LINE_MAX 0
1689#define PREFIX_MAX 0
1621#define LOG_LINE_MAX 0 1690#define LOG_LINE_MAX 0
1691static u64 syslog_seq;
1692static u32 syslog_idx;
1693static u64 console_seq;
1694static u32 console_idx;
1695static enum log_flags syslog_prev;
1696static u64 log_first_seq;
1697static u32 log_first_idx;
1698static u64 log_next_seq;
1699static enum log_flags console_prev;
1622static struct cont { 1700static struct cont {
1623 size_t len; 1701 size_t len;
1624 size_t cons; 1702 size_t cons;
@@ -1902,10 +1980,34 @@ void wake_up_klogd(void)
1902 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1980 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1903} 1981}
1904 1982
1905/* the next printk record to write to the console */ 1983static void console_cont_flush(char *text, size_t size)
1906static u64 console_seq; 1984{
1907static u32 console_idx; 1985 unsigned long flags;
1908static enum log_flags console_prev; 1986 size_t len;
1987
1988 raw_spin_lock_irqsave(&logbuf_lock, flags);
1989
1990 if (!cont.len)
1991 goto out;
1992
1993 /*
1994 * We still queue earlier records, likely because the console was
1995 * busy. The earlier ones need to be printed before this one, we
1996 * did not flush any fragment so far, so just let it queue up.
1997 */
1998 if (console_seq < log_next_seq && !cont.cons)
1999 goto out;
2000
2001 len = cont_print_text(text, size);
2002 raw_spin_unlock(&logbuf_lock);
2003 stop_critical_timings();
2004 call_console_drivers(cont.level, text, len);
2005 start_critical_timings();
2006 local_irq_restore(flags);
2007 return;
2008out:
2009 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2010}
1909 2011
1910/** 2012/**
1911 * console_unlock - unlock the console system 2013 * console_unlock - unlock the console system
@@ -1923,7 +2025,7 @@ static enum log_flags console_prev;
1923 */ 2025 */
1924void console_unlock(void) 2026void console_unlock(void)
1925{ 2027{
1926 static char text[LOG_LINE_MAX]; 2028 static char text[LOG_LINE_MAX + PREFIX_MAX];
1927 static u64 seen_seq; 2029 static u64 seen_seq;
1928 unsigned long flags; 2030 unsigned long flags;
1929 bool wake_klogd = false; 2031 bool wake_klogd = false;
@@ -1937,19 +2039,7 @@ void console_unlock(void)
1937 console_may_schedule = 0; 2039 console_may_schedule = 0;
1938 2040
1939 /* flush buffered message fragment immediately to console */ 2041 /* flush buffered message fragment immediately to console */
1940 raw_spin_lock_irqsave(&logbuf_lock, flags); 2042 console_cont_flush(text, sizeof(text));
1941 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1942 size_t len;
1943
1944 len = cont_print_text(text, sizeof(text));
1945 raw_spin_unlock(&logbuf_lock);
1946 stop_critical_timings();
1947 call_console_drivers(cont.level, text, len);
1948 start_critical_timings();
1949 local_irq_restore(flags);
1950 } else
1951 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1952
1953again: 2043again:
1954 for (;;) { 2044 for (;;) {
1955 struct log *msg; 2045 struct log *msg;
@@ -1986,6 +2076,7 @@ skip:
1986 * will properly dump everything later. 2076 * will properly dump everything later.
1987 */ 2077 */
1988 msg->flags &= ~LOG_NOCONS; 2078 msg->flags &= ~LOG_NOCONS;
2079 console_prev = msg->flags;
1989 goto skip; 2080 goto skip;
1990 } 2081 }
1991 2082
diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8ee76d5..34d45886ee84 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/errno.h> 13#include <linux/errno.h>
12#include <linux/ioport.h> 14#include <linux/ioport.h>
@@ -722,14 +724,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
722 724
723 write_lock(&resource_lock); 725 write_lock(&resource_lock);
724 726
727 if (!parent)
728 goto skip;
729
725 if ((start < parent->start) || (end > parent->end)) 730 if ((start < parent->start) || (end > parent->end))
726 goto out; 731 goto out;
727 732
728 for (tmp = res->child; tmp; tmp = tmp->sibling) {
729 if ((tmp->start < start) || (tmp->end > end))
730 goto out;
731 }
732
733 if (res->sibling && (res->sibling->start <= end)) 733 if (res->sibling && (res->sibling->start <= end))
734 goto out; 734 goto out;
735 735
@@ -741,6 +741,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
741 goto out; 741 goto out;
742 } 742 }
743 743
744skip:
745 for (tmp = res->child; tmp; tmp = tmp->sibling)
746 if ((tmp->start < start) || (tmp->end > end))
747 goto out;
748
744 res->start = start; 749 res->start = start;
745 res->end = end; 750 res->end = end;
746 result = 0; 751 result = 0;
@@ -788,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
788 resource_size_t start, resource_size_t end, 793 resource_size_t start, resource_size_t end,
789 const char *name) 794 const char *name)
790{ 795{
796 int abort = 0;
797
791 write_lock(&resource_lock); 798 write_lock(&resource_lock);
792 __reserve_region_with_split(root, start, end, name); 799 if (root->start > start || root->end < end) {
800 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
801 (unsigned long long)start, (unsigned long long)end,
802 root);
803 if (start > root->end || end < root->start)
804 abort = 1;
805 else {
806 if (end > root->end)
807 end = root->end;
808 if (start < root->start)
809 start = root->start;
810 pr_err("fixing request to [0x%llx-0x%llx]\n",
811 (unsigned long long)start,
812 (unsigned long long)end);
813 }
814 dump_stack();
815 }
816 if (!abort)
817 __reserve_region_with_split(root, start, end, name);
793 write_unlock(&resource_lock); 818 write_unlock(&resource_lock);
794} 819}
795 820
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad732b56ba70..82ad284f823b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1097 * 1097 *
1098 * sched_move_task() holds both and thus holding either pins the cgroup, 1098 * sched_move_task() holds both and thus holding either pins the cgroup,
1099 * see set_task_rq(). 1099 * see task_group().
1100 * 1100 *
1101 * Furthermore, all task_rq users should acquire both locks, see 1101 * Furthermore, all task_rq users should acquire both locks, see
1102 * task_rq_lock(). 1102 * task_rq_lock().
@@ -4340,9 +4340,7 @@ recheck:
4340 */ 4340 */
4341 if (unlikely(policy == p->policy && (!rt_policy(policy) || 4341 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4342 param->sched_priority == p->rt_priority))) { 4342 param->sched_priority == p->rt_priority))) {
4343 4343 task_rq_unlock(rq, p, &flags);
4344 __task_rq_unlock(rq);
4345 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4346 return 0; 4344 return 0;
4347 } 4345 }
4348 4346
@@ -6024,6 +6022,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 6022 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6025 * allows us to avoid some pointer chasing select_idle_sibling(). 6023 * allows us to avoid some pointer chasing select_idle_sibling().
6026 * 6024 *
6025 * Iterate domains and sched_groups downward, assigning CPUs to be
6026 * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
6027 * due to random perturbation self canceling, ie sw buddies pull
6028 * their counterpart to their CPU's hw counterpart.
6029 *
6027 * Also keep a unique ID per domain (we use the first cpu number in 6030 * Also keep a unique ID per domain (we use the first cpu number in
6028 * the cpumask of the domain), this allows us to quickly tell if 6031 * the cpumask of the domain), this allows us to quickly tell if
6029 * two cpus are in the same cache domain, see cpus_share_cache(). 6032 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6040,40 @@ static void update_top_cache_domain(int cpu)
6037 int id = cpu; 6040 int id = cpu;
6038 6041
6039 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 6042 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6040 if (sd) 6043 if (sd) {
6044 struct sched_domain *tmp = sd;
6045 struct sched_group *sg, *prev;
6046 bool right;
6047
6048 /*
6049 * Traverse to first CPU in group, and count hops
6050 * to cpu from there, switching direction on each
6051 * hop, never ever pointing the last CPU rightward.
6052 */
6053 do {
6054 id = cpumask_first(sched_domain_span(tmp));
6055 prev = sg = tmp->groups;
6056 right = 1;
6057
6058 while (cpumask_first(sched_group_cpus(sg)) != id)
6059 sg = sg->next;
6060
6061 while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
6062 prev = sg;
6063 sg = sg->next;
6064 right = !right;
6065 }
6066
6067 /* A CPU went down, never point back to domain start. */
6068 if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
6069 right = false;
6070
6071 sg = right ? sg->next : prev;
6072 tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
6073 } while ((tmp = tmp->child));
6074
6041 id = cpumask_first(sched_domain_span(sd)); 6075 id = cpumask_first(sched_domain_span(sd));
6076 }
6042 6077
6043 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6078 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6044 per_cpu(sd_llc_id, cpu) = id; 6079 per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7132,66 @@ match2:
7097 mutex_unlock(&sched_domains_mutex); 7132 mutex_unlock(&sched_domains_mutex);
7098} 7133}
7099 7134
7135static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
7136
7100/* 7137/*
7101 * Update cpusets according to cpu_active mask. If cpusets are 7138 * Update cpusets according to cpu_active mask. If cpusets are
7102 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7139 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7103 * around partition_sched_domains(). 7140 * around partition_sched_domains().
7141 *
7142 * If we come here as part of a suspend/resume, don't touch cpusets because we
7143 * want to restore it back to its original state upon resume anyway.
7104 */ 7144 */
7105static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7145static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7106 void *hcpu) 7146 void *hcpu)
7107{ 7147{
7108 switch (action & ~CPU_TASKS_FROZEN) { 7148 switch (action) {
7149 case CPU_ONLINE_FROZEN:
7150 case CPU_DOWN_FAILED_FROZEN:
7151
7152 /*
7153 * num_cpus_frozen tracks how many CPUs are involved in suspend
7154 * resume sequence. As long as this is not the last online
7155 * operation in the resume sequence, just build a single sched
7156 * domain, ignoring cpusets.
7157 */
7158 num_cpus_frozen--;
7159 if (likely(num_cpus_frozen)) {
7160 partition_sched_domains(1, NULL, NULL);
7161 break;
7162 }
7163
7164 /*
7165 * This is the last CPU online operation. So fall through and
7166 * restore the original sched domains by considering the
7167 * cpuset configurations.
7168 */
7169
7109 case CPU_ONLINE: 7170 case CPU_ONLINE:
7110 case CPU_DOWN_FAILED: 7171 case CPU_DOWN_FAILED:
7111 cpuset_update_active_cpus(); 7172 cpuset_update_active_cpus(true);
7112 return NOTIFY_OK; 7173 break;
7113 default: 7174 default:
7114 return NOTIFY_DONE; 7175 return NOTIFY_DONE;
7115 } 7176 }
7177 return NOTIFY_OK;
7116} 7178}
7117 7179
7118static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7180static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7119 void *hcpu) 7181 void *hcpu)
7120{ 7182{
7121 switch (action & ~CPU_TASKS_FROZEN) { 7183 switch (action) {
7122 case CPU_DOWN_PREPARE: 7184 case CPU_DOWN_PREPARE:
7123 cpuset_update_active_cpus(); 7185 cpuset_update_active_cpus(false);
7124 return NOTIFY_OK; 7186 break;
7187 case CPU_DOWN_PREPARE_FROZEN:
7188 num_cpus_frozen++;
7189 partition_sched_domains(1, NULL, NULL);
7190 break;
7125 default: 7191 default:
7126 return NOTIFY_DONE; 7192 return NOTIFY_DONE;
7127 } 7193 }
7194 return NOTIFY_OK;
7128} 7195}
7129 7196
7130void __init sched_init_smp(void) 7197void __init sched_init_smp(void)
@@ -7589,6 +7656,7 @@ void sched_destroy_group(struct task_group *tg)
7589 */ 7656 */
7590void sched_move_task(struct task_struct *tsk) 7657void sched_move_task(struct task_struct *tsk)
7591{ 7658{
7659 struct task_group *tg;
7592 int on_rq, running; 7660 int on_rq, running;
7593 unsigned long flags; 7661 unsigned long flags;
7594 struct rq *rq; 7662 struct rq *rq;
@@ -7603,6 +7671,12 @@ void sched_move_task(struct task_struct *tsk)
7603 if (unlikely(running)) 7671 if (unlikely(running))
7604 tsk->sched_class->put_prev_task(rq, tsk); 7672 tsk->sched_class->put_prev_task(rq, tsk);
7605 7673
7674 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7675 lockdep_is_held(&tsk->sighand->siglock)),
7676 struct task_group, css);
7677 tg = autogroup_task_group(tsk, tg);
7678 tsk->sched_task_group = tg;
7679
7606#ifdef CONFIG_FAIR_GROUP_SCHED 7680#ifdef CONFIG_FAIR_GROUP_SCHED
7607 if (tsk->sched_class->task_move_group) 7681 if (tsk->sched_class->task_move_group)
7608 tsk->sched_class->task_move_group(tsk, on_rq); 7682 tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf660..23aa789c53ee 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
65int cpupri_find(struct cpupri *cp, struct task_struct *p, 65int cpupri_find(struct cpupri *cp, struct task_struct *p,
66 struct cpumask *lowest_mask) 66 struct cpumask *lowest_mask)
67{ 67{
68 int idx = 0; 68 int idx = 0;
69 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
70 70
71 if (task_pri >= MAX_RT_PRIO) 71 if (task_pri >= MAX_RT_PRIO)
72 return 0; 72 return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
137 */ 137 */
138void cpupri_set(struct cpupri *cp, int cpu, int newpri) 138void cpupri_set(struct cpupri *cp, int cpu, int newpri)
139{ 139{
140 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
141 int oldpri = *currpri; 141 int oldpri = *currpri;
142 int do_mb = 0; 142 int do_mb = 0;
143 143
144 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
145 145
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..d0cc03b3e70b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2637 int cpu = smp_processor_id(); 2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p); 2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642 2640
2643 /* 2641 /*
2644 * If the task is going to be woken-up on this cpu and if it is 2642 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
2655 return prev_cpu; 2653 return prev_cpu;
2656 2654
2657 /* 2655 /*
2658 * Otherwise, iterate the domains and find an elegible idle cpu. 2656 * Otherwise, check assigned siblings to find an elegible idle cpu.
2659 */ 2657 */
2660 sd = rcu_dereference(per_cpu(sd_llc, target)); 2658 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672 2659
2673 target = cpumask_first_and(sched_group_cpus(sg), 2660 for_each_lower_domain(sd) {
2674 tsk_cpus_allowed(p)); 2661 if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
2675 goto done; 2662 continue;
2676next: 2663 if (idle_cpu(sd->idle_buddy))
2677 sg = sg->next; 2664 return sd->idle_buddy;
2678 } while (sg != sd->groups);
2679 } 2665 }
2680done: 2666
2681 return target; 2667 return target;
2682} 2668}
2683 2669
@@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068 3054
3069#define LBF_ALL_PINNED 0x01 3055#define LBF_ALL_PINNED 0x01
3070#define LBF_NEED_BREAK 0x02 3056#define LBF_NEED_BREAK 0x02
3057#define LBF_SOME_PINNED 0x04
3071 3058
3072struct lb_env { 3059struct lb_env {
3073 struct sched_domain *sd; 3060 struct sched_domain *sd;
3074 3061
3075 int src_cpu;
3076 struct rq *src_rq; 3062 struct rq *src_rq;
3063 int src_cpu;
3077 3064
3078 int dst_cpu; 3065 int dst_cpu;
3079 struct rq *dst_rq; 3066 struct rq *dst_rq;
3080 3067
3068 struct cpumask *dst_grpmask;
3069 int new_dst_cpu;
3081 enum cpu_idle_type idle; 3070 enum cpu_idle_type idle;
3082 long imbalance; 3071 long imbalance;
3072 /* The set of CPUs under consideration for load-balancing */
3073 struct cpumask *cpus;
3074
3083 unsigned int flags; 3075 unsigned int flags;
3084 3076
3085 unsigned int loop; 3077 unsigned int loop;
@@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3145 * 3) are cache-hot on their current CPU. 3137 * 3) are cache-hot on their current CPU.
3146 */ 3138 */
3147 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3139 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3140 int new_dst_cpu;
3141
3148 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3142 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3143
3144 /*
3145 * Remember if this task can be migrated to any other cpu in
3146 * our sched_group. We may want to revisit it if we couldn't
3147 * meet load balance goals by pulling other tasks on src_cpu.
3148 *
3149 * Also avoid computing new_dst_cpu if we have already computed
3150 * one in current iteration.
3151 */
3152 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3153 return 0;
3154
3155 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3156 tsk_cpus_allowed(p));
3157 if (new_dst_cpu < nr_cpu_ids) {
3158 env->flags |= LBF_SOME_PINNED;
3159 env->new_dst_cpu = new_dst_cpu;
3160 }
3149 return 0; 3161 return 0;
3150 } 3162 }
3163
3164 /* Record that we found atleast one task that could run on dst_cpu */
3151 env->flags &= ~LBF_ALL_PINNED; 3165 env->flags &= ~LBF_ALL_PINNED;
3152 3166
3153 if (task_running(env->src_rq, p)) { 3167 if (task_running(env->src_rq, p)) {
@@ -3642,8 +3656,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3642 */ 3656 */
3643static inline void update_sg_lb_stats(struct lb_env *env, 3657static inline void update_sg_lb_stats(struct lb_env *env,
3644 struct sched_group *group, int load_idx, 3658 struct sched_group *group, int load_idx,
3645 int local_group, const struct cpumask *cpus, 3659 int local_group, int *balance, struct sg_lb_stats *sgs)
3646 int *balance, struct sg_lb_stats *sgs)
3647{ 3660{
3648 unsigned long nr_running, max_nr_running, min_nr_running; 3661 unsigned long nr_running, max_nr_running, min_nr_running;
3649 unsigned long load, max_cpu_load, min_cpu_load; 3662 unsigned long load, max_cpu_load, min_cpu_load;
@@ -3660,7 +3673,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3660 max_nr_running = 0; 3673 max_nr_running = 0;
3661 min_nr_running = ~0UL; 3674 min_nr_running = ~0UL;
3662 3675
3663 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3676 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
3664 struct rq *rq = cpu_rq(i); 3677 struct rq *rq = cpu_rq(i);
3665 3678
3666 nr_running = rq->nr_running; 3679 nr_running = rq->nr_running;
@@ -3789,8 +3802,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3789 * @sds: variable to hold the statistics for this sched_domain. 3802 * @sds: variable to hold the statistics for this sched_domain.
3790 */ 3803 */
3791static inline void update_sd_lb_stats(struct lb_env *env, 3804static inline void update_sd_lb_stats(struct lb_env *env,
3792 const struct cpumask *cpus, 3805 int *balance, struct sd_lb_stats *sds)
3793 int *balance, struct sd_lb_stats *sds)
3794{ 3806{
3795 struct sched_domain *child = env->sd->child; 3807 struct sched_domain *child = env->sd->child;
3796 struct sched_group *sg = env->sd->groups; 3808 struct sched_group *sg = env->sd->groups;
@@ -3807,8 +3819,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3807 3819
3808 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 3820 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3809 memset(&sgs, 0, sizeof(sgs)); 3821 memset(&sgs, 0, sizeof(sgs));
3810 update_sg_lb_stats(env, sg, load_idx, local_group, 3822 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
3811 cpus, balance, &sgs);
3812 3823
3813 if (local_group && !(*balance)) 3824 if (local_group && !(*balance))
3814 return; 3825 return;
@@ -4044,7 +4055,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4044 * to restore balance. 4055 * to restore balance.
4045 * 4056 *
4046 * @env: The load balancing environment. 4057 * @env: The load balancing environment.
4047 * @cpus: The set of CPUs under consideration for load-balancing.
4048 * @balance: Pointer to a variable indicating if this_cpu 4058 * @balance: Pointer to a variable indicating if this_cpu
4049 * is the appropriate cpu to perform load balancing at this_level. 4059 * is the appropriate cpu to perform load balancing at this_level.
4050 * 4060 *
@@ -4054,7 +4064,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4054 * put to idle by rebalancing its tasks onto our group. 4064 * put to idle by rebalancing its tasks onto our group.
4055 */ 4065 */
4056static struct sched_group * 4066static struct sched_group *
4057find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) 4067find_busiest_group(struct lb_env *env, int *balance)
4058{ 4068{
4059 struct sd_lb_stats sds; 4069 struct sd_lb_stats sds;
4060 4070
@@ -4064,7 +4074,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4064 * Compute the various statistics relavent for load balancing at 4074 * Compute the various statistics relavent for load balancing at
4065 * this level. 4075 * this level.
4066 */ 4076 */
4067 update_sd_lb_stats(env, cpus, balance, &sds); 4077 update_sd_lb_stats(env, balance, &sds);
4068 4078
4069 /* 4079 /*
4070 * this_cpu is not the appropriate cpu to perform load balancing at 4080 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4144,8 +4154,7 @@ ret:
4144 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4154 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4145 */ 4155 */
4146static struct rq *find_busiest_queue(struct lb_env *env, 4156static struct rq *find_busiest_queue(struct lb_env *env,
4147 struct sched_group *group, 4157 struct sched_group *group)
4148 const struct cpumask *cpus)
4149{ 4158{
4150 struct rq *busiest = NULL, *rq; 4159 struct rq *busiest = NULL, *rq;
4151 unsigned long max_load = 0; 4160 unsigned long max_load = 0;
@@ -4160,7 +4169,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4160 if (!capacity) 4169 if (!capacity)
4161 capacity = fix_small_capacity(env->sd, group); 4170 capacity = fix_small_capacity(env->sd, group);
4162 4171
4163 if (!cpumask_test_cpu(i, cpus)) 4172 if (!cpumask_test_cpu(i, env->cpus))
4164 continue; 4173 continue;
4165 4174
4166 rq = cpu_rq(i); 4175 rq = cpu_rq(i);
@@ -4227,7 +4236,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4227 struct sched_domain *sd, enum cpu_idle_type idle, 4236 struct sched_domain *sd, enum cpu_idle_type idle,
4228 int *balance) 4237 int *balance)
4229{ 4238{
4230 int ld_moved, active_balance = 0; 4239 int ld_moved, cur_ld_moved, active_balance = 0;
4240 int lb_iterations, max_lb_iterations;
4231 struct sched_group *group; 4241 struct sched_group *group;
4232 struct rq *busiest; 4242 struct rq *busiest;
4233 unsigned long flags; 4243 unsigned long flags;
@@ -4237,16 +4247,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4237 .sd = sd, 4247 .sd = sd,
4238 .dst_cpu = this_cpu, 4248 .dst_cpu = this_cpu,
4239 .dst_rq = this_rq, 4249 .dst_rq = this_rq,
4250 .dst_grpmask = sched_group_cpus(sd->groups),
4240 .idle = idle, 4251 .idle = idle,
4241 .loop_break = sched_nr_migrate_break, 4252 .loop_break = sched_nr_migrate_break,
4253 .cpus = cpus,
4242 }; 4254 };
4243 4255
4244 cpumask_copy(cpus, cpu_active_mask); 4256 cpumask_copy(cpus, cpu_active_mask);
4257 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4245 4258
4246 schedstat_inc(sd, lb_count[idle]); 4259 schedstat_inc(sd, lb_count[idle]);
4247 4260
4248redo: 4261redo:
4249 group = find_busiest_group(&env, cpus, balance); 4262 group = find_busiest_group(&env, balance);
4250 4263
4251 if (*balance == 0) 4264 if (*balance == 0)
4252 goto out_balanced; 4265 goto out_balanced;
@@ -4256,7 +4269,7 @@ redo:
4256 goto out_balanced; 4269 goto out_balanced;
4257 } 4270 }
4258 4271
4259 busiest = find_busiest_queue(&env, group, cpus); 4272 busiest = find_busiest_queue(&env, group);
4260 if (!busiest) { 4273 if (!busiest) {
4261 schedstat_inc(sd, lb_nobusyq[idle]); 4274 schedstat_inc(sd, lb_nobusyq[idle]);
4262 goto out_balanced; 4275 goto out_balanced;
@@ -4267,6 +4280,7 @@ redo:
4267 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4280 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4268 4281
4269 ld_moved = 0; 4282 ld_moved = 0;
4283 lb_iterations = 1;
4270 if (busiest->nr_running > 1) { 4284 if (busiest->nr_running > 1) {
4271 /* 4285 /*
4272 * Attempt to move tasks. If find_busiest_group has found 4286 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4298,13 @@ more_balance:
4284 double_rq_lock(this_rq, busiest); 4298 double_rq_lock(this_rq, busiest);
4285 if (!env.loop) 4299 if (!env.loop)
4286 update_h_load(env.src_cpu); 4300 update_h_load(env.src_cpu);
4287 ld_moved += move_tasks(&env); 4301
4302 /*
4303 * cur_ld_moved - load moved in current iteration
4304 * ld_moved - cumulative load moved across iterations
4305 */
4306 cur_ld_moved = move_tasks(&env);
4307 ld_moved += cur_ld_moved;
4288 double_rq_unlock(this_rq, busiest); 4308 double_rq_unlock(this_rq, busiest);
4289 local_irq_restore(flags); 4309 local_irq_restore(flags);
4290 4310
@@ -4296,14 +4316,52 @@ more_balance:
4296 /* 4316 /*
4297 * some other cpu did the load balance for us. 4317 * some other cpu did the load balance for us.
4298 */ 4318 */
4299 if (ld_moved && this_cpu != smp_processor_id()) 4319 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4300 resched_cpu(this_cpu); 4320 resched_cpu(env.dst_cpu);
4321
4322 /*
4323 * Revisit (affine) tasks on src_cpu that couldn't be moved to
4324 * us and move them to an alternate dst_cpu in our sched_group
4325 * where they can run. The upper limit on how many times we
4326 * iterate on same src_cpu is dependent on number of cpus in our
4327 * sched_group.
4328 *
4329 * This changes load balance semantics a bit on who can move
4330 * load to a given_cpu. In addition to the given_cpu itself
4331 * (or a ilb_cpu acting on its behalf where given_cpu is
4332 * nohz-idle), we now have balance_cpu in a position to move
4333 * load to given_cpu. In rare situations, this may cause
4334 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
4335 * _independently_ and at _same_ time to move some load to
4336 * given_cpu) causing exceess load to be moved to given_cpu.
4337 * This however should not happen so much in practice and
4338 * moreover subsequent load balance cycles should correct the
4339 * excess load moved.
4340 */
4341 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4342 lb_iterations++ < max_lb_iterations) {
4343
4344 this_rq = cpu_rq(env.new_dst_cpu);
4345 env.dst_rq = this_rq;
4346 env.dst_cpu = env.new_dst_cpu;
4347 env.flags &= ~LBF_SOME_PINNED;
4348 env.loop = 0;
4349 env.loop_break = sched_nr_migrate_break;
4350 /*
4351 * Go back to "more_balance" rather than "redo" since we
4352 * need to continue with same src_cpu.
4353 */
4354 goto more_balance;
4355 }
4301 4356
4302 /* All tasks on this runqueue were pinned by CPU affinity */ 4357 /* All tasks on this runqueue were pinned by CPU affinity */
4303 if (unlikely(env.flags & LBF_ALL_PINNED)) { 4358 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4304 cpumask_clear_cpu(cpu_of(busiest), cpus); 4359 cpumask_clear_cpu(cpu_of(busiest), cpus);
4305 if (!cpumask_empty(cpus)) 4360 if (!cpumask_empty(cpus)) {
4361 env.loop = 0;
4362 env.loop_break = sched_nr_migrate_break;
4306 goto redo; 4363 goto redo;
4364 }
4307 goto out_balanced; 4365 goto out_balanced;
4308 } 4366 }
4309 } 4367 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
538/* 538/*
539 * Return the group to which this tasks belongs. 539 * Return the group to which this tasks belongs.
540 * 540 *
541 * We use task_subsys_state_check() and extend the RCU verification with 541 * We cannot use task_subsys_state() and friends because the cgroup
542 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 542 * subsystem changes that value before the cgroup_subsys::attach() method
543 * task it moves into the cgroup. Therefore by holding either of those locks, 543 * is called, therefore we cannot pin it and might observe the wrong value.
544 * we pin the task to the current cgroup. 544 *
545 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
546 * core changes this before calling sched_move_task().
547 *
548 * Instead we use a 'copy' which is updated from sched_move_task() while
549 * holding both task_struct::pi_lock and rq::lock.
545 */ 550 */
546static inline struct task_group *task_group(struct task_struct *p) 551static inline struct task_group *task_group(struct task_struct *p)
547{ 552{
548 struct task_group *tg; 553 return p->sched_task_group;
549 struct cgroup_subsys_state *css;
550
551 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
552 lockdep_is_held(&p->pi_lock) ||
553 lockdep_is_held(&task_rq(p)->lock));
554 tg = container_of(css, struct task_group, css);
555
556 return autogroup_task_group(p, tg);
557} 554}
558 555
559/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 556/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
diff --git a/kernel/signal.c b/kernel/signal.c
index 677102789cf2..be4f856d52f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1971void ptrace_notify(int exit_code) 1971void ptrace_notify(int exit_code)
1972{ 1972{
1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); 1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1974 if (unlikely(current->task_works)) {
1975 if (test_and_clear_ti_thread_flag(current_thread_info(),
1976 TIF_NOTIFY_RESUME)) {
1977 smp_mb__after_clear_bit();
1978 task_work_run();
1979 }
1980 }
1974 1981
1975 spin_lock_irq(&current->sighand->siglock); 1982 spin_lock_irq(&current->sighand->siglock);
1976 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); 1983 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2191 struct signal_struct *signal = current->signal; 2198 struct signal_struct *signal = current->signal;
2192 int signr; 2199 int signr;
2193 2200
2201 if (unlikely(current->task_works)) {
2202 if (test_and_clear_ti_thread_flag(current_thread_info(),
2203 TIF_NOTIFY_RESUME)) {
2204 smp_mb__after_clear_bit();
2205 task_work_run();
2206 }
2207 }
2208
2194 if (unlikely(uprobe_deny_signal())) 2209 if (unlikely(uprobe_deny_signal()))
2195 return 0; 2210 return 0;
2196 2211
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
214
215 /*
216 * Mask out PF_MEMALLOC s current task context is borrowed for the
217 * softirq. A softirq handled such as network RX might set PF_MEMALLOC
218 * again if the socket is related to swap
219 */
220 current->flags &= ~PF_MEMALLOC;
213 221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268} 277}
269 278
270#ifndef __ARCH_HAS_DO_SOFTIRQ 279#ifndef __ARCH_HAS_DO_SOFTIRQ
diff --git a/kernel/sys.c b/kernel/sys.c
index 2d39a84cd857..241507f23eca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2015 break; 2015 break;
2016 } 2016 }
2017 me->pdeath_signal = arg2; 2017 me->pdeath_signal = arg2;
2018 error = 0;
2019 break; 2018 break;
2020 case PR_GET_PDEATHSIG: 2019 case PR_GET_PDEATHSIG:
2021 error = put_user(me->pdeath_signal, (int __user *)arg2); 2020 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2029 break; 2028 break;
2030 } 2029 }
2031 set_dumpable(me->mm, arg2); 2030 set_dumpable(me->mm, arg2);
2032 error = 0;
2033 break; 2031 break;
2034 2032
2035 case PR_SET_UNALIGN: 2033 case PR_SET_UNALIGN:
@@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2056 case PR_SET_TIMING: 2054 case PR_SET_TIMING:
2057 if (arg2 != PR_TIMING_STATISTICAL) 2055 if (arg2 != PR_TIMING_STATISTICAL)
2058 error = -EINVAL; 2056 error = -EINVAL;
2059 else
2060 error = 0;
2061 break; 2057 break;
2062
2063 case PR_SET_NAME: 2058 case PR_SET_NAME:
2064 comm[sizeof(me->comm)-1] = 0; 2059 comm[sizeof(me->comm)-1] = 0;
2065 if (strncpy_from_user(comm, (char __user *)arg2, 2060 if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 return -EFAULT; 2062 return -EFAULT;
2068 set_task_comm(me, comm); 2063 set_task_comm(me, comm);
2069 proc_comm_connector(me); 2064 proc_comm_connector(me);
2070 return 0; 2065 break;
2071 case PR_GET_NAME: 2066 case PR_GET_NAME:
2072 get_task_comm(comm, me); 2067 get_task_comm(comm, me);
2073 if (copy_to_user((char __user *)arg2, comm, 2068 if (copy_to_user((char __user *)arg2, comm,
2074 sizeof(comm))) 2069 sizeof(comm)))
2075 return -EFAULT; 2070 return -EFAULT;
2076 return 0; 2071 break;
2077 case PR_GET_ENDIAN: 2072 case PR_GET_ENDIAN:
2078 error = GET_ENDIAN(me, arg2); 2073 error = GET_ENDIAN(me, arg2);
2079 break; 2074 break;
2080 case PR_SET_ENDIAN: 2075 case PR_SET_ENDIAN:
2081 error = SET_ENDIAN(me, arg2); 2076 error = SET_ENDIAN(me, arg2);
2082 break; 2077 break;
2083
2084 case PR_GET_SECCOMP: 2078 case PR_GET_SECCOMP:
2085 error = prctl_get_seccomp(); 2079 error = prctl_get_seccomp();
2086 break; 2080 break;
@@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2108 current->default_timer_slack_ns; 2102 current->default_timer_slack_ns;
2109 else 2103 else
2110 current->timer_slack_ns = arg2; 2104 current->timer_slack_ns = arg2;
2111 error = 0;
2112 break; 2105 break;
2113 case PR_MCE_KILL: 2106 case PR_MCE_KILL:
2114 if (arg4 | arg5) 2107 if (arg4 | arg5)
@@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2134 default: 2127 default:
2135 return -EINVAL; 2128 return -EINVAL;
2136 } 2129 }
2137 error = 0;
2138 break; 2130 break;
2139 case PR_MCE_KILL_GET: 2131 case PR_MCE_KILL_GET:
2140 if (arg2 | arg3 | arg4 | arg5) 2132 if (arg2 | arg3 | arg4 | arg5)
@@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2153 break; 2145 break;
2154 case PR_SET_CHILD_SUBREAPER: 2146 case PR_SET_CHILD_SUBREAPER:
2155 me->signal->is_child_subreaper = !!arg2; 2147 me->signal->is_child_subreaper = !!arg2;
2156 error = 0;
2157 break; 2148 break;
2158 case PR_GET_CHILD_SUBREAPER: 2149 case PR_GET_CHILD_SUBREAPER:
2159 error = put_user(me->signal->is_child_subreaper, 2150 error = put_user(me->signal->is_child_subreaper,
@@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
2195 argv_free(info->argv); 2186 argv_free(info->argv);
2196} 2187}
2197 2188
2198/** 2189static int __orderly_poweroff(void)
2199 * orderly_poweroff - Trigger an orderly system poweroff
2200 * @force: force poweroff if command execution fails
2201 *
2202 * This may be called from any context to trigger a system shutdown.
2203 * If the orderly shutdown fails, it will force an immediate shutdown.
2204 */
2205int orderly_poweroff(bool force)
2206{ 2190{
2207 int argc; 2191 int argc;
2208 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2192 char **argv;
2209 static char *envp[] = { 2193 static char *envp[] = {
2210 "HOME=/", 2194 "HOME=/",
2211 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 2195 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2212 NULL 2196 NULL
2213 }; 2197 };
2214 int ret = -ENOMEM; 2198 int ret;
2215 2199
2200 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2216 if (argv == NULL) { 2201 if (argv == NULL) {
2217 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2202 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2218 __func__, poweroff_cmd); 2203 __func__, poweroff_cmd);
2219 goto out; 2204 return -ENOMEM;
2220 } 2205 }
2221 2206
2222 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2223 NULL, argv_cleanup, NULL); 2208 NULL, argv_cleanup, NULL);
2224out:
2225 if (likely(!ret))
2226 return 0;
2227
2228 if (ret == -ENOMEM) 2209 if (ret == -ENOMEM)
2229 argv_free(argv); 2210 argv_free(argv);
2230 2211
2231 if (force) { 2212 return ret;
2213}
2214
2215/**
2216 * orderly_poweroff - Trigger an orderly system poweroff
2217 * @force: force poweroff if command execution fails
2218 *
2219 * This may be called from any context to trigger a system shutdown.
2220 * If the orderly shutdown fails, it will force an immediate shutdown.
2221 */
2222int orderly_poweroff(bool force)
2223{
2224 int ret = __orderly_poweroff();
2225
2226 if (ret && force) {
2232 printk(KERN_WARNING "Failed to start orderly shutdown: " 2227 printk(KERN_WARNING "Failed to start orderly shutdown: "
2233 "forcing the issue\n"); 2228 "forcing the issue\n");
2234 2229
2235 /* I guess this should try to kick off some daemon to 2230 /*
2236 sync and poweroff asap. Or not even bother syncing 2231 * I guess this should try to kick off some daemon to sync and
2237 if we're doing an emergency shutdown? */ 2232 * poweroff asap. Or not even bother syncing if we're doing an
2233 * emergency shutdown?
2234 */
2238 emergency_sync(); 2235 emergency_sync();
2239 kernel_power_off(); 2236 kernel_power_off();
2240 } 2237 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..87174ef59161 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 32#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/kernel.h> 36#include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, loff_t *ppos); 175 void __user *buffer, size_t *lenp, loff_t *ppos);
175#endif 176#endif
176 177
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos);
180static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182
177#ifdef CONFIG_MAGIC_SYSRQ 183#ifdef CONFIG_MAGIC_SYSRQ
178/* Note: sysrq code uses it's own private copy */ 184/* Note: sysrq code uses it's own private copy */
179static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 185static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
410 .data = core_pattern, 416 .data = core_pattern,
411 .maxlen = CORENAME_MAX_SIZE, 417 .maxlen = CORENAME_MAX_SIZE,
412 .mode = 0644, 418 .mode = 0644,
413 .proc_handler = proc_dostring, 419 .proc_handler = proc_dostring_coredump,
414 }, 420 },
415 { 421 {
416 .procname = "core_pipe_limit", 422 .procname = "core_pipe_limit",
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = {
1095 .extra1 = &zero, 1101 .extra1 = &zero,
1096 }, 1102 },
1097 { 1103 {
1098 .procname = "nr_pdflush_threads", 1104 .procname = "nr_pdflush_threads",
1099 .data = &nr_pdflush_threads, 1105 .mode = 0444 /* read-only */,
1100 .maxlen = sizeof nr_pdflush_threads, 1106 .proc_handler = pdflush_proc_obsolete,
1101 .mode = 0444 /* read-only*/,
1102 .proc_handler = proc_dointvec,
1103 }, 1107 },
1104 { 1108 {
1105 .procname = "swappiness", 1109 .procname = "swappiness",
@@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = {
1494#endif 1498#endif
1495#endif 1499#endif
1496 { 1500 {
1501 .procname = "protected_symlinks",
1502 .data = &sysctl_protected_symlinks,
1503 .maxlen = sizeof(int),
1504 .mode = 0600,
1505 .proc_handler = proc_dointvec_minmax,
1506 .extra1 = &zero,
1507 .extra2 = &one,
1508 },
1509 {
1510 .procname = "protected_hardlinks",
1511 .data = &sysctl_protected_hardlinks,
1512 .maxlen = sizeof(int),
1513 .mode = 0600,
1514 .proc_handler = proc_dointvec_minmax,
1515 .extra1 = &zero,
1516 .extra2 = &one,
1517 },
1518 {
1497 .procname = "suid_dumpable", 1519 .procname = "suid_dumpable",
1498 .data = &suid_dumpable, 1520 .data = &suid_dumpable,
1499 .maxlen = sizeof(int), 1521 .maxlen = sizeof(int),
1500 .mode = 0644, 1522 .mode = 0644,
1501 .proc_handler = proc_dointvec_minmax, 1523 .proc_handler = proc_dointvec_minmax_coredump,
1502 .extra1 = &zero, 1524 .extra1 = &zero,
1503 .extra2 = &two, 1525 .extra2 = &two,
1504 }, 1526 },
@@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = {
1551 1573
1552int __init sysctl_init(void) 1574int __init sysctl_init(void)
1553{ 1575{
1554 register_sysctl_table(sysctl_base_table); 1576 struct ctl_table_header *hdr;
1577
1578 hdr = register_sysctl_table(sysctl_base_table);
1579 kmemleak_not_leak(hdr);
1555 return 0; 1580 return 0;
1556} 1581}
1557 1582
@@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2009 do_proc_dointvec_minmax_conv, &param); 2034 do_proc_dointvec_minmax_conv, &param);
2010} 2035}
2011 2036
2037static void validate_coredump_safety(void)
2038{
2039 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2040 core_pattern[0] != '/' && core_pattern[0] != '|') {
2041 printk(KERN_WARNING "Unsafe core_pattern used with "\
2042 "suid_dumpable=2. Pipe handler or fully qualified "\
2043 "core dump path required.\n");
2044 }
2045}
2046
2047static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2048 void __user *buffer, size_t *lenp, loff_t *ppos)
2049{
2050 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2051 if (!error)
2052 validate_coredump_safety();
2053 return error;
2054}
2055
2056static int proc_dostring_coredump(struct ctl_table *table, int write,
2057 void __user *buffer, size_t *lenp, loff_t *ppos)
2058{
2059 int error = proc_dostring(table, write, buffer, lenp, ppos);
2060 if (!error)
2061 validate_coredump_safety();
2062 return error;
2063}
2064
2012static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2065static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2013 void __user *buffer, 2066 void __user *buffer,
2014 size_t *lenp, loff_t *ppos, 2067 size_t *lenp, loff_t *ppos,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, 150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 82d1c794066d..91d4e1742a0c 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -3,82 +3,78 @@
3#include <linux/tracehook.h> 3#include <linux/tracehook.h>
4 4
5int 5int
6task_work_add(struct task_struct *task, struct task_work *twork, bool notify) 6task_work_add(struct task_struct *task, struct callback_head *twork, bool notify)
7{ 7{
8 struct callback_head *last, *first;
8 unsigned long flags; 9 unsigned long flags;
9 int err = -ESRCH;
10 10
11#ifndef TIF_NOTIFY_RESUME
12 if (notify)
13 return -ENOTSUPP;
14#endif
15 /* 11 /*
16 * We must not insert the new work if the task has already passed 12 * Not inserting the new work if the task has already passed
17 * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() 13 * exit_task_work() is the responisbility of callers.
18 * and check PF_EXITING under pi_lock.
19 */ 14 */
20 raw_spin_lock_irqsave(&task->pi_lock, flags); 15 raw_spin_lock_irqsave(&task->pi_lock, flags);
21 if (likely(!(task->flags & PF_EXITING))) { 16 last = task->task_works;
22 hlist_add_head(&twork->hlist, &task->task_works); 17 first = last ? last->next : twork;
23 err = 0; 18 twork->next = first;
24 } 19 if (last)
20 last->next = twork;
21 task->task_works = twork;
25 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 22 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
26 23
27 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ 24 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
28 if (likely(!err) && notify) 25 if (notify)
29 set_notify_resume(task); 26 set_notify_resume(task);
30 return err; 27 return 0;
31} 28}
32 29
33struct task_work * 30struct callback_head *
34task_work_cancel(struct task_struct *task, task_work_func_t func) 31task_work_cancel(struct task_struct *task, task_work_func_t func)
35{ 32{
36 unsigned long flags; 33 unsigned long flags;
37 struct task_work *twork; 34 struct callback_head *last, *res = NULL;
38 struct hlist_node *pos;
39 35
40 raw_spin_lock_irqsave(&task->pi_lock, flags); 36 raw_spin_lock_irqsave(&task->pi_lock, flags);
41 hlist_for_each_entry(twork, pos, &task->task_works, hlist) { 37 last = task->task_works;
42 if (twork->func == func) { 38 if (last) {
43 hlist_del(&twork->hlist); 39 struct callback_head *q = last, *p = q->next;
44 goto found; 40 while (1) {
41 if (p->func == func) {
42 q->next = p->next;
43 if (p == last)
44 task->task_works = q == p ? NULL : q;
45 res = p;
46 break;
47 }
48 if (p == last)
49 break;
50 q = p;
51 p = q->next;
45 } 52 }
46 } 53 }
47 twork = NULL;
48 found:
49 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 54 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
50 55 return res;
51 return twork;
52} 56}
53 57
54void task_work_run(void) 58void task_work_run(void)
55{ 59{
56 struct task_struct *task = current; 60 struct task_struct *task = current;
57 struct hlist_head task_works; 61 struct callback_head *p, *q;
58 struct hlist_node *pos;
59 62
60 raw_spin_lock_irq(&task->pi_lock); 63 while (1) {
61 hlist_move_list(&task->task_works, &task_works); 64 raw_spin_lock_irq(&task->pi_lock);
62 raw_spin_unlock_irq(&task->pi_lock); 65 p = task->task_works;
66 task->task_works = NULL;
67 raw_spin_unlock_irq(&task->pi_lock);
63 68
64 if (unlikely(hlist_empty(&task_works))) 69 if (unlikely(!p))
65 return; 70 return;
66 /*
67 * We use hlist to save the space in task_struct, but we want fifo.
68 * Find the last entry, the list should be short, then process them
69 * in reverse order.
70 */
71 for (pos = task_works.first; pos->next; pos = pos->next)
72 ;
73 71
74 for (;;) { 72 q = p->next; /* head */
75 struct hlist_node **pprev = pos->pprev; 73 p->next = NULL; /* cut it */
76 struct task_work *twork = container_of(pos, struct task_work, 74 while (q) {
77 hlist); 75 p = q->next;
78 twork->func(twork); 76 q->func(q);
79 77 q = p;
80 if (pprev == &task_works.first) 78 }
81 break;
82 pos = container_of(pprev, struct hlist_node, next);
83 } 79 }
84} 80}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4f..d0a32796550f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
436 436
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
439 if (na == NULL) {
440 rc = -EMSGSIZE;
441 goto err;
442 }
443
439 stats = nla_data(na); 444 stats = nla_data(na);
440 memset(stats, 0, sizeof(*stats)); 445 memset(stats, 0, sizeof(*stats));
441 446
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a470154e0408..46da0537c10b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ))
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b7fbadc5c973..24174b4d669b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock);
28/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
29unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
30 30
31/* ACTHZ period (nsecs): */ 31/* SHIFTED_HZ period (nsecs): */
32unsigned long tick_nsec; 32unsigned long tick_nsec;
33 33
34static u64 tick_length; 34static u64 tick_length;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f045cc50832d..2988bc819187 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -65,14 +65,14 @@ struct timekeeper {
65 * used instead. 65 * used instead.
66 */ 66 */
67 struct timespec wall_to_monotonic; 67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72 /* Offset clock monotonic -> clock realtime */ 68 /* Offset clock monotonic -> clock realtime */
73 ktime_t offs_real; 69 ktime_t offs_real;
70 /* time spent in suspend */
71 struct timespec total_sleep_time;
74 /* Offset clock monotonic -> clock boottime */ 72 /* Offset clock monotonic -> clock boottime */
75 ktime_t offs_boot; 73 ktime_t offs_boot;
74 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
75 struct timespec raw_time;
76 /* Seqlock for all timekeeper values */ 76 /* Seqlock for all timekeeper values */
77 seqlock_t lock; 77 seqlock_t lock;
78}; 78};
@@ -108,13 +108,38 @@ static struct timespec tk_xtime(struct timekeeper *tk)
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{ 109{
110 tk->xtime_sec = ts->tv_sec; 110 tk->xtime_sec = ts->tv_sec;
111 tk->xtime_nsec = ts->tv_nsec << tk->shift; 111 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
112} 112}
113 113
114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
115{ 115{
116 tk->xtime_sec += ts->tv_sec; 116 tk->xtime_sec += ts->tv_sec;
117 tk->xtime_nsec += ts->tv_nsec << tk->shift; 117 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
118}
119
120static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
121{
122 struct timespec tmp;
123
124 /*
125 * Verify consistency of: offset_real = -wall_to_monotonic
126 * before modifying anything
127 */
128 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
129 -tk->wall_to_monotonic.tv_nsec);
130 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
131 tk->wall_to_monotonic = wtm;
132 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
133 tk->offs_real = timespec_to_ktime(tmp);
134}
135
136static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
137{
138 /* Verify consistency before modifying */
139 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
140
141 tk->total_sleep_time = t;
142 tk->offs_boot = timespec_to_ktime(t);
118} 143}
119 144
120/** 145/**
@@ -217,14 +242,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
217 return nsec + arch_gettimeoffset(); 242 return nsec + arch_gettimeoffset();
218} 243}
219 244
220static void update_rt_offset(struct timekeeper *tk)
221{
222 struct timespec tmp, *wtm = &tk->wall_to_monotonic;
223
224 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
225 tk->offs_real = timespec_to_ktime(tmp);
226}
227
228/* must hold write on timekeeper.lock */ 245/* must hold write on timekeeper.lock */
229static void timekeeping_update(struct timekeeper *tk, bool clearntp) 246static void timekeeping_update(struct timekeeper *tk, bool clearntp)
230{ 247{
@@ -234,12 +251,10 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
234 tk->ntp_error = 0; 251 tk->ntp_error = 0;
235 ntp_clear(); 252 ntp_clear();
236 } 253 }
237 update_rt_offset(tk);
238 xt = tk_xtime(tk); 254 xt = tk_xtime(tk);
239 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); 255 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
240} 256}
241 257
242
243/** 258/**
244 * timekeeping_forward_now - update clock to the current time 259 * timekeeping_forward_now - update clock to the current time
245 * 260 *
@@ -277,18 +292,19 @@ static void timekeeping_forward_now(struct timekeeper *tk)
277 */ 292 */
278void getnstimeofday(struct timespec *ts) 293void getnstimeofday(struct timespec *ts)
279{ 294{
295 struct timekeeper *tk = &timekeeper;
280 unsigned long seq; 296 unsigned long seq;
281 s64 nsecs = 0; 297 s64 nsecs = 0;
282 298
283 WARN_ON(timekeeping_suspended); 299 WARN_ON(timekeeping_suspended);
284 300
285 do { 301 do {
286 seq = read_seqbegin(&timekeeper.lock); 302 seq = read_seqbegin(&tk->lock);
287 303
288 ts->tv_sec = timekeeper.xtime_sec; 304 ts->tv_sec = tk->xtime_sec;
289 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 305 ts->tv_nsec = timekeeping_get_ns(tk);
290 306
291 } while (read_seqretry(&timekeeper.lock, seq)); 307 } while (read_seqretry(&tk->lock, seq));
292 308
293 timespec_add_ns(ts, nsecs); 309 timespec_add_ns(ts, nsecs);
294} 310}
@@ -296,19 +312,18 @@ EXPORT_SYMBOL(getnstimeofday);
296 312
297ktime_t ktime_get(void) 313ktime_t ktime_get(void)
298{ 314{
315 struct timekeeper *tk = &timekeeper;
299 unsigned int seq; 316 unsigned int seq;
300 s64 secs, nsecs; 317 s64 secs, nsecs;
301 318
302 WARN_ON(timekeeping_suspended); 319 WARN_ON(timekeeping_suspended);
303 320
304 do { 321 do {
305 seq = read_seqbegin(&timekeeper.lock); 322 seq = read_seqbegin(&tk->lock);
306 secs = timekeeper.xtime_sec + 323 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
307 timekeeper.wall_to_monotonic.tv_sec; 324 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
308 nsecs = timekeeping_get_ns(&timekeeper) +
309 timekeeper.wall_to_monotonic.tv_nsec;
310 325
311 } while (read_seqretry(&timekeeper.lock, seq)); 326 } while (read_seqretry(&tk->lock, seq));
312 /* 327 /*
313 * Use ktime_set/ktime_add_ns to create a proper ktime on 328 * Use ktime_set/ktime_add_ns to create a proper ktime on
314 * 32-bit architectures without CONFIG_KTIME_SCALAR. 329 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -327,18 +342,19 @@ EXPORT_SYMBOL_GPL(ktime_get);
327 */ 342 */
328void ktime_get_ts(struct timespec *ts) 343void ktime_get_ts(struct timespec *ts)
329{ 344{
345 struct timekeeper *tk = &timekeeper;
330 struct timespec tomono; 346 struct timespec tomono;
331 unsigned int seq; 347 unsigned int seq;
332 348
333 WARN_ON(timekeeping_suspended); 349 WARN_ON(timekeeping_suspended);
334 350
335 do { 351 do {
336 seq = read_seqbegin(&timekeeper.lock); 352 seq = read_seqbegin(&tk->lock);
337 ts->tv_sec = timekeeper.xtime_sec; 353 ts->tv_sec = tk->xtime_sec;
338 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 354 ts->tv_nsec = timekeeping_get_ns(tk);
339 tomono = timekeeper.wall_to_monotonic; 355 tomono = tk->wall_to_monotonic;
340 356
341 } while (read_seqretry(&timekeeper.lock, seq)); 357 } while (read_seqretry(&tk->lock, seq));
342 358
343 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 359 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
344 ts->tv_nsec + tomono.tv_nsec); 360 ts->tv_nsec + tomono.tv_nsec);
@@ -358,22 +374,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
358 */ 374 */
359void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 375void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
360{ 376{
377 struct timekeeper *tk = &timekeeper;
361 unsigned long seq; 378 unsigned long seq;
362 s64 nsecs_raw, nsecs_real; 379 s64 nsecs_raw, nsecs_real;
363 380
364 WARN_ON_ONCE(timekeeping_suspended); 381 WARN_ON_ONCE(timekeeping_suspended);
365 382
366 do { 383 do {
367 seq = read_seqbegin(&timekeeper.lock); 384 seq = read_seqbegin(&tk->lock);
368 385
369 *ts_raw = timekeeper.raw_time; 386 *ts_raw = tk->raw_time;
370 ts_real->tv_sec = timekeeper.xtime_sec; 387 ts_real->tv_sec = tk->xtime_sec;
371 ts_real->tv_nsec = 0; 388 ts_real->tv_nsec = 0;
372 389
373 nsecs_raw = timekeeping_get_ns_raw(&timekeeper); 390 nsecs_raw = timekeeping_get_ns_raw(tk);
374 nsecs_real = timekeeping_get_ns(&timekeeper); 391 nsecs_real = timekeeping_get_ns(tk);
375 392
376 } while (read_seqretry(&timekeeper.lock, seq)); 393 } while (read_seqretry(&tk->lock, seq));
377 394
378 timespec_add_ns(ts_raw, nsecs_raw); 395 timespec_add_ns(ts_raw, nsecs_raw);
379 timespec_add_ns(ts_real, nsecs_real); 396 timespec_add_ns(ts_real, nsecs_real);
@@ -406,28 +423,28 @@ EXPORT_SYMBOL(do_gettimeofday);
406 */ 423 */
407int do_settimeofday(const struct timespec *tv) 424int do_settimeofday(const struct timespec *tv)
408{ 425{
426 struct timekeeper *tk = &timekeeper;
409 struct timespec ts_delta, xt; 427 struct timespec ts_delta, xt;
410 unsigned long flags; 428 unsigned long flags;
411 429
412 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 430 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
413 return -EINVAL; 431 return -EINVAL;
414 432
415 write_seqlock_irqsave(&timekeeper.lock, flags); 433 write_seqlock_irqsave(&tk->lock, flags);
416 434
417 timekeeping_forward_now(&timekeeper); 435 timekeeping_forward_now(tk);
418 436
419 xt = tk_xtime(&timekeeper); 437 xt = tk_xtime(tk);
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 438 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 439 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
422 440
423 timekeeper.wall_to_monotonic = 441 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
424 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
425 442
426 tk_set_xtime(&timekeeper, tv); 443 tk_set_xtime(tk, tv);
427 444
428 timekeeping_update(&timekeeper, true); 445 timekeeping_update(tk, true);
429 446
430 write_sequnlock_irqrestore(&timekeeper.lock, flags); 447 write_sequnlock_irqrestore(&tk->lock, flags);
431 448
432 /* signal hrtimers about time change */ 449 /* signal hrtimers about time change */
433 clock_was_set(); 450 clock_was_set();
@@ -436,7 +453,6 @@ int do_settimeofday(const struct timespec *tv)
436} 453}
437EXPORT_SYMBOL(do_settimeofday); 454EXPORT_SYMBOL(do_settimeofday);
438 455
439
440/** 456/**
441 * timekeeping_inject_offset - Adds or subtracts from the current time. 457 * timekeeping_inject_offset - Adds or subtracts from the current time.
442 * @tv: pointer to the timespec variable containing the offset 458 * @tv: pointer to the timespec variable containing the offset
@@ -445,23 +461,23 @@ EXPORT_SYMBOL(do_settimeofday);
445 */ 461 */
446int timekeeping_inject_offset(struct timespec *ts) 462int timekeeping_inject_offset(struct timespec *ts)
447{ 463{
464 struct timekeeper *tk = &timekeeper;
448 unsigned long flags; 465 unsigned long flags;
449 466
450 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 467 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
451 return -EINVAL; 468 return -EINVAL;
452 469
453 write_seqlock_irqsave(&timekeeper.lock, flags); 470 write_seqlock_irqsave(&tk->lock, flags);
454 471
455 timekeeping_forward_now(&timekeeper); 472 timekeeping_forward_now(tk);
456 473
457 474
458 tk_xtime_add(&timekeeper, ts); 475 tk_xtime_add(tk, ts);
459 timekeeper.wall_to_monotonic = 476 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
460 timespec_sub(timekeeper.wall_to_monotonic, *ts);
461 477
462 timekeeping_update(&timekeeper, true); 478 timekeeping_update(tk, true);
463 479
464 write_sequnlock_irqrestore(&timekeeper.lock, flags); 480 write_sequnlock_irqrestore(&tk->lock, flags);
465 481
466 /* signal hrtimers about time change */ 482 /* signal hrtimers about time change */
467 clock_was_set(); 483 clock_was_set();
@@ -477,23 +493,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
477 */ 493 */
478static int change_clocksource(void *data) 494static int change_clocksource(void *data)
479{ 495{
496 struct timekeeper *tk = &timekeeper;
480 struct clocksource *new, *old; 497 struct clocksource *new, *old;
481 unsigned long flags; 498 unsigned long flags;
482 499
483 new = (struct clocksource *) data; 500 new = (struct clocksource *) data;
484 501
485 write_seqlock_irqsave(&timekeeper.lock, flags); 502 write_seqlock_irqsave(&tk->lock, flags);
486 503
487 timekeeping_forward_now(&timekeeper); 504 timekeeping_forward_now(tk);
488 if (!new->enable || new->enable(new) == 0) { 505 if (!new->enable || new->enable(new) == 0) {
489 old = timekeeper.clock; 506 old = tk->clock;
490 tk_setup_internals(&timekeeper, new); 507 tk_setup_internals(tk, new);
491 if (old->disable) 508 if (old->disable)
492 old->disable(old); 509 old->disable(old);
493 } 510 }
494 timekeeping_update(&timekeeper, true); 511 timekeeping_update(tk, true);
495 512
496 write_sequnlock_irqrestore(&timekeeper.lock, flags); 513 write_sequnlock_irqrestore(&tk->lock, flags);
497 514
498 return 0; 515 return 0;
499} 516}
@@ -507,7 +524,9 @@ static int change_clocksource(void *data)
507 */ 524 */
508void timekeeping_notify(struct clocksource *clock) 525void timekeeping_notify(struct clocksource *clock)
509{ 526{
510 if (timekeeper.clock == clock) 527 struct timekeeper *tk = &timekeeper;
528
529 if (tk->clock == clock)
511 return; 530 return;
512 stop_machine(change_clocksource, clock, NULL); 531 stop_machine(change_clocksource, clock, NULL);
513 tick_clock_notify(); 532 tick_clock_notify();
@@ -536,35 +555,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
536 */ 555 */
537void getrawmonotonic(struct timespec *ts) 556void getrawmonotonic(struct timespec *ts)
538{ 557{
558 struct timekeeper *tk = &timekeeper;
539 unsigned long seq; 559 unsigned long seq;
540 s64 nsecs; 560 s64 nsecs;
541 561
542 do { 562 do {
543 seq = read_seqbegin(&timekeeper.lock); 563 seq = read_seqbegin(&tk->lock);
544 nsecs = timekeeping_get_ns_raw(&timekeeper); 564 nsecs = timekeeping_get_ns_raw(tk);
545 *ts = timekeeper.raw_time; 565 *ts = tk->raw_time;
546 566
547 } while (read_seqretry(&timekeeper.lock, seq)); 567 } while (read_seqretry(&tk->lock, seq));
548 568
549 timespec_add_ns(ts, nsecs); 569 timespec_add_ns(ts, nsecs);
550} 570}
551EXPORT_SYMBOL(getrawmonotonic); 571EXPORT_SYMBOL(getrawmonotonic);
552 572
553
554/** 573/**
555 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 574 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
556 */ 575 */
557int timekeeping_valid_for_hres(void) 576int timekeeping_valid_for_hres(void)
558{ 577{
578 struct timekeeper *tk = &timekeeper;
559 unsigned long seq; 579 unsigned long seq;
560 int ret; 580 int ret;
561 581
562 do { 582 do {
563 seq = read_seqbegin(&timekeeper.lock); 583 seq = read_seqbegin(&tk->lock);
564 584
565 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 585 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
566 586
567 } while (read_seqretry(&timekeeper.lock, seq)); 587 } while (read_seqretry(&tk->lock, seq));
568 588
569 return ret; 589 return ret;
570} 590}
@@ -574,15 +594,16 @@ int timekeeping_valid_for_hres(void)
574 */ 594 */
575u64 timekeeping_max_deferment(void) 595u64 timekeeping_max_deferment(void)
576{ 596{
597 struct timekeeper *tk = &timekeeper;
577 unsigned long seq; 598 unsigned long seq;
578 u64 ret; 599 u64 ret;
579 600
580 do { 601 do {
581 seq = read_seqbegin(&timekeeper.lock); 602 seq = read_seqbegin(&tk->lock);
582 603
583 ret = timekeeper.clock->max_idle_ns; 604 ret = tk->clock->max_idle_ns;
584 605
585 } while (read_seqretry(&timekeeper.lock, seq)); 606 } while (read_seqretry(&tk->lock, seq));
586 607
587 return ret; 608 return ret;
588} 609}
@@ -622,46 +643,43 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
622 */ 643 */
623void __init timekeeping_init(void) 644void __init timekeeping_init(void)
624{ 645{
646 struct timekeeper *tk = &timekeeper;
625 struct clocksource *clock; 647 struct clocksource *clock;
626 unsigned long flags; 648 unsigned long flags;
627 struct timespec now, boot; 649 struct timespec now, boot, tmp;
628 650
629 read_persistent_clock(&now); 651 read_persistent_clock(&now);
630 read_boot_clock(&boot); 652 read_boot_clock(&boot);
631 653
632 seqlock_init(&timekeeper.lock); 654 seqlock_init(&tk->lock);
633 655
634 ntp_init(); 656 ntp_init();
635 657
636 write_seqlock_irqsave(&timekeeper.lock, flags); 658 write_seqlock_irqsave(&tk->lock, flags);
637 clock = clocksource_default_clock(); 659 clock = clocksource_default_clock();
638 if (clock->enable) 660 if (clock->enable)
639 clock->enable(clock); 661 clock->enable(clock);
640 tk_setup_internals(&timekeeper, clock); 662 tk_setup_internals(tk, clock);
641 663
642 tk_set_xtime(&timekeeper, &now); 664 tk_set_xtime(tk, &now);
643 timekeeper.raw_time.tv_sec = 0; 665 tk->raw_time.tv_sec = 0;
644 timekeeper.raw_time.tv_nsec = 0; 666 tk->raw_time.tv_nsec = 0;
645 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 667 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
646 boot = tk_xtime(&timekeeper); 668 boot = tk_xtime(tk);
647 669
648 set_normalized_timespec(&timekeeper.wall_to_monotonic, 670 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
649 -boot.tv_sec, -boot.tv_nsec); 671 tk_set_wall_to_mono(tk, tmp);
650 update_rt_offset(&timekeeper); 672
651 timekeeper.total_sleep_time.tv_sec = 0; 673 tmp.tv_sec = 0;
652 timekeeper.total_sleep_time.tv_nsec = 0; 674 tmp.tv_nsec = 0;
653 write_sequnlock_irqrestore(&timekeeper.lock, flags); 675 tk_set_sleep_time(tk, tmp);
676
677 write_sequnlock_irqrestore(&tk->lock, flags);
654} 678}
655 679
656/* time in seconds when suspend began */ 680/* time in seconds when suspend began */
657static struct timespec timekeeping_suspend_time; 681static struct timespec timekeeping_suspend_time;
658 682
659static void update_sleep_time(struct timespec t)
660{
661 timekeeper.total_sleep_time = t;
662 timekeeper.offs_boot = timespec_to_ktime(t);
663}
664
665/** 683/**
666 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 684 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
667 * @delta: pointer to a timespec delta value 685 * @delta: pointer to a timespec delta value
@@ -677,13 +695,11 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
677 "sleep delta value!\n"); 695 "sleep delta value!\n");
678 return; 696 return;
679 } 697 }
680
681 tk_xtime_add(tk, delta); 698 tk_xtime_add(tk, delta);
682 tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); 699 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
683 update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); 700 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
684} 701}
685 702
686
687/** 703/**
688 * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values 704 * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
689 * @delta: pointer to a timespec delta value 705 * @delta: pointer to a timespec delta value
@@ -696,6 +712,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
696 */ 712 */
697void timekeeping_inject_sleeptime(struct timespec *delta) 713void timekeeping_inject_sleeptime(struct timespec *delta)
698{ 714{
715 struct timekeeper *tk = &timekeeper;
699 unsigned long flags; 716 unsigned long flags;
700 struct timespec ts; 717 struct timespec ts;
701 718
@@ -704,21 +721,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
704 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 721 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
705 return; 722 return;
706 723
707 write_seqlock_irqsave(&timekeeper.lock, flags); 724 write_seqlock_irqsave(&tk->lock, flags);
708 725
709 timekeeping_forward_now(&timekeeper); 726 timekeeping_forward_now(tk);
710 727
711 __timekeeping_inject_sleeptime(&timekeeper, delta); 728 __timekeeping_inject_sleeptime(tk, delta);
712 729
713 timekeeping_update(&timekeeper, true); 730 timekeeping_update(tk, true);
714 731
715 write_sequnlock_irqrestore(&timekeeper.lock, flags); 732 write_sequnlock_irqrestore(&tk->lock, flags);
716 733
717 /* signal hrtimers about time change */ 734 /* signal hrtimers about time change */
718 clock_was_set(); 735 clock_was_set();
719} 736}
720 737
721
722/** 738/**
723 * timekeeping_resume - Resumes the generic timekeeping subsystem. 739 * timekeeping_resume - Resumes the generic timekeeping subsystem.
724 * 740 *
@@ -728,6 +744,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
728 */ 744 */
729static void timekeeping_resume(void) 745static void timekeeping_resume(void)
730{ 746{
747 struct timekeeper *tk = &timekeeper;
731 unsigned long flags; 748 unsigned long flags;
732 struct timespec ts; 749 struct timespec ts;
733 750
@@ -735,18 +752,18 @@ static void timekeeping_resume(void)
735 752
736 clocksource_resume(); 753 clocksource_resume();
737 754
738 write_seqlock_irqsave(&timekeeper.lock, flags); 755 write_seqlock_irqsave(&tk->lock, flags);
739 756
740 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 757 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
741 ts = timespec_sub(ts, timekeeping_suspend_time); 758 ts = timespec_sub(ts, timekeeping_suspend_time);
742 __timekeeping_inject_sleeptime(&timekeeper, &ts); 759 __timekeeping_inject_sleeptime(tk, &ts);
743 } 760 }
744 /* re-base the last cycle value */ 761 /* re-base the last cycle value */
745 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 762 tk->clock->cycle_last = tk->clock->read(tk->clock);
746 timekeeper.ntp_error = 0; 763 tk->ntp_error = 0;
747 timekeeping_suspended = 0; 764 timekeeping_suspended = 0;
748 timekeeping_update(&timekeeper, false); 765 timekeeping_update(tk, false);
749 write_sequnlock_irqrestore(&timekeeper.lock, flags); 766 write_sequnlock_irqrestore(&tk->lock, flags);
750 767
751 touch_softlockup_watchdog(); 768 touch_softlockup_watchdog();
752 769
@@ -758,14 +775,15 @@ static void timekeeping_resume(void)
758 775
759static int timekeeping_suspend(void) 776static int timekeeping_suspend(void)
760{ 777{
778 struct timekeeper *tk = &timekeeper;
761 unsigned long flags; 779 unsigned long flags;
762 struct timespec delta, delta_delta; 780 struct timespec delta, delta_delta;
763 static struct timespec old_delta; 781 static struct timespec old_delta;
764 782
765 read_persistent_clock(&timekeeping_suspend_time); 783 read_persistent_clock(&timekeeping_suspend_time);
766 784
767 write_seqlock_irqsave(&timekeeper.lock, flags); 785 write_seqlock_irqsave(&tk->lock, flags);
768 timekeeping_forward_now(&timekeeper); 786 timekeeping_forward_now(tk);
769 timekeeping_suspended = 1; 787 timekeeping_suspended = 1;
770 788
771 /* 789 /*
@@ -774,7 +792,7 @@ static int timekeeping_suspend(void)
774 * try to compensate so the difference in system time 792 * try to compensate so the difference in system time
775 * and persistent_clock time stays close to constant. 793 * and persistent_clock time stays close to constant.
776 */ 794 */
777 delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); 795 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
778 delta_delta = timespec_sub(delta, old_delta); 796 delta_delta = timespec_sub(delta, old_delta);
779 if (abs(delta_delta.tv_sec) >= 2) { 797 if (abs(delta_delta.tv_sec) >= 2) {
780 /* 798 /*
@@ -787,7 +805,7 @@ static int timekeeping_suspend(void)
787 timekeeping_suspend_time = 805 timekeeping_suspend_time =
788 timespec_add(timekeeping_suspend_time, delta_delta); 806 timespec_add(timekeeping_suspend_time, delta_delta);
789 } 807 }
790 write_sequnlock_irqrestore(&timekeeper.lock, flags); 808 write_sequnlock_irqrestore(&tk->lock, flags);
791 809
792 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 810 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
793 clocksource_suspend(); 811 clocksource_suspend();
@@ -898,7 +916,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
898 * the error. This causes the likely below to be unlikely. 916 * the error. This causes the likely below to be unlikely.
899 * 917 *
900 * The proper fix is to avoid rounding up by using 918 * The proper fix is to avoid rounding up by using
901 * the high precision timekeeper.xtime_nsec instead of 919 * the high precision tk->xtime_nsec instead of
902 * xtime.tv_nsec everywhere. Fixing this will take some 920 * xtime.tv_nsec everywhere. Fixing this will take some
903 * time. 921 * time.
904 */ 922 */
@@ -1003,7 +1021,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1003 1021
1004} 1022}
1005 1023
1006
1007/** 1024/**
1008 * accumulate_nsecs_to_secs - Accumulates nsecs into secs 1025 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1009 * 1026 *
@@ -1024,15 +1041,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1024 1041
1025 /* Figure out if its a leap sec and apply if needed */ 1042 /* Figure out if its a leap sec and apply if needed */
1026 leap = second_overflow(tk->xtime_sec); 1043 leap = second_overflow(tk->xtime_sec);
1027 tk->xtime_sec += leap; 1044 if (unlikely(leap)) {
1028 tk->wall_to_monotonic.tv_sec -= leap; 1045 struct timespec ts;
1029 if (leap) 1046
1030 clock_was_set_delayed(); 1047 tk->xtime_sec += leap;
1031 1048
1049 ts.tv_sec = leap;
1050 ts.tv_nsec = 0;
1051 tk_set_wall_to_mono(tk,
1052 timespec_sub(tk->wall_to_monotonic, ts));
1053
1054 clock_was_set_delayed();
1055 }
1032 } 1056 }
1033} 1057}
1034 1058
1035
1036/** 1059/**
1037 * logarithmic_accumulation - shifted accumulation of cycles 1060 * logarithmic_accumulation - shifted accumulation of cycles
1038 * 1061 *
@@ -1076,7 +1099,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1076 return offset; 1099 return offset;
1077} 1100}
1078 1101
1079
1080/** 1102/**
1081 * update_wall_time - Uses the current clocksource to increment the wall time 1103 * update_wall_time - Uses the current clocksource to increment the wall time
1082 * 1104 *
@@ -1084,21 +1106,22 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1084static void update_wall_time(void) 1106static void update_wall_time(void)
1085{ 1107{
1086 struct clocksource *clock; 1108 struct clocksource *clock;
1109 struct timekeeper *tk = &timekeeper;
1087 cycle_t offset; 1110 cycle_t offset;
1088 int shift = 0, maxshift; 1111 int shift = 0, maxshift;
1089 unsigned long flags; 1112 unsigned long flags;
1090 s64 remainder; 1113 s64 remainder;
1091 1114
1092 write_seqlock_irqsave(&timekeeper.lock, flags); 1115 write_seqlock_irqsave(&tk->lock, flags);
1093 1116
1094 /* Make sure we're fully resumed: */ 1117 /* Make sure we're fully resumed: */
1095 if (unlikely(timekeeping_suspended)) 1118 if (unlikely(timekeeping_suspended))
1096 goto out; 1119 goto out;
1097 1120
1098 clock = timekeeper.clock; 1121 clock = tk->clock;
1099 1122
1100#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1123#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1101 offset = timekeeper.cycle_interval; 1124 offset = tk->cycle_interval;
1102#else 1125#else
1103 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1126 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1104#endif 1127#endif
@@ -1111,19 +1134,19 @@ static void update_wall_time(void)
1111 * chunk in one go, and then try to consume the next smaller 1134 * chunk in one go, and then try to consume the next smaller
1112 * doubled multiple. 1135 * doubled multiple.
1113 */ 1136 */
1114 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1137 shift = ilog2(offset) - ilog2(tk->cycle_interval);
1115 shift = max(0, shift); 1138 shift = max(0, shift);
1116 /* Bound shift to one less than what overflows tick_length */ 1139 /* Bound shift to one less than what overflows tick_length */
1117 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1140 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1118 shift = min(shift, maxshift); 1141 shift = min(shift, maxshift);
1119 while (offset >= timekeeper.cycle_interval) { 1142 while (offset >= tk->cycle_interval) {
1120 offset = logarithmic_accumulation(&timekeeper, offset, shift); 1143 offset = logarithmic_accumulation(tk, offset, shift);
1121 if(offset < timekeeper.cycle_interval<<shift) 1144 if (offset < tk->cycle_interval<<shift)
1122 shift--; 1145 shift--;
1123 } 1146 }
1124 1147
1125 /* correct the clock when NTP error is too big */ 1148 /* correct the clock when NTP error is too big */
1126 timekeeping_adjust(&timekeeper, offset); 1149 timekeeping_adjust(tk, offset);
1127 1150
1128 1151
1129 /* 1152 /*
@@ -1135,21 +1158,21 @@ static void update_wall_time(void)
1135 * the vsyscall implementations are converted to use xtime_nsec 1158 * the vsyscall implementations are converted to use xtime_nsec
1136 * (shifted nanoseconds), this can be killed. 1159 * (shifted nanoseconds), this can be killed.
1137 */ 1160 */
1138 remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); 1161 remainder = tk->xtime_nsec & ((1 << tk->shift) - 1);
1139 timekeeper.xtime_nsec -= remainder; 1162 tk->xtime_nsec -= remainder;
1140 timekeeper.xtime_nsec += 1 << timekeeper.shift; 1163 tk->xtime_nsec += 1 << tk->shift;
1141 timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; 1164 tk->ntp_error += remainder << tk->ntp_error_shift;
1142 1165
1143 /* 1166 /*
1144 * Finally, make sure that after the rounding 1167 * Finally, make sure that after the rounding
1145 * xtime_nsec isn't larger than NSEC_PER_SEC 1168 * xtime_nsec isn't larger than NSEC_PER_SEC
1146 */ 1169 */
1147 accumulate_nsecs_to_secs(&timekeeper); 1170 accumulate_nsecs_to_secs(tk);
1148 1171
1149 timekeeping_update(&timekeeper, false); 1172 timekeeping_update(tk, false);
1150 1173
1151out: 1174out:
1152 write_sequnlock_irqrestore(&timekeeper.lock, flags); 1175 write_sequnlock_irqrestore(&tk->lock, flags);
1153 1176
1154} 1177}
1155 1178
@@ -1166,18 +1189,18 @@ out:
1166 */ 1189 */
1167void getboottime(struct timespec *ts) 1190void getboottime(struct timespec *ts)
1168{ 1191{
1192 struct timekeeper *tk = &timekeeper;
1169 struct timespec boottime = { 1193 struct timespec boottime = {
1170 .tv_sec = timekeeper.wall_to_monotonic.tv_sec + 1194 .tv_sec = tk->wall_to_monotonic.tv_sec +
1171 timekeeper.total_sleep_time.tv_sec, 1195 tk->total_sleep_time.tv_sec,
1172 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + 1196 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1173 timekeeper.total_sleep_time.tv_nsec 1197 tk->total_sleep_time.tv_nsec
1174 }; 1198 };
1175 1199
1176 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1200 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1177} 1201}
1178EXPORT_SYMBOL_GPL(getboottime); 1202EXPORT_SYMBOL_GPL(getboottime);
1179 1203
1180
1181/** 1204/**
1182 * get_monotonic_boottime - Returns monotonic time since boot 1205 * get_monotonic_boottime - Returns monotonic time since boot
1183 * @ts: pointer to the timespec to be set 1206 * @ts: pointer to the timespec to be set
@@ -1189,19 +1212,20 @@ EXPORT_SYMBOL_GPL(getboottime);
1189 */ 1212 */
1190void get_monotonic_boottime(struct timespec *ts) 1213void get_monotonic_boottime(struct timespec *ts)
1191{ 1214{
1215 struct timekeeper *tk = &timekeeper;
1192 struct timespec tomono, sleep; 1216 struct timespec tomono, sleep;
1193 unsigned int seq; 1217 unsigned int seq;
1194 1218
1195 WARN_ON(timekeeping_suspended); 1219 WARN_ON(timekeeping_suspended);
1196 1220
1197 do { 1221 do {
1198 seq = read_seqbegin(&timekeeper.lock); 1222 seq = read_seqbegin(&tk->lock);
1199 ts->tv_sec = timekeeper.xtime_sec; 1223 ts->tv_sec = tk->xtime_sec;
1200 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 1224 ts->tv_nsec = timekeeping_get_ns(tk);
1201 tomono = timekeeper.wall_to_monotonic; 1225 tomono = tk->wall_to_monotonic;
1202 sleep = timekeeper.total_sleep_time; 1226 sleep = tk->total_sleep_time;
1203 1227
1204 } while (read_seqretry(&timekeeper.lock, seq)); 1228 } while (read_seqretry(&tk->lock, seq));
1205 1229
1206 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1230 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1207 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); 1231 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
@@ -1231,31 +1255,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1231 */ 1255 */
1232void monotonic_to_bootbased(struct timespec *ts) 1256void monotonic_to_bootbased(struct timespec *ts)
1233{ 1257{
1234 *ts = timespec_add(*ts, timekeeper.total_sleep_time); 1258 struct timekeeper *tk = &timekeeper;
1259
1260 *ts = timespec_add(*ts, tk->total_sleep_time);
1235} 1261}
1236EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1262EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1237 1263
1238unsigned long get_seconds(void) 1264unsigned long get_seconds(void)
1239{ 1265{
1240 return timekeeper.xtime_sec; 1266 struct timekeeper *tk = &timekeeper;
1267
1268 return tk->xtime_sec;
1241} 1269}
1242EXPORT_SYMBOL(get_seconds); 1270EXPORT_SYMBOL(get_seconds);
1243 1271
1244struct timespec __current_kernel_time(void) 1272struct timespec __current_kernel_time(void)
1245{ 1273{
1246 return tk_xtime(&timekeeper); 1274 struct timekeeper *tk = &timekeeper;
1275
1276 return tk_xtime(tk);
1247} 1277}
1248 1278
1249struct timespec current_kernel_time(void) 1279struct timespec current_kernel_time(void)
1250{ 1280{
1281 struct timekeeper *tk = &timekeeper;
1251 struct timespec now; 1282 struct timespec now;
1252 unsigned long seq; 1283 unsigned long seq;
1253 1284
1254 do { 1285 do {
1255 seq = read_seqbegin(&timekeeper.lock); 1286 seq = read_seqbegin(&tk->lock);
1256 1287
1257 now = tk_xtime(&timekeeper); 1288 now = tk_xtime(tk);
1258 } while (read_seqretry(&timekeeper.lock, seq)); 1289 } while (read_seqretry(&tk->lock, seq));
1259 1290
1260 return now; 1291 return now;
1261} 1292}
@@ -1263,15 +1294,16 @@ EXPORT_SYMBOL(current_kernel_time);
1263 1294
1264struct timespec get_monotonic_coarse(void) 1295struct timespec get_monotonic_coarse(void)
1265{ 1296{
1297 struct timekeeper *tk = &timekeeper;
1266 struct timespec now, mono; 1298 struct timespec now, mono;
1267 unsigned long seq; 1299 unsigned long seq;
1268 1300
1269 do { 1301 do {
1270 seq = read_seqbegin(&timekeeper.lock); 1302 seq = read_seqbegin(&tk->lock);
1271 1303
1272 now = tk_xtime(&timekeeper); 1304 now = tk_xtime(tk);
1273 mono = timekeeper.wall_to_monotonic; 1305 mono = tk->wall_to_monotonic;
1274 } while (read_seqretry(&timekeeper.lock, seq)); 1306 } while (read_seqretry(&tk->lock, seq));
1275 1307
1276 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1308 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1277 now.tv_nsec + mono.tv_nsec); 1309 now.tv_nsec + mono.tv_nsec);
@@ -1300,14 +1332,15 @@ void do_timer(unsigned long ticks)
1300void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1332void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1301 struct timespec *wtom, struct timespec *sleep) 1333 struct timespec *wtom, struct timespec *sleep)
1302{ 1334{
1335 struct timekeeper *tk = &timekeeper;
1303 unsigned long seq; 1336 unsigned long seq;
1304 1337
1305 do { 1338 do {
1306 seq = read_seqbegin(&timekeeper.lock); 1339 seq = read_seqbegin(&tk->lock);
1307 *xtim = tk_xtime(&timekeeper); 1340 *xtim = tk_xtime(tk);
1308 *wtom = timekeeper.wall_to_monotonic; 1341 *wtom = tk->wall_to_monotonic;
1309 *sleep = timekeeper.total_sleep_time; 1342 *sleep = tk->total_sleep_time;
1310 } while (read_seqretry(&timekeeper.lock, seq)); 1343 } while (read_seqretry(&tk->lock, seq));
1311} 1344}
1312 1345
1313#ifdef CONFIG_HIGH_RES_TIMERS 1346#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1321,19 +1354,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1321 */ 1354 */
1322ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1355ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1323{ 1356{
1357 struct timekeeper *tk = &timekeeper;
1324 ktime_t now; 1358 ktime_t now;
1325 unsigned int seq; 1359 unsigned int seq;
1326 u64 secs, nsecs; 1360 u64 secs, nsecs;
1327 1361
1328 do { 1362 do {
1329 seq = read_seqbegin(&timekeeper.lock); 1363 seq = read_seqbegin(&tk->lock);
1330 1364
1331 secs = timekeeper.xtime_sec; 1365 secs = tk->xtime_sec;
1332 nsecs = timekeeping_get_ns(&timekeeper); 1366 nsecs = timekeeping_get_ns(tk);
1333 1367
1334 *offs_real = timekeeper.offs_real; 1368 *offs_real = tk->offs_real;
1335 *offs_boot = timekeeper.offs_boot; 1369 *offs_boot = tk->offs_boot;
1336 } while (read_seqretry(&timekeeper.lock, seq)); 1370 } while (read_seqretry(&tk->lock, seq));
1337 1371
1338 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1372 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1339 now = ktime_sub(now, *offs_real); 1373 now = ktime_sub(now, *offs_real);
@@ -1346,19 +1380,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1346 */ 1380 */
1347ktime_t ktime_get_monotonic_offset(void) 1381ktime_t ktime_get_monotonic_offset(void)
1348{ 1382{
1383 struct timekeeper *tk = &timekeeper;
1349 unsigned long seq; 1384 unsigned long seq;
1350 struct timespec wtom; 1385 struct timespec wtom;
1351 1386
1352 do { 1387 do {
1353 seq = read_seqbegin(&timekeeper.lock); 1388 seq = read_seqbegin(&tk->lock);
1354 wtom = timekeeper.wall_to_monotonic; 1389 wtom = tk->wall_to_monotonic;
1355 } while (read_seqretry(&timekeeper.lock, seq)); 1390 } while (read_seqretry(&tk->lock, seq));
1356 1391
1357 return timespec_to_ktime(wtom); 1392 return timespec_to_ktime(wtom);
1358} 1393}
1359EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1394EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1360 1395
1361
1362/** 1396/**
1363 * xtime_update() - advances the timekeeping infrastructure 1397 * xtime_update() - advances the timekeeping infrastructure
1364 * @ticks: number of ticks, that have elapsed since the last call. 1398 * @ticks: number of ticks, that have elapsed since the last call.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a120f98c4112..5c38c81496ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
3187 } 3187 }
3188 destroy_trace_option_files(topts); 3188 destroy_trace_option_files(topts);
3189 3189
3190 current_trace = t; 3190 current_trace = &nop_trace;
3191 3191
3192 topts = create_trace_option_files(current_trace); 3192 topts = create_trace_option_files(t);
3193 if (current_trace->use_max_tr) { 3193 if (t->use_max_tr) {
3194 int cpu; 3194 int cpu;
3195 /* we need to make per cpu buffer sizes equivalent */ 3195 /* we need to make per cpu buffer sizes equivalent */
3196 for_each_tracing_cpu(cpu) { 3196 for_each_tracing_cpu(cpu) {
@@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
3210 goto out; 3210 goto out;
3211 } 3211 }
3212 3212
3213 current_trace = t;
3213 trace_branch_enable(tr); 3214 trace_branch_enable(tr);
3214 out: 3215 out:
3215 mutex_unlock(&trace_types_lock); 3216 mutex_unlock(&trace_types_lock);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 9824419c8404..84b1e045faba 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -282,7 +282,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
282 282
283 head = this_cpu_ptr(event_function.perf_events); 283 head = this_cpu_ptr(event_function.perf_events);
284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
285 1, &regs, head); 285 1, &regs, head, NULL);
286 286
287#undef ENTRY_SIZE 287#undef ENTRY_SIZE
288} 288}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index fdff65dff1bb..483162a9f908 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
@@ -75,6 +76,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
75 preempt_enable_notrace(); 76 preempt_enable_notrace();
76} 77}
77 78
79/* Our two options */
80enum {
81 TRACE_FUNC_OPT_STACK = 0x1,
82 TRACE_FUNC_OPT_PSTORE = 0x2,
83};
84
85static struct tracer_flags func_flags;
86
78static void 87static void
79function_trace_call(unsigned long ip, unsigned long parent_ip, 88function_trace_call(unsigned long ip, unsigned long parent_ip,
80 struct ftrace_ops *op, struct pt_regs *pt_regs) 89 struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -100,6 +109,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
100 disabled = atomic_inc_return(&data->disabled); 109 disabled = atomic_inc_return(&data->disabled);
101 110
102 if (likely(disabled == 1)) { 111 if (likely(disabled == 1)) {
112 /*
113 * So far tracing doesn't support multiple buffers, so
114 * we make an explicit call for now.
115 */
116 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
117 pstore_ftrace_call(ip, parent_ip);
103 pc = preempt_count(); 118 pc = preempt_count();
104 trace_function(tr, ip, parent_ip, flags, pc); 119 trace_function(tr, ip, parent_ip, flags, pc);
105 } 120 }
@@ -162,15 +177,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
162 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 177 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
163}; 178};
164 179
165/* Our two options */
166enum {
167 TRACE_FUNC_OPT_STACK = 0x1,
168};
169
170static struct tracer_opt func_opts[] = { 180static struct tracer_opt func_opts[] = {
171#ifdef CONFIG_STACKTRACE 181#ifdef CONFIG_STACKTRACE
172 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 182 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
173#endif 183#endif
184#ifdef CONFIG_PSTORE_FTRACE
185 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
186#endif
174 { } /* Always set a last empty entry */ 187 { } /* Always set a last empty entry */
175}; 188};
176 189
@@ -208,10 +221,11 @@ static void tracing_stop_function_trace(void)
208 221
209static int func_set_flag(u32 old_flags, u32 bit, int set) 222static int func_set_flag(u32 old_flags, u32 bit, int set)
210{ 223{
211 if (bit == TRACE_FUNC_OPT_STACK) { 224 switch (bit) {
225 case TRACE_FUNC_OPT_STACK:
212 /* do nothing if already set */ 226 /* do nothing if already set */
213 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 227 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
214 return 0; 228 break;
215 229
216 if (set) { 230 if (set) {
217 unregister_ftrace_function(&trace_ops); 231 unregister_ftrace_function(&trace_ops);
@@ -221,10 +235,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
221 register_ftrace_function(&trace_ops); 235 register_ftrace_function(&trace_ops);
222 } 236 }
223 237
224 return 0; 238 break;
239 case TRACE_FUNC_OPT_PSTORE:
240 break;
241 default:
242 return -EINVAL;
225 } 243 }
226 244
227 return -EINVAL; 245 return 0;
228} 246}
229 247
230static struct tracer function_trace __read_mostly = 248static struct tracer function_trace __read_mostly =
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5699fe..1a2117043bb1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1003 1003
1004 head = this_cpu_ptr(call->perf_events); 1004 head = this_cpu_ptr(call->perf_events);
1005 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1005 perf_trace_buf_submit(entry, size, rctx,
1006 entry->ip, 1, regs, head, NULL);
1006} 1007}
1007 1008
1008/* Kretprobe profile handler */ 1009/* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1033 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1034 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1034 1035
1035 head = this_cpu_ptr(call->perf_events); 1036 head = this_cpu_ptr(call->perf_events);
1036 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1037 perf_trace_buf_submit(entry, size, rctx,
1038 entry->ret_ip, 1, regs, head, NULL);
1037} 1039}
1038#endif /* CONFIG_PERF_EVENTS */ 1040#endif /* CONFIG_PERF_EVENTS */
1039 1041
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc73369099..60e4d7875672 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
532 (unsigned long *)&rec->args); 532 (unsigned long *)&rec->args);
533 533
534 head = this_cpu_ptr(sys_data->enter_event->perf_events); 534 head = this_cpu_ptr(sys_data->enter_event->perf_events);
535 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 535 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
536} 536}
537 537
538int perf_sysenter_enable(struct ftrace_event_call *call) 538int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
608 rec->ret = syscall_get_return_value(current, regs); 608 rec->ret = syscall_get_return_value(current, regs);
609 609
610 head = this_cpu_ptr(sys_data->exit_event->perf_events); 610 head = this_cpu_ptr(sys_data->exit_event->perf_events);
611 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 611 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
612} 612}
613 613
614int perf_sysexit_enable(struct ftrace_event_call *call) 614int perf_sysexit_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2b36ac68549e..03003cd7dd96 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671 671
672 head = this_cpu_ptr(call->perf_events); 672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
674 674
675 out: 675 out:
676 preempt_enable(); 676 preempt_enable();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..69add8a9da68 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -575,7 +575,7 @@ out:
575/* 575/*
576 * Create/destroy watchdog threads as CPUs come and go: 576 * Create/destroy watchdog threads as CPUs come and go:
577 */ 577 */
578static int __cpuinit 578static int
579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
580{ 580{
581 int hotcpu = (unsigned long)hcpu; 581 int hotcpu = (unsigned long)hcpu;
@@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
610 return NOTIFY_OK; 610 return NOTIFY_OK;
611} 611}
612 612
613static struct notifier_block __cpuinitdata cpu_nfb = { 613static struct notifier_block cpu_nfb = {
614 .notifier_call = cpu_callback 614 .notifier_call = cpu_callback
615}; 615};
616 616
617#ifdef CONFIG_SUSPEND
618/*
619 * On exit from suspend we force an offline->online transition on the boot CPU
620 * so that the PMU state that was lost while in suspended state gets set up
621 * properly for the boot CPU. This information is required for restarting the
622 * NMI watchdog.
623 */
624void lockup_detector_bootcpu_resume(void)
625{
626 void *cpu = (void *)(long)smp_processor_id();
627
628 cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
629 cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
630 cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
631}
632#endif
633
617void __init lockup_detector_init(void) 634void __init lockup_detector_init(void)
618{ 635{
619 void *cpu = (void *)(long)smp_processor_id(); 636 void *cpu = (void *)(long)smp_processor_id();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..692d97628a10 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,41 @@
45#include "workqueue_sched.h" 45#include "workqueue_sched.h"
46 46
47enum { 47enum {
48 /* global_cwq flags */ 48 /*
49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 49 * global_cwq flags
50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ 50 *
51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 51 * A bound gcwq is either associated or disassociated with its CPU.
52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */ 52 * While associated (!DISASSOCIATED), all workers are bound to the
53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ 53 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect.
55 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 *
60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress.
63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
54 69
55 /* worker flags */ 70 /* worker flags */
56 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
57 WORKER_DIE = 1 << 1, /* die die die */ 72 WORKER_DIE = 1 << 1, /* die die die */
58 WORKER_IDLE = 1 << 2, /* is idle */ 73 WORKER_IDLE = 1 << 2, /* is idle */
59 WORKER_PREP = 1 << 3, /* preparing to run works */ 74 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */ 75 WORKER_REBIND = 1 << 5, /* mom is home, come back */
62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
64 78
65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | 79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND, 80 WORKER_CPU_INTENSIVE,
67 81
68 /* gcwq->trustee_state */ 82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
74 83
75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, 85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +93,13 @@ enum {
84 (min two ticks) */ 93 (min two ticks) */
85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 94 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
86 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 95 CREATE_COOLDOWN = HZ, /* time to breath after fail */
87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
88 96
89 /* 97 /*
90 * Rescue workers are used only on emergencies and shared by 98 * Rescue workers are used only on emergencies and shared by
91 * all cpus. Give -20. 99 * all cpus. Give -20.
92 */ 100 */
93 RESCUER_NICE_LEVEL = -20, 101 RESCUER_NICE_LEVEL = -20,
102 HIGHPRI_NICE_LEVEL = -20,
94}; 103};
95 104
96/* 105/*
@@ -115,6 +124,8 @@ enum {
115 */ 124 */
116 125
117struct global_cwq; 126struct global_cwq;
127struct worker_pool;
128struct idle_rebind;
118 129
119/* 130/*
120 * The poor guys doing the actual heavy lifting. All on-duty workers 131 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -131,12 +142,31 @@ struct worker {
131 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ 142 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
132 struct list_head scheduled; /* L: scheduled works */ 143 struct list_head scheduled; /* L: scheduled works */
133 struct task_struct *task; /* I: worker task */ 144 struct task_struct *task; /* I: worker task */
134 struct global_cwq *gcwq; /* I: the associated gcwq */ 145 struct worker_pool *pool; /* I: the associated pool */
135 /* 64 bytes boundary on 64bit, 32 on 32bit */ 146 /* 64 bytes boundary on 64bit, 32 on 32bit */
136 unsigned long last_active; /* L: last active timestamp */ 147 unsigned long last_active; /* L: last active timestamp */
137 unsigned int flags; /* X: flags */ 148 unsigned int flags; /* X: flags */
138 int id; /* I: worker id */ 149 int id; /* I: worker id */
139 struct work_struct rebind_work; /* L: rebind worker to cpu */ 150
151 /* for rebinding worker to CPU */
152 struct idle_rebind *idle_rebind; /* L: for idle worker */
153 struct work_struct rebind_work; /* L: for busy worker */
154};
155
156struct worker_pool {
157 struct global_cwq *gcwq; /* I: the owning gcwq */
158 unsigned int flags; /* X: flags */
159
160 struct list_head worklist; /* L: list of pending works */
161 int nr_workers; /* L: total number of workers */
162 int nr_idle; /* L: currently idle ones */
163
164 struct list_head idle_list; /* X: list of idle workers */
165 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */
167
168 struct mutex manager_mutex; /* mutex manager should hold */
169 struct ida worker_ida; /* L: for worker IDs */
140}; 170};
141 171
142/* 172/*
@@ -146,27 +176,16 @@ struct worker {
146 */ 176 */
147struct global_cwq { 177struct global_cwq {
148 spinlock_t lock; /* the gcwq lock */ 178 spinlock_t lock; /* the gcwq lock */
149 struct list_head worklist; /* L: list of pending works */
150 unsigned int cpu; /* I: the associated cpu */ 179 unsigned int cpu; /* I: the associated cpu */
151 unsigned int flags; /* L: GCWQ_* flags */ 180 unsigned int flags; /* L: GCWQ_* flags */
152 181
153 int nr_workers; /* L: total number of workers */ 182 /* workers are chained either in busy_hash or pool idle_list */
154 int nr_idle; /* L: currently idle ones */
155
156 /* workers are chained either in the idle_list or busy_hash */
157 struct list_head idle_list; /* X: list of idle workers */
158 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
159 /* L: hash of busy workers */ 184 /* L: hash of busy workers */
160 185
161 struct timer_list idle_timer; /* L: worker idle timeout */ 186 struct worker_pool pools[2]; /* normal and highpri pools */
162 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
163
164 struct ida worker_ida; /* L: for worker IDs */
165 187
166 struct task_struct *trustee; /* L: for gcwq shutdown */ 188 wait_queue_head_t rebind_hold; /* rebind hold wait */
167 unsigned int trustee_state; /* L: trustee state */
168 wait_queue_head_t trustee_wait; /* trustee wait */
169 struct worker *first_idle; /* L: first idle worker */
170} ____cacheline_aligned_in_smp; 189} ____cacheline_aligned_in_smp;
171 190
172/* 191/*
@@ -175,7 +194,7 @@ struct global_cwq {
175 * aligned at two's power of the number of flag bits. 194 * aligned at two's power of the number of flag bits.
176 */ 195 */
177struct cpu_workqueue_struct { 196struct cpu_workqueue_struct {
178 struct global_cwq *gcwq; /* I: the associated gcwq */ 197 struct worker_pool *pool; /* I: the associated pool */
179 struct workqueue_struct *wq; /* I: the owning workqueue */ 198 struct workqueue_struct *wq; /* I: the owning workqueue */
180 int work_color; /* L: current color */ 199 int work_color; /* L: current color */
181 int flush_color; /* L: flushing color */ 200 int flush_color; /* L: flushing color */
@@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
264#define CREATE_TRACE_POINTS 283#define CREATE_TRACE_POINTS
265#include <trace/events/workqueue.h> 284#include <trace/events/workqueue.h>
266 285
286#define for_each_worker_pool(pool, gcwq) \
287 for ((pool) = &(gcwq)->pools[0]; \
288 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
289
267#define for_each_busy_worker(worker, i, pos, gcwq) \ 290#define for_each_busy_worker(worker, i, pos, gcwq) \
268 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 291 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
269 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 292 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
444 * try_to_wake_up(). Put it in a separate cacheline. 467 * try_to_wake_up(). Put it in a separate cacheline.
445 */ 468 */
446static DEFINE_PER_CPU(struct global_cwq, global_cwq); 469static DEFINE_PER_CPU(struct global_cwq, global_cwq);
447static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); 470static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
448 471
449/* 472/*
450 * Global cpu workqueue and nr_running counter for unbound gcwq. The 473 * Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
452 * workers have WORKER_UNBOUND set. 475 * workers have WORKER_UNBOUND set.
453 */ 476 */
454static struct global_cwq unbound_global_cwq; 477static struct global_cwq unbound_global_cwq;
455static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ 478static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
479 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
480};
456 481
457static int worker_thread(void *__worker); 482static int worker_thread(void *__worker);
458 483
484static int worker_pool_pri(struct worker_pool *pool)
485{
486 return pool - pool->gcwq->pools;
487}
488
459static struct global_cwq *get_gcwq(unsigned int cpu) 489static struct global_cwq *get_gcwq(unsigned int cpu)
460{ 490{
461 if (cpu != WORK_CPU_UNBOUND) 491 if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
464 return &unbound_global_cwq; 494 return &unbound_global_cwq;
465} 495}
466 496
467static atomic_t *get_gcwq_nr_running(unsigned int cpu) 497static atomic_t *get_pool_nr_running(struct worker_pool *pool)
468{ 498{
499 int cpu = pool->gcwq->cpu;
500 int idx = worker_pool_pri(pool);
501
469 if (cpu != WORK_CPU_UNBOUND) 502 if (cpu != WORK_CPU_UNBOUND)
470 return &per_cpu(gcwq_nr_running, cpu); 503 return &per_cpu(pool_nr_running, cpu)[idx];
471 else 504 else
472 return &unbound_gcwq_nr_running; 505 return &unbound_pool_nr_running[idx];
473} 506}
474 507
475static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 508static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
@@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
555 588
556 if (data & WORK_STRUCT_CWQ) 589 if (data & WORK_STRUCT_CWQ)
557 return ((struct cpu_workqueue_struct *) 590 return ((struct cpu_workqueue_struct *)
558 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; 591 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
559 592
560 cpu = data >> WORK_STRUCT_FLAG_BITS; 593 cpu = data >> WORK_STRUCT_FLAG_BITS;
561 if (cpu == WORK_CPU_NONE) 594 if (cpu == WORK_CPU_NONE)
@@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
566} 599}
567 600
568/* 601/*
569 * Policy functions. These define the policies on how the global 602 * Policy functions. These define the policies on how the global worker
570 * worker pool is managed. Unless noted otherwise, these functions 603 * pools are managed. Unless noted otherwise, these functions assume that
571 * assume that they're being called with gcwq->lock held. 604 * they're being called with gcwq->lock held.
572 */ 605 */
573 606
574static bool __need_more_worker(struct global_cwq *gcwq) 607static bool __need_more_worker(struct worker_pool *pool)
575{ 608{
576 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || 609 return !atomic_read(get_pool_nr_running(pool));
577 gcwq->flags & GCWQ_HIGHPRI_PENDING;
578} 610}
579 611
580/* 612/*
581 * Need to wake up a worker? Called from anything but currently 613 * Need to wake up a worker? Called from anything but currently
582 * running workers. 614 * running workers.
615 *
616 * Note that, because unbound workers never contribute to nr_running, this
617 * function will always return %true for unbound gcwq as long as the
618 * worklist isn't empty.
583 */ 619 */
584static bool need_more_worker(struct global_cwq *gcwq) 620static bool need_more_worker(struct worker_pool *pool)
585{ 621{
586 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); 622 return !list_empty(&pool->worklist) && __need_more_worker(pool);
587} 623}
588 624
589/* Can I start working? Called from busy but !running workers. */ 625/* Can I start working? Called from busy but !running workers. */
590static bool may_start_working(struct global_cwq *gcwq) 626static bool may_start_working(struct worker_pool *pool)
591{ 627{
592 return gcwq->nr_idle; 628 return pool->nr_idle;
593} 629}
594 630
595/* Do I need to keep working? Called from currently running workers. */ 631/* Do I need to keep working? Called from currently running workers. */
596static bool keep_working(struct global_cwq *gcwq) 632static bool keep_working(struct worker_pool *pool)
597{ 633{
598 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 634 atomic_t *nr_running = get_pool_nr_running(pool);
599 635
600 return !list_empty(&gcwq->worklist) && 636 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
601 (atomic_read(nr_running) <= 1 ||
602 gcwq->flags & GCWQ_HIGHPRI_PENDING);
603} 637}
604 638
605/* Do we need a new worker? Called from manager. */ 639/* Do we need a new worker? Called from manager. */
606static bool need_to_create_worker(struct global_cwq *gcwq) 640static bool need_to_create_worker(struct worker_pool *pool)
607{ 641{
608 return need_more_worker(gcwq) && !may_start_working(gcwq); 642 return need_more_worker(pool) && !may_start_working(pool);
609} 643}
610 644
611/* Do I need to be the manager? */ 645/* Do I need to be the manager? */
612static bool need_to_manage_workers(struct global_cwq *gcwq) 646static bool need_to_manage_workers(struct worker_pool *pool)
613{ 647{
614 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; 648 return need_to_create_worker(pool) ||
649 (pool->flags & POOL_MANAGE_WORKERS);
615} 650}
616 651
617/* Do we have too many workers and should some go away? */ 652/* Do we have too many workers and should some go away? */
618static bool too_many_workers(struct global_cwq *gcwq) 653static bool too_many_workers(struct worker_pool *pool)
619{ 654{
620 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; 655 bool managing = mutex_is_locked(&pool->manager_mutex);
621 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ 656 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
622 int nr_busy = gcwq->nr_workers - nr_idle; 657 int nr_busy = pool->nr_workers - nr_idle;
623 658
624 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 659 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
625} 660}
@@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
629 */ 664 */
630 665
631/* Return the first worker. Safe with preemption disabled */ 666/* Return the first worker. Safe with preemption disabled */
632static struct worker *first_worker(struct global_cwq *gcwq) 667static struct worker *first_worker(struct worker_pool *pool)
633{ 668{
634 if (unlikely(list_empty(&gcwq->idle_list))) 669 if (unlikely(list_empty(&pool->idle_list)))
635 return NULL; 670 return NULL;
636 671
637 return list_first_entry(&gcwq->idle_list, struct worker, entry); 672 return list_first_entry(&pool->idle_list, struct worker, entry);
638} 673}
639 674
640/** 675/**
641 * wake_up_worker - wake up an idle worker 676 * wake_up_worker - wake up an idle worker
642 * @gcwq: gcwq to wake worker for 677 * @pool: worker pool to wake worker from
643 * 678 *
644 * Wake up the first idle worker of @gcwq. 679 * Wake up the first idle worker of @pool.
645 * 680 *
646 * CONTEXT: 681 * CONTEXT:
647 * spin_lock_irq(gcwq->lock). 682 * spin_lock_irq(gcwq->lock).
648 */ 683 */
649static void wake_up_worker(struct global_cwq *gcwq) 684static void wake_up_worker(struct worker_pool *pool)
650{ 685{
651 struct worker *worker = first_worker(gcwq); 686 struct worker *worker = first_worker(pool);
652 687
653 if (likely(worker)) 688 if (likely(worker))
654 wake_up_process(worker->task); 689 wake_up_process(worker->task);
@@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
670 struct worker *worker = kthread_data(task); 705 struct worker *worker = kthread_data(task);
671 706
672 if (!(worker->flags & WORKER_NOT_RUNNING)) 707 if (!(worker->flags & WORKER_NOT_RUNNING))
673 atomic_inc(get_gcwq_nr_running(cpu)); 708 atomic_inc(get_pool_nr_running(worker->pool));
674} 709}
675 710
676/** 711/**
@@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
692 unsigned int cpu) 727 unsigned int cpu)
693{ 728{
694 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 729 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
695 struct global_cwq *gcwq = get_gcwq(cpu); 730 struct worker_pool *pool = worker->pool;
696 atomic_t *nr_running = get_gcwq_nr_running(cpu); 731 atomic_t *nr_running = get_pool_nr_running(pool);
697 732
698 if (worker->flags & WORKER_NOT_RUNNING) 733 if (worker->flags & WORKER_NOT_RUNNING)
699 return NULL; 734 return NULL;
@@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
706 * worklist not empty test sequence is in insert_work(). 741 * worklist not empty test sequence is in insert_work().
707 * Please read comment there. 742 * Please read comment there.
708 * 743 *
709 * NOT_RUNNING is clear. This means that trustee is not in 744 * NOT_RUNNING is clear. This means that we're bound to and
710 * charge and we're running on the local cpu w/ rq lock held 745 * running on the local cpu w/ rq lock held and preemption
711 * and preemption disabled, which in turn means that none else 746 * disabled, which in turn means that none else could be
712 * could be manipulating idle_list, so dereferencing idle_list 747 * manipulating idle_list, so dereferencing idle_list without gcwq
713 * without gcwq lock is safe. 748 * lock is safe.
714 */ 749 */
715 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) 750 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
716 to_wakeup = first_worker(gcwq); 751 to_wakeup = first_worker(pool);
717 return to_wakeup ? to_wakeup->task : NULL; 752 return to_wakeup ? to_wakeup->task : NULL;
718} 753}
719 754
@@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
733static inline void worker_set_flags(struct worker *worker, unsigned int flags, 768static inline void worker_set_flags(struct worker *worker, unsigned int flags,
734 bool wakeup) 769 bool wakeup)
735{ 770{
736 struct global_cwq *gcwq = worker->gcwq; 771 struct worker_pool *pool = worker->pool;
737 772
738 WARN_ON_ONCE(worker->task != current); 773 WARN_ON_ONCE(worker->task != current);
739 774
@@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
744 */ 779 */
745 if ((flags & WORKER_NOT_RUNNING) && 780 if ((flags & WORKER_NOT_RUNNING) &&
746 !(worker->flags & WORKER_NOT_RUNNING)) { 781 !(worker->flags & WORKER_NOT_RUNNING)) {
747 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 782 atomic_t *nr_running = get_pool_nr_running(pool);
748 783
749 if (wakeup) { 784 if (wakeup) {
750 if (atomic_dec_and_test(nr_running) && 785 if (atomic_dec_and_test(nr_running) &&
751 !list_empty(&gcwq->worklist)) 786 !list_empty(&pool->worklist))
752 wake_up_worker(gcwq); 787 wake_up_worker(pool);
753 } else 788 } else
754 atomic_dec(nr_running); 789 atomic_dec(nr_running);
755 } 790 }
@@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
769 */ 804 */
770static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 805static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
771{ 806{
772 struct global_cwq *gcwq = worker->gcwq; 807 struct worker_pool *pool = worker->pool;
773 unsigned int oflags = worker->flags; 808 unsigned int oflags = worker->flags;
774 809
775 WARN_ON_ONCE(worker->task != current); 810 WARN_ON_ONCE(worker->task != current);
@@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
783 */ 818 */
784 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 819 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
785 if (!(worker->flags & WORKER_NOT_RUNNING)) 820 if (!(worker->flags & WORKER_NOT_RUNNING))
786 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 821 atomic_inc(get_pool_nr_running(pool));
787} 822}
788 823
789/** 824/**
@@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
867} 902}
868 903
869/** 904/**
870 * gcwq_determine_ins_pos - find insertion position
871 * @gcwq: gcwq of interest
872 * @cwq: cwq a work is being queued for
873 *
874 * A work for @cwq is about to be queued on @gcwq, determine insertion
875 * position for the work. If @cwq is for HIGHPRI wq, the work is
876 * queued at the head of the queue but in FIFO order with respect to
877 * other HIGHPRI works; otherwise, at the end of the queue. This
878 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
879 * there are HIGHPRI works pending.
880 *
881 * CONTEXT:
882 * spin_lock_irq(gcwq->lock).
883 *
884 * RETURNS:
885 * Pointer to inserstion position.
886 */
887static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
888 struct cpu_workqueue_struct *cwq)
889{
890 struct work_struct *twork;
891
892 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
893 return &gcwq->worklist;
894
895 list_for_each_entry(twork, &gcwq->worklist, entry) {
896 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
897
898 if (!(tcwq->wq->flags & WQ_HIGHPRI))
899 break;
900 }
901
902 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
903 return &twork->entry;
904}
905
906/**
907 * insert_work - insert a work into gcwq 905 * insert_work - insert a work into gcwq
908 * @cwq: cwq @work belongs to 906 * @cwq: cwq @work belongs to
909 * @work: work to insert 907 * @work: work to insert
@@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
920 struct work_struct *work, struct list_head *head, 918 struct work_struct *work, struct list_head *head,
921 unsigned int extra_flags) 919 unsigned int extra_flags)
922{ 920{
923 struct global_cwq *gcwq = cwq->gcwq; 921 struct worker_pool *pool = cwq->pool;
924 922
925 /* we own @work, set data and link */ 923 /* we own @work, set data and link */
926 set_work_cwq(work, cwq, extra_flags); 924 set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
940 */ 938 */
941 smp_mb(); 939 smp_mb();
942 940
943 if (__need_more_worker(gcwq)) 941 if (__need_more_worker(pool))
944 wake_up_worker(gcwq); 942 wake_up_worker(pool);
945} 943}
946 944
947/* 945/*
@@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1043 if (likely(cwq->nr_active < cwq->max_active)) { 1041 if (likely(cwq->nr_active < cwq->max_active)) {
1044 trace_workqueue_activate_work(work); 1042 trace_workqueue_activate_work(work);
1045 cwq->nr_active++; 1043 cwq->nr_active++;
1046 worklist = gcwq_determine_ins_pos(gcwq, cwq); 1044 worklist = &cwq->pool->worklist;
1047 } else { 1045 } else {
1048 work_flags |= WORK_STRUCT_DELAYED; 1046 work_flags |= WORK_STRUCT_DELAYED;
1049 worklist = &cwq->delayed_works; 1047 worklist = &cwq->delayed_works;
@@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1192 */ 1190 */
1193static void worker_enter_idle(struct worker *worker) 1191static void worker_enter_idle(struct worker *worker)
1194{ 1192{
1195 struct global_cwq *gcwq = worker->gcwq; 1193 struct worker_pool *pool = worker->pool;
1194 struct global_cwq *gcwq = pool->gcwq;
1196 1195
1197 BUG_ON(worker->flags & WORKER_IDLE); 1196 BUG_ON(worker->flags & WORKER_IDLE);
1198 BUG_ON(!list_empty(&worker->entry) && 1197 BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker)
1200 1199
1201 /* can't use worker_set_flags(), also called from start_worker() */ 1200 /* can't use worker_set_flags(), also called from start_worker() */
1202 worker->flags |= WORKER_IDLE; 1201 worker->flags |= WORKER_IDLE;
1203 gcwq->nr_idle++; 1202 pool->nr_idle++;
1204 worker->last_active = jiffies; 1203 worker->last_active = jiffies;
1205 1204
1206 /* idle_list is LIFO */ 1205 /* idle_list is LIFO */
1207 list_add(&worker->entry, &gcwq->idle_list); 1206 list_add(&worker->entry, &pool->idle_list);
1208 1207
1209 if (likely(!(worker->flags & WORKER_ROGUE))) { 1208 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1210 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) 1209 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1211 mod_timer(&gcwq->idle_timer,
1212 jiffies + IDLE_WORKER_TIMEOUT);
1213 } else
1214 wake_up_all(&gcwq->trustee_wait);
1215 1210
1216 /* 1211 /*
1217 * Sanity check nr_running. Because trustee releases gcwq->lock 1212 * Sanity check nr_running. Because gcwq_unbind_fn() releases
1218 * between setting %WORKER_ROGUE and zapping nr_running, the 1213 * gcwq->lock between setting %WORKER_UNBOUND and zapping
1219 * warning may trigger spuriously. Check iff trustee is idle. 1214 * nr_running, the warning may trigger spuriously. Check iff
1215 * unbind is not in progress.
1220 */ 1216 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && 1217 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1222 gcwq->nr_workers == gcwq->nr_idle && 1218 pool->nr_workers == pool->nr_idle &&
1223 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1219 atomic_read(get_pool_nr_running(pool)));
1224} 1220}
1225 1221
1226/** 1222/**
@@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker)
1234 */ 1230 */
1235static void worker_leave_idle(struct worker *worker) 1231static void worker_leave_idle(struct worker *worker)
1236{ 1232{
1237 struct global_cwq *gcwq = worker->gcwq; 1233 struct worker_pool *pool = worker->pool;
1238 1234
1239 BUG_ON(!(worker->flags & WORKER_IDLE)); 1235 BUG_ON(!(worker->flags & WORKER_IDLE));
1240 worker_clr_flags(worker, WORKER_IDLE); 1236 worker_clr_flags(worker, WORKER_IDLE);
1241 gcwq->nr_idle--; 1237 pool->nr_idle--;
1242 list_del_init(&worker->entry); 1238 list_del_init(&worker->entry);
1243} 1239}
1244 1240
@@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker)
1258 * verbatim as it's best effort and blocking and gcwq may be 1254 * verbatim as it's best effort and blocking and gcwq may be
1259 * [dis]associated in the meantime. 1255 * [dis]associated in the meantime.
1260 * 1256 *
1261 * This function tries set_cpus_allowed() and locks gcwq and verifies 1257 * This function tries set_cpus_allowed() and locks gcwq and verifies the
1262 * the binding against GCWQ_DISASSOCIATED which is set during 1258 * binding against %GCWQ_DISASSOCIATED which is set during
1263 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters 1259 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1264 * idle state or fetches works without dropping lock, it can guarantee 1260 * enters idle state or fetches works without dropping lock, it can
1265 * the scheduling requirement described in the first paragraph. 1261 * guarantee the scheduling requirement described in the first paragraph.
1266 * 1262 *
1267 * CONTEXT: 1263 * CONTEXT:
1268 * Might sleep. Called without any lock but returns with gcwq->lock 1264 * Might sleep. Called without any lock but returns with gcwq->lock
@@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker)
1275static bool worker_maybe_bind_and_lock(struct worker *worker) 1271static bool worker_maybe_bind_and_lock(struct worker *worker)
1276__acquires(&gcwq->lock) 1272__acquires(&gcwq->lock)
1277{ 1273{
1278 struct global_cwq *gcwq = worker->gcwq; 1274 struct global_cwq *gcwq = worker->pool->gcwq;
1279 struct task_struct *task = worker->task; 1275 struct task_struct *task = worker->task;
1280 1276
1281 while (true) { 1277 while (true) {
@@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock)
1308 } 1304 }
1309} 1305}
1310 1306
1307struct idle_rebind {
1308 int cnt; /* # workers to be rebound */
1309 struct completion done; /* all workers rebound */
1310};
1311
1312/*
1313 * Rebind an idle @worker to its CPU. During CPU onlining, this has to
1314 * happen synchronously for idle workers. worker_thread() will test
1315 * %WORKER_REBIND before leaving idle and call this function.
1316 */
1317static void idle_worker_rebind(struct worker *worker)
1318{
1319 struct global_cwq *gcwq = worker->pool->gcwq;
1320
1321 /* CPU must be online at this point */
1322 WARN_ON(!worker_maybe_bind_and_lock(worker));
1323 if (!--worker->idle_rebind->cnt)
1324 complete(&worker->idle_rebind->done);
1325 spin_unlock_irq(&worker->pool->gcwq->lock);
1326
1327 /* we did our part, wait for rebind_workers() to finish up */
1328 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
1329}
1330
1311/* 1331/*
1312 * Function for worker->rebind_work used to rebind rogue busy workers 1332 * Function for @worker->rebind.work used to rebind unbound busy workers to
1313 * to the associated cpu which is coming back online. This is 1333 * the associated cpu which is coming back online. This is scheduled by
1314 * scheduled by cpu up but can race with other cpu hotplug operations 1334 * cpu up but can race with other cpu hotplug operations and may be
1315 * and may be executed twice without intervening cpu down. 1335 * executed twice without intervening cpu down.
1316 */ 1336 */
1317static void worker_rebind_fn(struct work_struct *work) 1337static void busy_worker_rebind_fn(struct work_struct *work)
1318{ 1338{
1319 struct worker *worker = container_of(work, struct worker, rebind_work); 1339 struct worker *worker = container_of(work, struct worker, rebind_work);
1320 struct global_cwq *gcwq = worker->gcwq; 1340 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1341
1322 if (worker_maybe_bind_and_lock(worker)) 1342 if (worker_maybe_bind_and_lock(worker))
1323 worker_clr_flags(worker, WORKER_REBIND); 1343 worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work)
1325 spin_unlock_irq(&gcwq->lock); 1345 spin_unlock_irq(&gcwq->lock);
1326} 1346}
1327 1347
1348/**
1349 * rebind_workers - rebind all workers of a gcwq to the associated CPU
1350 * @gcwq: gcwq of interest
1351 *
1352 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1353 * is different for idle and busy ones.
1354 *
1355 * The idle ones should be rebound synchronously and idle rebinding should
1356 * be complete before any worker starts executing work items with
1357 * concurrency management enabled; otherwise, scheduler may oops trying to
1358 * wake up non-local idle worker from wq_worker_sleeping().
1359 *
1360 * This is achieved by repeatedly requesting rebinding until all idle
1361 * workers are known to have been rebound under @gcwq->lock and holding all
1362 * idle workers from becoming busy until idle rebinding is complete.
1363 *
1364 * Once idle workers are rebound, busy workers can be rebound as they
1365 * finish executing their current work items. Queueing the rebind work at
1366 * the head of their scheduled lists is enough. Note that nr_running will
1367 * be properbly bumped as busy workers rebind.
1368 *
1369 * On return, all workers are guaranteed to either be bound or have rebind
1370 * work item scheduled.
1371 */
1372static void rebind_workers(struct global_cwq *gcwq)
1373 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1374{
1375 struct idle_rebind idle_rebind;
1376 struct worker_pool *pool;
1377 struct worker *worker;
1378 struct hlist_node *pos;
1379 int i;
1380
1381 lockdep_assert_held(&gcwq->lock);
1382
1383 for_each_worker_pool(pool, gcwq)
1384 lockdep_assert_held(&pool->manager_mutex);
1385
1386 /*
1387 * Rebind idle workers. Interlocked both ways. We wait for
1388 * workers to rebind via @idle_rebind.done. Workers will wait for
1389 * us to finish up by watching %WORKER_REBIND.
1390 */
1391 init_completion(&idle_rebind.done);
1392retry:
1393 idle_rebind.cnt = 1;
1394 INIT_COMPLETION(idle_rebind.done);
1395
1396 /* set REBIND and kick idle ones, we'll wait for these later */
1397 for_each_worker_pool(pool, gcwq) {
1398 list_for_each_entry(worker, &pool->idle_list, entry) {
1399 if (worker->flags & WORKER_REBIND)
1400 continue;
1401
1402 /* morph UNBOUND to REBIND */
1403 worker->flags &= ~WORKER_UNBOUND;
1404 worker->flags |= WORKER_REBIND;
1405
1406 idle_rebind.cnt++;
1407 worker->idle_rebind = &idle_rebind;
1408
1409 /* worker_thread() will call idle_worker_rebind() */
1410 wake_up_process(worker->task);
1411 }
1412 }
1413
1414 if (--idle_rebind.cnt) {
1415 spin_unlock_irq(&gcwq->lock);
1416 wait_for_completion(&idle_rebind.done);
1417 spin_lock_irq(&gcwq->lock);
1418 /* busy ones might have become idle while waiting, retry */
1419 goto retry;
1420 }
1421
1422 /*
1423 * All idle workers are rebound and waiting for %WORKER_REBIND to
1424 * be cleared inside idle_worker_rebind(). Clear and release.
1425 * Clearing %WORKER_REBIND from this foreign context is safe
1426 * because these workers are still guaranteed to be idle.
1427 */
1428 for_each_worker_pool(pool, gcwq)
1429 list_for_each_entry(worker, &pool->idle_list, entry)
1430 worker->flags &= ~WORKER_REBIND;
1431
1432 wake_up_all(&gcwq->rebind_hold);
1433
1434 /* rebind busy workers */
1435 for_each_busy_worker(worker, i, pos, gcwq) {
1436 struct work_struct *rebind_work = &worker->rebind_work;
1437
1438 /* morph UNBOUND to REBIND */
1439 worker->flags &= ~WORKER_UNBOUND;
1440 worker->flags |= WORKER_REBIND;
1441
1442 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1443 work_data_bits(rebind_work)))
1444 continue;
1445
1446 /* wq doesn't matter, use the default one */
1447 debug_work_activate(rebind_work);
1448 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1449 worker->scheduled.next,
1450 work_color_to_flags(WORK_NO_COLOR));
1451 }
1452}
1453
1328static struct worker *alloc_worker(void) 1454static struct worker *alloc_worker(void)
1329{ 1455{
1330 struct worker *worker; 1456 struct worker *worker;
@@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void)
1333 if (worker) { 1459 if (worker) {
1334 INIT_LIST_HEAD(&worker->entry); 1460 INIT_LIST_HEAD(&worker->entry);
1335 INIT_LIST_HEAD(&worker->scheduled); 1461 INIT_LIST_HEAD(&worker->scheduled);
1336 INIT_WORK(&worker->rebind_work, worker_rebind_fn); 1462 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1337 /* on creation a worker is in !idle && prep state */ 1463 /* on creation a worker is in !idle && prep state */
1338 worker->flags = WORKER_PREP; 1464 worker->flags = WORKER_PREP;
1339 } 1465 }
@@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void)
1342 1468
1343/** 1469/**
1344 * create_worker - create a new workqueue worker 1470 * create_worker - create a new workqueue worker
1345 * @gcwq: gcwq the new worker will belong to 1471 * @pool: pool the new worker will belong to
1346 * @bind: whether to set affinity to @cpu or not
1347 * 1472 *
1348 * Create a new worker which is bound to @gcwq. The returned worker 1473 * Create a new worker which is bound to @pool. The returned worker
1349 * can be started by calling start_worker() or destroyed using 1474 * can be started by calling start_worker() or destroyed using
1350 * destroy_worker(). 1475 * destroy_worker().
1351 * 1476 *
@@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void)
1355 * RETURNS: 1480 * RETURNS:
1356 * Pointer to the newly created worker. 1481 * Pointer to the newly created worker.
1357 */ 1482 */
1358static struct worker *create_worker(struct global_cwq *gcwq, bool bind) 1483static struct worker *create_worker(struct worker_pool *pool)
1359{ 1484{
1360 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; 1485 struct global_cwq *gcwq = pool->gcwq;
1486 const char *pri = worker_pool_pri(pool) ? "H" : "";
1361 struct worker *worker = NULL; 1487 struct worker *worker = NULL;
1362 int id = -1; 1488 int id = -1;
1363 1489
1364 spin_lock_irq(&gcwq->lock); 1490 spin_lock_irq(&gcwq->lock);
1365 while (ida_get_new(&gcwq->worker_ida, &id)) { 1491 while (ida_get_new(&pool->worker_ida, &id)) {
1366 spin_unlock_irq(&gcwq->lock); 1492 spin_unlock_irq(&gcwq->lock);
1367 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) 1493 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1368 goto fail; 1494 goto fail;
1369 spin_lock_irq(&gcwq->lock); 1495 spin_lock_irq(&gcwq->lock);
1370 } 1496 }
@@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1374 if (!worker) 1500 if (!worker)
1375 goto fail; 1501 goto fail;
1376 1502
1377 worker->gcwq = gcwq; 1503 worker->pool = pool;
1378 worker->id = id; 1504 worker->id = id;
1379 1505
1380 if (!on_unbound_cpu) 1506 if (gcwq->cpu != WORK_CPU_UNBOUND)
1381 worker->task = kthread_create_on_node(worker_thread, 1507 worker->task = kthread_create_on_node(worker_thread,
1382 worker, 1508 worker, cpu_to_node(gcwq->cpu),
1383 cpu_to_node(gcwq->cpu), 1509 "kworker/%u:%d%s", gcwq->cpu, id, pri);
1384 "kworker/%u:%d", gcwq->cpu, id);
1385 else 1510 else
1386 worker->task = kthread_create(worker_thread, worker, 1511 worker->task = kthread_create(worker_thread, worker,
1387 "kworker/u:%d", id); 1512 "kworker/u:%d%s", id, pri);
1388 if (IS_ERR(worker->task)) 1513 if (IS_ERR(worker->task))
1389 goto fail; 1514 goto fail;
1390 1515
1516 if (worker_pool_pri(pool))
1517 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1518
1391 /* 1519 /*
1392 * A rogue worker will become a regular one if CPU comes 1520 * Determine CPU binding of the new worker depending on
1393 * online later on. Make sure every worker has 1521 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
1394 * PF_THREAD_BOUND set. 1522 * flag remains stable across this function. See the comments
1523 * above the flag definition for details.
1524 *
1525 * As an unbound worker may later become a regular one if CPU comes
1526 * online, make sure every worker has %PF_THREAD_BOUND set.
1395 */ 1527 */
1396 if (bind && !on_unbound_cpu) 1528 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
1397 kthread_bind(worker->task, gcwq->cpu); 1529 kthread_bind(worker->task, gcwq->cpu);
1398 else { 1530 } else {
1399 worker->task->flags |= PF_THREAD_BOUND; 1531 worker->task->flags |= PF_THREAD_BOUND;
1400 if (on_unbound_cpu) 1532 worker->flags |= WORKER_UNBOUND;
1401 worker->flags |= WORKER_UNBOUND;
1402 } 1533 }
1403 1534
1404 return worker; 1535 return worker;
1405fail: 1536fail:
1406 if (id >= 0) { 1537 if (id >= 0) {
1407 spin_lock_irq(&gcwq->lock); 1538 spin_lock_irq(&gcwq->lock);
1408 ida_remove(&gcwq->worker_ida, id); 1539 ida_remove(&pool->worker_ida, id);
1409 spin_unlock_irq(&gcwq->lock); 1540 spin_unlock_irq(&gcwq->lock);
1410 } 1541 }
1411 kfree(worker); 1542 kfree(worker);
@@ -1424,7 +1555,7 @@ fail:
1424static void start_worker(struct worker *worker) 1555static void start_worker(struct worker *worker)
1425{ 1556{
1426 worker->flags |= WORKER_STARTED; 1557 worker->flags |= WORKER_STARTED;
1427 worker->gcwq->nr_workers++; 1558 worker->pool->nr_workers++;
1428 worker_enter_idle(worker); 1559 worker_enter_idle(worker);
1429 wake_up_process(worker->task); 1560 wake_up_process(worker->task);
1430} 1561}
@@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker)
1440 */ 1571 */
1441static void destroy_worker(struct worker *worker) 1572static void destroy_worker(struct worker *worker)
1442{ 1573{
1443 struct global_cwq *gcwq = worker->gcwq; 1574 struct worker_pool *pool = worker->pool;
1575 struct global_cwq *gcwq = pool->gcwq;
1444 int id = worker->id; 1576 int id = worker->id;
1445 1577
1446 /* sanity check frenzy */ 1578 /* sanity check frenzy */
@@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker)
1448 BUG_ON(!list_empty(&worker->scheduled)); 1580 BUG_ON(!list_empty(&worker->scheduled));
1449 1581
1450 if (worker->flags & WORKER_STARTED) 1582 if (worker->flags & WORKER_STARTED)
1451 gcwq->nr_workers--; 1583 pool->nr_workers--;
1452 if (worker->flags & WORKER_IDLE) 1584 if (worker->flags & WORKER_IDLE)
1453 gcwq->nr_idle--; 1585 pool->nr_idle--;
1454 1586
1455 list_del_init(&worker->entry); 1587 list_del_init(&worker->entry);
1456 worker->flags |= WORKER_DIE; 1588 worker->flags |= WORKER_DIE;
@@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker)
1461 kfree(worker); 1593 kfree(worker);
1462 1594
1463 spin_lock_irq(&gcwq->lock); 1595 spin_lock_irq(&gcwq->lock);
1464 ida_remove(&gcwq->worker_ida, id); 1596 ida_remove(&pool->worker_ida, id);
1465} 1597}
1466 1598
1467static void idle_worker_timeout(unsigned long __gcwq) 1599static void idle_worker_timeout(unsigned long __pool)
1468{ 1600{
1469 struct global_cwq *gcwq = (void *)__gcwq; 1601 struct worker_pool *pool = (void *)__pool;
1602 struct global_cwq *gcwq = pool->gcwq;
1470 1603
1471 spin_lock_irq(&gcwq->lock); 1604 spin_lock_irq(&gcwq->lock);
1472 1605
1473 if (too_many_workers(gcwq)) { 1606 if (too_many_workers(pool)) {
1474 struct worker *worker; 1607 struct worker *worker;
1475 unsigned long expires; 1608 unsigned long expires;
1476 1609
1477 /* idle_list is kept in LIFO order, check the last one */ 1610 /* idle_list is kept in LIFO order, check the last one */
1478 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1611 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1479 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1612 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1480 1613
1481 if (time_before(jiffies, expires)) 1614 if (time_before(jiffies, expires))
1482 mod_timer(&gcwq->idle_timer, expires); 1615 mod_timer(&pool->idle_timer, expires);
1483 else { 1616 else {
1484 /* it's been idle for too long, wake up manager */ 1617 /* it's been idle for too long, wake up manager */
1485 gcwq->flags |= GCWQ_MANAGE_WORKERS; 1618 pool->flags |= POOL_MANAGE_WORKERS;
1486 wake_up_worker(gcwq); 1619 wake_up_worker(pool);
1487 } 1620 }
1488 } 1621 }
1489 1622
@@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work)
1500 return false; 1633 return false;
1501 1634
1502 /* mayday mayday mayday */ 1635 /* mayday mayday mayday */
1503 cpu = cwq->gcwq->cpu; 1636 cpu = cwq->pool->gcwq->cpu;
1504 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1637 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1505 if (cpu == WORK_CPU_UNBOUND) 1638 if (cpu == WORK_CPU_UNBOUND)
1506 cpu = 0; 1639 cpu = 0;
@@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work)
1509 return true; 1642 return true;
1510} 1643}
1511 1644
1512static void gcwq_mayday_timeout(unsigned long __gcwq) 1645static void gcwq_mayday_timeout(unsigned long __pool)
1513{ 1646{
1514 struct global_cwq *gcwq = (void *)__gcwq; 1647 struct worker_pool *pool = (void *)__pool;
1648 struct global_cwq *gcwq = pool->gcwq;
1515 struct work_struct *work; 1649 struct work_struct *work;
1516 1650
1517 spin_lock_irq(&gcwq->lock); 1651 spin_lock_irq(&gcwq->lock);
1518 1652
1519 if (need_to_create_worker(gcwq)) { 1653 if (need_to_create_worker(pool)) {
1520 /* 1654 /*
1521 * We've been trying to create a new worker but 1655 * We've been trying to create a new worker but
1522 * haven't been successful. We might be hitting an 1656 * haven't been successful. We might be hitting an
1523 * allocation deadlock. Send distress signals to 1657 * allocation deadlock. Send distress signals to
1524 * rescuers. 1658 * rescuers.
1525 */ 1659 */
1526 list_for_each_entry(work, &gcwq->worklist, entry) 1660 list_for_each_entry(work, &pool->worklist, entry)
1527 send_mayday(work); 1661 send_mayday(work);
1528 } 1662 }
1529 1663
1530 spin_unlock_irq(&gcwq->lock); 1664 spin_unlock_irq(&gcwq->lock);
1531 1665
1532 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); 1666 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1533} 1667}
1534 1668
1535/** 1669/**
1536 * maybe_create_worker - create a new worker if necessary 1670 * maybe_create_worker - create a new worker if necessary
1537 * @gcwq: gcwq to create a new worker for 1671 * @pool: pool to create a new worker for
1538 * 1672 *
1539 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to 1673 * Create a new worker for @pool if necessary. @pool is guaranteed to
1540 * have at least one idle worker on return from this function. If 1674 * have at least one idle worker on return from this function. If
1541 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 1675 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1542 * sent to all rescuers with works scheduled on @gcwq to resolve 1676 * sent to all rescuers with works scheduled on @pool to resolve
1543 * possible allocation deadlock. 1677 * possible allocation deadlock.
1544 * 1678 *
1545 * On return, need_to_create_worker() is guaranteed to be false and 1679 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
1554 * false if no action was taken and gcwq->lock stayed locked, true 1688 * false if no action was taken and gcwq->lock stayed locked, true
1555 * otherwise. 1689 * otherwise.
1556 */ 1690 */
1557static bool maybe_create_worker(struct global_cwq *gcwq) 1691static bool maybe_create_worker(struct worker_pool *pool)
1558__releases(&gcwq->lock) 1692__releases(&gcwq->lock)
1559__acquires(&gcwq->lock) 1693__acquires(&gcwq->lock)
1560{ 1694{
1561 if (!need_to_create_worker(gcwq)) 1695 struct global_cwq *gcwq = pool->gcwq;
1696
1697 if (!need_to_create_worker(pool))
1562 return false; 1698 return false;
1563restart: 1699restart:
1564 spin_unlock_irq(&gcwq->lock); 1700 spin_unlock_irq(&gcwq->lock);
1565 1701
1566 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1702 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1567 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1703 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1568 1704
1569 while (true) { 1705 while (true) {
1570 struct worker *worker; 1706 struct worker *worker;
1571 1707
1572 worker = create_worker(gcwq, true); 1708 worker = create_worker(pool);
1573 if (worker) { 1709 if (worker) {
1574 del_timer_sync(&gcwq->mayday_timer); 1710 del_timer_sync(&pool->mayday_timer);
1575 spin_lock_irq(&gcwq->lock); 1711 spin_lock_irq(&gcwq->lock);
1576 start_worker(worker); 1712 start_worker(worker);
1577 BUG_ON(need_to_create_worker(gcwq)); 1713 BUG_ON(need_to_create_worker(pool));
1578 return true; 1714 return true;
1579 } 1715 }
1580 1716
1581 if (!need_to_create_worker(gcwq)) 1717 if (!need_to_create_worker(pool))
1582 break; 1718 break;
1583 1719
1584 __set_current_state(TASK_INTERRUPTIBLE); 1720 __set_current_state(TASK_INTERRUPTIBLE);
1585 schedule_timeout(CREATE_COOLDOWN); 1721 schedule_timeout(CREATE_COOLDOWN);
1586 1722
1587 if (!need_to_create_worker(gcwq)) 1723 if (!need_to_create_worker(pool))
1588 break; 1724 break;
1589 } 1725 }
1590 1726
1591 del_timer_sync(&gcwq->mayday_timer); 1727 del_timer_sync(&pool->mayday_timer);
1592 spin_lock_irq(&gcwq->lock); 1728 spin_lock_irq(&gcwq->lock);
1593 if (need_to_create_worker(gcwq)) 1729 if (need_to_create_worker(pool))
1594 goto restart; 1730 goto restart;
1595 return true; 1731 return true;
1596} 1732}
1597 1733
1598/** 1734/**
1599 * maybe_destroy_worker - destroy workers which have been idle for a while 1735 * maybe_destroy_worker - destroy workers which have been idle for a while
1600 * @gcwq: gcwq to destroy workers for 1736 * @pool: pool to destroy workers for
1601 * 1737 *
1602 * Destroy @gcwq workers which have been idle for longer than 1738 * Destroy @pool workers which have been idle for longer than
1603 * IDLE_WORKER_TIMEOUT. 1739 * IDLE_WORKER_TIMEOUT.
1604 * 1740 *
1605 * LOCKING: 1741 * LOCKING:
@@ -1610,19 +1746,19 @@ restart:
1610 * false if no action was taken and gcwq->lock stayed locked, true 1746 * false if no action was taken and gcwq->lock stayed locked, true
1611 * otherwise. 1747 * otherwise.
1612 */ 1748 */
1613static bool maybe_destroy_workers(struct global_cwq *gcwq) 1749static bool maybe_destroy_workers(struct worker_pool *pool)
1614{ 1750{
1615 bool ret = false; 1751 bool ret = false;
1616 1752
1617 while (too_many_workers(gcwq)) { 1753 while (too_many_workers(pool)) {
1618 struct worker *worker; 1754 struct worker *worker;
1619 unsigned long expires; 1755 unsigned long expires;
1620 1756
1621 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1757 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1622 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1758 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1623 1759
1624 if (time_before(jiffies, expires)) { 1760 if (time_before(jiffies, expires)) {
1625 mod_timer(&gcwq->idle_timer, expires); 1761 mod_timer(&pool->idle_timer, expires);
1626 break; 1762 break;
1627 } 1763 }
1628 1764
@@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
1655 */ 1791 */
1656static bool manage_workers(struct worker *worker) 1792static bool manage_workers(struct worker *worker)
1657{ 1793{
1658 struct global_cwq *gcwq = worker->gcwq; 1794 struct worker_pool *pool = worker->pool;
1659 bool ret = false; 1795 bool ret = false;
1660 1796
1661 if (gcwq->flags & GCWQ_MANAGING_WORKERS) 1797 if (!mutex_trylock(&pool->manager_mutex))
1662 return ret; 1798 return ret;
1663 1799
1664 gcwq->flags &= ~GCWQ_MANAGE_WORKERS; 1800 pool->flags &= ~POOL_MANAGE_WORKERS;
1665 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1666 1801
1667 /* 1802 /*
1668 * Destroy and then create so that may_start_working() is true 1803 * Destroy and then create so that may_start_working() is true
1669 * on return. 1804 * on return.
1670 */ 1805 */
1671 ret |= maybe_destroy_workers(gcwq); 1806 ret |= maybe_destroy_workers(pool);
1672 ret |= maybe_create_worker(gcwq); 1807 ret |= maybe_create_worker(pool);
1673
1674 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1675
1676 /*
1677 * The trustee might be waiting to take over the manager
1678 * position, tell it we're done.
1679 */
1680 if (unlikely(gcwq->trustee))
1681 wake_up_all(&gcwq->trustee_wait);
1682 1808
1809 mutex_unlock(&pool->manager_mutex);
1683 return ret; 1810 return ret;
1684} 1811}
1685 1812
@@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1728{ 1855{
1729 struct work_struct *work = list_first_entry(&cwq->delayed_works, 1856 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1730 struct work_struct, entry); 1857 struct work_struct, entry);
1731 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1732 1858
1733 trace_workqueue_activate_work(work); 1859 trace_workqueue_activate_work(work);
1734 move_linked_works(work, pos, NULL); 1860 move_linked_works(work, &cwq->pool->worklist, NULL);
1735 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1861 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1736 cwq->nr_active++; 1862 cwq->nr_active++;
1737} 1863}
@@ -1804,7 +1930,8 @@ __releases(&gcwq->lock)
1804__acquires(&gcwq->lock) 1930__acquires(&gcwq->lock)
1805{ 1931{
1806 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1932 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1807 struct global_cwq *gcwq = cwq->gcwq; 1933 struct worker_pool *pool = worker->pool;
1934 struct global_cwq *gcwq = pool->gcwq;
1808 struct hlist_head *bwh = busy_worker_head(gcwq, work); 1935 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1809 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; 1936 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1810 work_func_t f = work->func; 1937 work_func_t f = work->func;
@@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock)
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 1950 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1824#endif 1951#endif
1825 /* 1952 /*
1953 * Ensure we're on the correct CPU. DISASSOCIATED test is
1954 * necessary to avoid spurious warnings from rescuers servicing the
1955 * unbound or a disassociated gcwq.
1956 */
1957 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
1958 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
1959 raw_smp_processor_id() != gcwq->cpu);
1960
1961 /*
1826 * A single work shouldn't be executed concurrently by 1962 * A single work shouldn't be executed concurrently by
1827 * multiple workers on a single cpu. Check whether anyone is 1963 * multiple workers on a single cpu. Check whether anyone is
1828 * already processing the work. If so, defer the work to the 1964 * already processing the work. If so, defer the work to the
@@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock)
1846 list_del_init(&work->entry); 1982 list_del_init(&work->entry);
1847 1983
1848 /* 1984 /*
1849 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1850 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1851 */
1852 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1853 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1854 struct work_struct, entry);
1855
1856 if (!list_empty(&gcwq->worklist) &&
1857 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1858 wake_up_worker(gcwq);
1859 else
1860 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1861 }
1862
1863 /*
1864 * CPU intensive works don't participate in concurrency 1985 * CPU intensive works don't participate in concurrency
1865 * management. They're the scheduler's responsibility. 1986 * management. They're the scheduler's responsibility.
1866 */ 1987 */
1867 if (unlikely(cpu_intensive)) 1988 if (unlikely(cpu_intensive))
1868 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1989 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1869 1990
1991 /*
1992 * Unbound gcwq isn't concurrency managed and work items should be
1993 * executed ASAP. Wake up another worker if necessary.
1994 */
1995 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
1996 wake_up_worker(pool);
1997
1870 spin_unlock_irq(&gcwq->lock); 1998 spin_unlock_irq(&gcwq->lock);
1871 1999
1872 work_clear_pending(work); 2000 work_clear_pending(work);
@@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker)
1939static int worker_thread(void *__worker) 2067static int worker_thread(void *__worker)
1940{ 2068{
1941 struct worker *worker = __worker; 2069 struct worker *worker = __worker;
1942 struct global_cwq *gcwq = worker->gcwq; 2070 struct worker_pool *pool = worker->pool;
2071 struct global_cwq *gcwq = pool->gcwq;
1943 2072
1944 /* tell the scheduler that this is a workqueue worker */ 2073 /* tell the scheduler that this is a workqueue worker */
1945 worker->task->flags |= PF_WQ_WORKER; 2074 worker->task->flags |= PF_WQ_WORKER;
1946woke_up: 2075woke_up:
1947 spin_lock_irq(&gcwq->lock); 2076 spin_lock_irq(&gcwq->lock);
1948 2077
1949 /* DIE can be set only while we're idle, checking here is enough */ 2078 /*
1950 if (worker->flags & WORKER_DIE) { 2079 * DIE can be set only while idle and REBIND set while busy has
2080 * @worker->rebind_work scheduled. Checking here is enough.
2081 */
2082 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
1951 spin_unlock_irq(&gcwq->lock); 2083 spin_unlock_irq(&gcwq->lock);
1952 worker->task->flags &= ~PF_WQ_WORKER; 2084
1953 return 0; 2085 if (worker->flags & WORKER_DIE) {
2086 worker->task->flags &= ~PF_WQ_WORKER;
2087 return 0;
2088 }
2089
2090 idle_worker_rebind(worker);
2091 goto woke_up;
1954 } 2092 }
1955 2093
1956 worker_leave_idle(worker); 2094 worker_leave_idle(worker);
1957recheck: 2095recheck:
1958 /* no more worker necessary? */ 2096 /* no more worker necessary? */
1959 if (!need_more_worker(gcwq)) 2097 if (!need_more_worker(pool))
1960 goto sleep; 2098 goto sleep;
1961 2099
1962 /* do we need to manage? */ 2100 /* do we need to manage? */
1963 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) 2101 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
1964 goto recheck; 2102 goto recheck;
1965 2103
1966 /* 2104 /*
@@ -1979,7 +2117,7 @@ recheck:
1979 2117
1980 do { 2118 do {
1981 struct work_struct *work = 2119 struct work_struct *work =
1982 list_first_entry(&gcwq->worklist, 2120 list_first_entry(&pool->worklist,
1983 struct work_struct, entry); 2121 struct work_struct, entry);
1984 2122
1985 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 2123 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2129,11 @@ recheck:
1991 move_linked_works(work, &worker->scheduled, NULL); 2129 move_linked_works(work, &worker->scheduled, NULL);
1992 process_scheduled_works(worker); 2130 process_scheduled_works(worker);
1993 } 2131 }
1994 } while (keep_working(gcwq)); 2132 } while (keep_working(pool));
1995 2133
1996 worker_set_flags(worker, WORKER_PREP, false); 2134 worker_set_flags(worker, WORKER_PREP, false);
1997sleep: 2135sleep:
1998 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) 2136 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
1999 goto recheck; 2137 goto recheck;
2000 2138
2001 /* 2139 /*
@@ -2053,14 +2191,15 @@ repeat:
2053 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2191 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2054 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2192 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2055 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2193 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2056 struct global_cwq *gcwq = cwq->gcwq; 2194 struct worker_pool *pool = cwq->pool;
2195 struct global_cwq *gcwq = pool->gcwq;
2057 struct work_struct *work, *n; 2196 struct work_struct *work, *n;
2058 2197
2059 __set_current_state(TASK_RUNNING); 2198 __set_current_state(TASK_RUNNING);
2060 mayday_clear_cpu(cpu, wq->mayday_mask); 2199 mayday_clear_cpu(cpu, wq->mayday_mask);
2061 2200
2062 /* migrate to the target cpu if possible */ 2201 /* migrate to the target cpu if possible */
2063 rescuer->gcwq = gcwq; 2202 rescuer->pool = pool;
2064 worker_maybe_bind_and_lock(rescuer); 2203 worker_maybe_bind_and_lock(rescuer);
2065 2204
2066 /* 2205 /*
@@ -2068,7 +2207,7 @@ repeat:
2068 * process'em. 2207 * process'em.
2069 */ 2208 */
2070 BUG_ON(!list_empty(&rescuer->scheduled)); 2209 BUG_ON(!list_empty(&rescuer->scheduled));
2071 list_for_each_entry_safe(work, n, &gcwq->worklist, entry) 2210 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2072 if (get_work_cwq(work) == cwq) 2211 if (get_work_cwq(work) == cwq)
2073 move_linked_works(work, scheduled, &n); 2212 move_linked_works(work, scheduled, &n);
2074 2213
@@ -2079,8 +2218,8 @@ repeat:
2079 * regular worker; otherwise, we end up with 0 concurrency 2218 * regular worker; otherwise, we end up with 0 concurrency
2080 * and stalling the execution. 2219 * and stalling the execution.
2081 */ 2220 */
2082 if (keep_working(gcwq)) 2221 if (keep_working(pool))
2083 wake_up_worker(gcwq); 2222 wake_up_worker(pool);
2084 2223
2085 spin_unlock_irq(&gcwq->lock); 2224 spin_unlock_irq(&gcwq->lock);
2086 } 2225 }
@@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2205 2344
2206 for_each_cwq_cpu(cpu, wq) { 2345 for_each_cwq_cpu(cpu, wq) {
2207 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2346 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2208 struct global_cwq *gcwq = cwq->gcwq; 2347 struct global_cwq *gcwq = cwq->pool->gcwq;
2209 2348
2210 spin_lock_irq(&gcwq->lock); 2349 spin_lock_irq(&gcwq->lock);
2211 2350
@@ -2421,9 +2560,9 @@ reflush:
2421 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2560 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2422 bool drained; 2561 bool drained;
2423 2562
2424 spin_lock_irq(&cwq->gcwq->lock); 2563 spin_lock_irq(&cwq->pool->gcwq->lock);
2425 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2564 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2426 spin_unlock_irq(&cwq->gcwq->lock); 2565 spin_unlock_irq(&cwq->pool->gcwq->lock);
2427 2566
2428 if (drained) 2567 if (drained)
2429 continue; 2568 continue;
@@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2463 */ 2602 */
2464 smp_rmb(); 2603 smp_rmb();
2465 cwq = get_work_cwq(work); 2604 cwq = get_work_cwq(work);
2466 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2605 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2467 goto already_gone; 2606 goto already_gone;
2468 } else if (wait_executing) { 2607 } else if (wait_executing) {
2469 worker = find_worker_executing_work(gcwq, work); 2608 worker = find_worker_executing_work(gcwq, work);
@@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2984 if (flags & WQ_MEM_RECLAIM) 3123 if (flags & WQ_MEM_RECLAIM)
2985 flags |= WQ_RESCUER; 3124 flags |= WQ_RESCUER;
2986 3125
2987 /*
2988 * Unbound workqueues aren't concurrency managed and should be
2989 * dispatched to workers immediately.
2990 */
2991 if (flags & WQ_UNBOUND)
2992 flags |= WQ_HIGHPRI;
2993
2994 max_active = max_active ?: WQ_DFL_ACTIVE; 3126 max_active = max_active ?: WQ_DFL_ACTIVE;
2995 max_active = wq_clamp_max_active(max_active, flags, wq->name); 3127 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2996 3128
@@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3011 for_each_cwq_cpu(cpu, wq) { 3143 for_each_cwq_cpu(cpu, wq) {
3012 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3144 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3013 struct global_cwq *gcwq = get_gcwq(cpu); 3145 struct global_cwq *gcwq = get_gcwq(cpu);
3146 int pool_idx = (bool)(flags & WQ_HIGHPRI);
3014 3147
3015 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3148 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3016 cwq->gcwq = gcwq; 3149 cwq->pool = &gcwq->pools[pool_idx];
3017 cwq->wq = wq; 3150 cwq->wq = wq;
3018 cwq->flush_color = -1; 3151 cwq->flush_color = -1;
3019 cwq->max_active = max_active; 3152 cwq->max_active = max_active;
@@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy);
3225 * gcwqs serve mix of short, long and very long running works making 3358 * gcwqs serve mix of short, long and very long running works making
3226 * blocked draining impractical. 3359 * blocked draining impractical.
3227 * 3360 *
3228 * This is solved by allowing a gcwq to be detached from CPU, running 3361 * This is solved by allowing a gcwq to be disassociated from the CPU
3229 * it with unbound (rogue) workers and allowing it to be reattached 3362 * running as an unbound one and allowing it to be reattached later if the
3230 * later if the cpu comes back online. A separate thread is created 3363 * cpu comes back online.
3231 * to govern a gcwq in such state and is called the trustee of the
3232 * gcwq.
3233 *
3234 * Trustee states and their descriptions.
3235 *
3236 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3237 * new trustee is started with this state.
3238 *
3239 * IN_CHARGE Once started, trustee will enter this state after
3240 * assuming the manager role and making all existing
3241 * workers rogue. DOWN_PREPARE waits for trustee to
3242 * enter this state. After reaching IN_CHARGE, trustee
3243 * tries to execute the pending worklist until it's empty
3244 * and the state is set to BUTCHER, or the state is set
3245 * to RELEASE.
3246 *
3247 * BUTCHER Command state which is set by the cpu callback after
3248 * the cpu has went down. Once this state is set trustee
3249 * knows that there will be no new works on the worklist
3250 * and once the worklist is empty it can proceed to
3251 * killing idle workers.
3252 *
3253 * RELEASE Command state which is set by the cpu callback if the
3254 * cpu down has been canceled or it has come online
3255 * again. After recognizing this state, trustee stops
3256 * trying to drain or butcher and clears ROGUE, rebinds
3257 * all remaining workers back to the cpu and releases
3258 * manager role.
3259 *
3260 * DONE Trustee will enter this state after BUTCHER or RELEASE
3261 * is complete.
3262 *
3263 * trustee CPU draining
3264 * took over down complete
3265 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3266 * | | ^
3267 * | CPU is back online v return workers |
3268 * ----------------> RELEASE --------------
3269 */ 3364 */
3270 3365
3271/** 3366/* claim manager positions of all pools */
3272 * trustee_wait_event_timeout - timed event wait for trustee 3367static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
3273 * @cond: condition to wait for
3274 * @timeout: timeout in jiffies
3275 *
3276 * wait_event_timeout() for trustee to use. Handles locking and
3277 * checks for RELEASE request.
3278 *
3279 * CONTEXT:
3280 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3281 * multiple times. To be used by trustee.
3282 *
3283 * RETURNS:
3284 * Positive indicating left time if @cond is satisfied, 0 if timed
3285 * out, -1 if canceled.
3286 */
3287#define trustee_wait_event_timeout(cond, timeout) ({ \
3288 long __ret = (timeout); \
3289 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3290 __ret) { \
3291 spin_unlock_irq(&gcwq->lock); \
3292 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3293 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3294 __ret); \
3295 spin_lock_irq(&gcwq->lock); \
3296 } \
3297 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3298})
3299
3300/**
3301 * trustee_wait_event - event wait for trustee
3302 * @cond: condition to wait for
3303 *
3304 * wait_event() for trustee to use. Automatically handles locking and
3305 * checks for CANCEL request.
3306 *
3307 * CONTEXT:
3308 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3309 * multiple times. To be used by trustee.
3310 *
3311 * RETURNS:
3312 * 0 if @cond is satisfied, -1 if canceled.
3313 */
3314#define trustee_wait_event(cond) ({ \
3315 long __ret1; \
3316 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3317 __ret1 < 0 ? -1 : 0; \
3318})
3319
3320static int __cpuinit trustee_thread(void *__gcwq)
3321{ 3368{
3322 struct global_cwq *gcwq = __gcwq; 3369 struct worker_pool *pool;
3323 struct worker *worker;
3324 struct work_struct *work;
3325 struct hlist_node *pos;
3326 long rc;
3327 int i;
3328
3329 BUG_ON(gcwq->cpu != smp_processor_id());
3330 3370
3371 for_each_worker_pool(pool, gcwq)
3372 mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
3331 spin_lock_irq(&gcwq->lock); 3373 spin_lock_irq(&gcwq->lock);
3332 /* 3374}
3333 * Claim the manager position and make all workers rogue.
3334 * Trustee must be bound to the target cpu and can't be
3335 * cancelled.
3336 */
3337 BUG_ON(gcwq->cpu != smp_processor_id());
3338 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3339 BUG_ON(rc < 0);
3340
3341 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3342
3343 list_for_each_entry(worker, &gcwq->idle_list, entry)
3344 worker->flags |= WORKER_ROGUE;
3345 3375
3346 for_each_busy_worker(worker, i, pos, gcwq) 3376/* release manager positions */
3347 worker->flags |= WORKER_ROGUE; 3377static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
3378{
3379 struct worker_pool *pool;
3348 3380
3349 /*
3350 * Call schedule() so that we cross rq->lock and thus can
3351 * guarantee sched callbacks see the rogue flag. This is
3352 * necessary as scheduler callbacks may be invoked from other
3353 * cpus.
3354 */
3355 spin_unlock_irq(&gcwq->lock); 3381 spin_unlock_irq(&gcwq->lock);
3356 schedule(); 3382 for_each_worker_pool(pool, gcwq)
3357 spin_lock_irq(&gcwq->lock); 3383 mutex_unlock(&pool->manager_mutex);
3384}
3358 3385
3359 /* 3386static void gcwq_unbind_fn(struct work_struct *work)
3360 * Sched callbacks are disabled now. Zap nr_running. After 3387{
3361 * this, nr_running stays zero and need_more_worker() and 3388 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3362 * keep_working() are always true as long as the worklist is 3389 struct worker_pool *pool;
3363 * not empty. 3390 struct worker *worker;
3364 */ 3391 struct hlist_node *pos;
3365 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); 3392 int i;
3366 3393
3367 spin_unlock_irq(&gcwq->lock); 3394 BUG_ON(gcwq->cpu != smp_processor_id());
3368 del_timer_sync(&gcwq->idle_timer);
3369 spin_lock_irq(&gcwq->lock);
3370 3395
3371 /* 3396 gcwq_claim_management_and_lock(gcwq);
3372 * We're now in charge. Notify and proceed to drain. We need
3373 * to keep the gcwq running during the whole CPU down
3374 * procedure as other cpu hotunplug callbacks may need to
3375 * flush currently running tasks.
3376 */
3377 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3378 wake_up_all(&gcwq->trustee_wait);
3379 3397
3380 /* 3398 /*
3381 * The original cpu is in the process of dying and may go away 3399 * We've claimed all manager positions. Make all workers unbound
3382 * anytime now. When that happens, we and all workers would 3400 * and set DISASSOCIATED. Before this, all workers except for the
3383 * be migrated to other cpus. Try draining any left work. We 3401 * ones which are still executing works from before the last CPU
3384 * want to get it over with ASAP - spam rescuers, wake up as 3402 * down must be on the cpu. After this, they may become diasporas.
3385 * many idlers as necessary and create new ones till the
3386 * worklist is empty. Note that if the gcwq is frozen, there
3387 * may be frozen works in freezable cwqs. Don't declare
3388 * completion while frozen.
3389 */ 3403 */
3390 while (gcwq->nr_workers != gcwq->nr_idle || 3404 for_each_worker_pool(pool, gcwq)
3391 gcwq->flags & GCWQ_FREEZING || 3405 list_for_each_entry(worker, &pool->idle_list, entry)
3392 gcwq->trustee_state == TRUSTEE_IN_CHARGE) { 3406 worker->flags |= WORKER_UNBOUND;
3393 int nr_works = 0;
3394
3395 list_for_each_entry(work, &gcwq->worklist, entry) {
3396 send_mayday(work);
3397 nr_works++;
3398 }
3399 3407
3400 list_for_each_entry(worker, &gcwq->idle_list, entry) { 3408 for_each_busy_worker(worker, i, pos, gcwq)
3401 if (!nr_works--) 3409 worker->flags |= WORKER_UNBOUND;
3402 break;
3403 wake_up_process(worker->task);
3404 }
3405 3410
3406 if (need_to_create_worker(gcwq)) { 3411 gcwq->flags |= GCWQ_DISASSOCIATED;
3407 spin_unlock_irq(&gcwq->lock);
3408 worker = create_worker(gcwq, false);
3409 spin_lock_irq(&gcwq->lock);
3410 if (worker) {
3411 worker->flags |= WORKER_ROGUE;
3412 start_worker(worker);
3413 }
3414 }
3415 3412
3416 /* give a breather */ 3413 gcwq_release_management_and_unlock(gcwq);
3417 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3418 break;
3419 }
3420 3414
3421 /* 3415 /*
3422 * Either all works have been scheduled and cpu is down, or 3416 * Call schedule() so that we cross rq->lock and thus can guarantee
3423 * cpu down has already been canceled. Wait for and butcher 3417 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
3424 * all workers till we're canceled. 3418 * as scheduler callbacks may be invoked from other cpus.
3425 */ 3419 */
3426 do { 3420 schedule();
3427 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3428 while (!list_empty(&gcwq->idle_list))
3429 destroy_worker(list_first_entry(&gcwq->idle_list,
3430 struct worker, entry));
3431 } while (gcwq->nr_workers && rc >= 0);
3432 3421
3433 /* 3422 /*
3434 * At this point, either draining has completed and no worker 3423 * Sched callbacks are disabled now. Zap nr_running. After this,
3435 * is left, or cpu down has been canceled or the cpu is being 3424 * nr_running stays zero and need_more_worker() and keep_working()
3436 * brought back up. There shouldn't be any idle one left. 3425 * are always true as long as the worklist is not empty. @gcwq now
3437 * Tell the remaining busy ones to rebind once it finishes the 3426 * behaves as unbound (in terms of concurrency management) gcwq
3438 * currently scheduled works by scheduling the rebind_work. 3427 * which is served by workers tied to the CPU.
3428 *
3429 * On return from this function, the current worker would trigger
3430 * unbound chain execution of pending work items if other workers
3431 * didn't already.
3439 */ 3432 */
3440 WARN_ON(!list_empty(&gcwq->idle_list)); 3433 for_each_worker_pool(pool, gcwq)
3441 3434 atomic_set(get_pool_nr_running(pool), 0);
3442 for_each_busy_worker(worker, i, pos, gcwq) {
3443 struct work_struct *rebind_work = &worker->rebind_work;
3444
3445 /*
3446 * Rebind_work may race with future cpu hotplug
3447 * operations. Use a separate flag to mark that
3448 * rebinding is scheduled.
3449 */
3450 worker->flags |= WORKER_REBIND;
3451 worker->flags &= ~WORKER_ROGUE;
3452
3453 /* queue rebind_work, wq doesn't matter, use the default one */
3454 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3455 work_data_bits(rebind_work)))
3456 continue;
3457
3458 debug_work_activate(rebind_work);
3459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3460 worker->scheduled.next,
3461 work_color_to_flags(WORK_NO_COLOR));
3462 }
3463
3464 /* relinquish manager role */
3465 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3466
3467 /* notify completion */
3468 gcwq->trustee = NULL;
3469 gcwq->trustee_state = TRUSTEE_DONE;
3470 wake_up_all(&gcwq->trustee_wait);
3471 spin_unlock_irq(&gcwq->lock);
3472 return 0;
3473} 3435}
3474 3436
3475/** 3437/*
3476 * wait_trustee_state - wait for trustee to enter the specified state 3438 * Workqueues should be brought up before normal priority CPU notifiers.
3477 * @gcwq: gcwq the trustee of interest belongs to 3439 * This will be registered high priority CPU notifier.
3478 * @state: target state to wait for
3479 *
3480 * Wait for the trustee to reach @state. DONE is already matched.
3481 *
3482 * CONTEXT:
3483 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3484 * multiple times. To be used by cpu_callback.
3485 */ 3440 */
3486static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) 3441static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3487__releases(&gcwq->lock) 3442 unsigned long action,
3488__acquires(&gcwq->lock) 3443 void *hcpu)
3489{
3490 if (!(gcwq->trustee_state == state ||
3491 gcwq->trustee_state == TRUSTEE_DONE)) {
3492 spin_unlock_irq(&gcwq->lock);
3493 __wait_event(gcwq->trustee_wait,
3494 gcwq->trustee_state == state ||
3495 gcwq->trustee_state == TRUSTEE_DONE);
3496 spin_lock_irq(&gcwq->lock);
3497 }
3498}
3499
3500static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3501 unsigned long action,
3502 void *hcpu)
3503{ 3444{
3504 unsigned int cpu = (unsigned long)hcpu; 3445 unsigned int cpu = (unsigned long)hcpu;
3505 struct global_cwq *gcwq = get_gcwq(cpu); 3446 struct global_cwq *gcwq = get_gcwq(cpu);
3506 struct task_struct *new_trustee = NULL; 3447 struct worker_pool *pool;
3507 struct worker *uninitialized_var(new_worker);
3508 unsigned long flags;
3509
3510 action &= ~CPU_TASKS_FROZEN;
3511 3448
3512 switch (action) { 3449 switch (action & ~CPU_TASKS_FROZEN) {
3513 case CPU_DOWN_PREPARE:
3514 new_trustee = kthread_create(trustee_thread, gcwq,
3515 "workqueue_trustee/%d\n", cpu);
3516 if (IS_ERR(new_trustee))
3517 return notifier_from_errno(PTR_ERR(new_trustee));
3518 kthread_bind(new_trustee, cpu);
3519 /* fall through */
3520 case CPU_UP_PREPARE: 3450 case CPU_UP_PREPARE:
3521 BUG_ON(gcwq->first_idle); 3451 for_each_worker_pool(pool, gcwq) {
3522 new_worker = create_worker(gcwq, false); 3452 struct worker *worker;
3523 if (!new_worker) {
3524 if (new_trustee)
3525 kthread_stop(new_trustee);
3526 return NOTIFY_BAD;
3527 }
3528 }
3529
3530 /* some are called w/ irq disabled, don't disturb irq status */
3531 spin_lock_irqsave(&gcwq->lock, flags);
3532 3453
3533 switch (action) { 3454 if (pool->nr_workers)
3534 case CPU_DOWN_PREPARE: 3455 continue;
3535 /* initialize trustee and tell it to acquire the gcwq */
3536 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3537 gcwq->trustee = new_trustee;
3538 gcwq->trustee_state = TRUSTEE_START;
3539 wake_up_process(gcwq->trustee);
3540 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3541 /* fall through */
3542 case CPU_UP_PREPARE:
3543 BUG_ON(gcwq->first_idle);
3544 gcwq->first_idle = new_worker;
3545 break;
3546 3456
3547 case CPU_DYING: 3457 worker = create_worker(pool);
3548 /* 3458 if (!worker)
3549 * Before this, the trustee and all workers except for 3459 return NOTIFY_BAD;
3550 * the ones which are still executing works from
3551 * before the last CPU down must be on the cpu. After
3552 * this, they'll all be diasporas.
3553 */
3554 gcwq->flags |= GCWQ_DISASSOCIATED;
3555 break;
3556 3460
3557 case CPU_POST_DEAD: 3461 spin_lock_irq(&gcwq->lock);
3558 gcwq->trustee_state = TRUSTEE_BUTCHER; 3462 start_worker(worker);
3559 /* fall through */ 3463 spin_unlock_irq(&gcwq->lock);
3560 case CPU_UP_CANCELED: 3464 }
3561 destroy_worker(gcwq->first_idle);
3562 gcwq->first_idle = NULL;
3563 break; 3465 break;
3564 3466
3565 case CPU_DOWN_FAILED: 3467 case CPU_DOWN_FAILED:
3566 case CPU_ONLINE: 3468 case CPU_ONLINE:
3469 gcwq_claim_management_and_lock(gcwq);
3567 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3470 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3568 if (gcwq->trustee_state != TRUSTEE_DONE) { 3471 rebind_workers(gcwq);
3569 gcwq->trustee_state = TRUSTEE_RELEASE; 3472 gcwq_release_management_and_unlock(gcwq);
3570 wake_up_process(gcwq->trustee);
3571 wait_trustee_state(gcwq, TRUSTEE_DONE);
3572 }
3573
3574 /*
3575 * Trustee is done and there might be no worker left.
3576 * Put the first_idle in and request a real manager to
3577 * take a look.
3578 */
3579 spin_unlock_irq(&gcwq->lock);
3580 kthread_bind(gcwq->first_idle->task, cpu);
3581 spin_lock_irq(&gcwq->lock);
3582 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3583 start_worker(gcwq->first_idle);
3584 gcwq->first_idle = NULL;
3585 break; 3473 break;
3586 } 3474 }
3475 return NOTIFY_OK;
3476}
3587 3477
3588 spin_unlock_irqrestore(&gcwq->lock, flags); 3478/*
3479 * Workqueues should be brought down after normal priority CPU notifiers.
3480 * This will be registered as low priority CPU notifier.
3481 */
3482static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3483 unsigned long action,
3484 void *hcpu)
3485{
3486 unsigned int cpu = (unsigned long)hcpu;
3487 struct work_struct unbind_work;
3589 3488
3590 return notifier_from_errno(0); 3489 switch (action & ~CPU_TASKS_FROZEN) {
3490 case CPU_DOWN_PREPARE:
3491 /* unbinding should happen on the local CPU */
3492 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
3493 schedule_work_on(cpu, &unbind_work);
3494 flush_work(&unbind_work);
3495 break;
3496 }
3497 return NOTIFY_OK;
3591} 3498}
3592 3499
3593#ifdef CONFIG_SMP 3500#ifdef CONFIG_SMP
@@ -3746,6 +3653,7 @@ void thaw_workqueues(void)
3746 3653
3747 for_each_gcwq_cpu(cpu) { 3654 for_each_gcwq_cpu(cpu) {
3748 struct global_cwq *gcwq = get_gcwq(cpu); 3655 struct global_cwq *gcwq = get_gcwq(cpu);
3656 struct worker_pool *pool;
3749 struct workqueue_struct *wq; 3657 struct workqueue_struct *wq;
3750 3658
3751 spin_lock_irq(&gcwq->lock); 3659 spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3675,8 @@ void thaw_workqueues(void)
3767 cwq_activate_first_delayed(cwq); 3675 cwq_activate_first_delayed(cwq);
3768 } 3676 }
3769 3677
3770 wake_up_worker(gcwq); 3678 for_each_worker_pool(pool, gcwq)
3679 wake_up_worker(pool);
3771 3680
3772 spin_unlock_irq(&gcwq->lock); 3681 spin_unlock_irq(&gcwq->lock);
3773 } 3682 }
@@ -3783,46 +3692,57 @@ static int __init init_workqueues(void)
3783 unsigned int cpu; 3692 unsigned int cpu;
3784 int i; 3693 int i;
3785 3694
3786 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); 3695 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3696 cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3787 3697
3788 /* initialize gcwqs */ 3698 /* initialize gcwqs */
3789 for_each_gcwq_cpu(cpu) { 3699 for_each_gcwq_cpu(cpu) {
3790 struct global_cwq *gcwq = get_gcwq(cpu); 3700 struct global_cwq *gcwq = get_gcwq(cpu);
3701 struct worker_pool *pool;
3791 3702
3792 spin_lock_init(&gcwq->lock); 3703 spin_lock_init(&gcwq->lock);
3793 INIT_LIST_HEAD(&gcwq->worklist);
3794 gcwq->cpu = cpu; 3704 gcwq->cpu = cpu;
3795 gcwq->flags |= GCWQ_DISASSOCIATED; 3705 gcwq->flags |= GCWQ_DISASSOCIATED;
3796 3706
3797 INIT_LIST_HEAD(&gcwq->idle_list);
3798 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) 3707 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3799 INIT_HLIST_HEAD(&gcwq->busy_hash[i]); 3708 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3800 3709
3801 init_timer_deferrable(&gcwq->idle_timer); 3710 for_each_worker_pool(pool, gcwq) {
3802 gcwq->idle_timer.function = idle_worker_timeout; 3711 pool->gcwq = gcwq;
3803 gcwq->idle_timer.data = (unsigned long)gcwq; 3712 INIT_LIST_HEAD(&pool->worklist);
3713 INIT_LIST_HEAD(&pool->idle_list);
3714
3715 init_timer_deferrable(&pool->idle_timer);
3716 pool->idle_timer.function = idle_worker_timeout;
3717 pool->idle_timer.data = (unsigned long)pool;
3804 3718
3805 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, 3719 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
3806 (unsigned long)gcwq); 3720 (unsigned long)pool);
3807 3721
3808 ida_init(&gcwq->worker_ida); 3722 mutex_init(&pool->manager_mutex);
3723 ida_init(&pool->worker_ida);
3724 }
3809 3725
3810 gcwq->trustee_state = TRUSTEE_DONE; 3726 init_waitqueue_head(&gcwq->rebind_hold);
3811 init_waitqueue_head(&gcwq->trustee_wait);
3812 } 3727 }
3813 3728
3814 /* create the initial worker */ 3729 /* create the initial worker */
3815 for_each_online_gcwq_cpu(cpu) { 3730 for_each_online_gcwq_cpu(cpu) {
3816 struct global_cwq *gcwq = get_gcwq(cpu); 3731 struct global_cwq *gcwq = get_gcwq(cpu);
3817 struct worker *worker; 3732 struct worker_pool *pool;
3818 3733
3819 if (cpu != WORK_CPU_UNBOUND) 3734 if (cpu != WORK_CPU_UNBOUND)
3820 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3735 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3821 worker = create_worker(gcwq, true); 3736
3822 BUG_ON(!worker); 3737 for_each_worker_pool(pool, gcwq) {
3823 spin_lock_irq(&gcwq->lock); 3738 struct worker *worker;
3824 start_worker(worker); 3739
3825 spin_unlock_irq(&gcwq->lock); 3740 worker = create_worker(pool);
3741 BUG_ON(!worker);
3742 spin_lock_irq(&gcwq->lock);
3743 start_worker(worker);
3744 spin_unlock_irq(&gcwq->lock);
3745 }
3826 } 3746 }
3827 3747
3828 system_wq = alloc_workqueue("events", 0, 0); 3748 system_wq = alloc_workqueue("events", 0, 0);