path: root/kernel
author     Thomas Gleixner <tglx@linutronix.de>  2012-10-09 15:20:05 -0400
committer  Thomas Gleixner <tglx@linutronix.de>  2012-10-09 15:20:05 -0400
commit     db8c246937713e60b7628661ccc187eeb81f2bae (patch)
tree       6351e8bca23eef40fce85396d1c6f6cfffbd4b66 /kernel
parent     c5f66e99b7cb091e3d51ae8e8156892e8feb7fa3 (diff)
parent     28f2b02bc581ffc835bc1691b18d03f62fcf0395 (diff)
Merge branch 'fortglx/3.7/time' of git://git.linaro.org/people/jstultz/linux into timers/core
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/async.c | 76
-rw-r--r--  kernel/audit.c | 51
-rw-r--r--  kernel/audit_tree.c | 29
-rw-r--r--  kernel/audit_watch.c | 25
-rw-r--r--  kernel/cgroup.c | 53
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/cpuset.c | 130
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 11
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 106
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 1
-rw-r--r--  kernel/events/callchain.c | 9
-rw-r--r--  kernel/events/core.c | 79
-rw-r--r--  kernel/events/internal.h | 3
-rw-r--r--  kernel/events/uprobes.c | 622
-rw-r--r--  kernel/exit.c | 8
-rw-r--r--  kernel/fork.c | 52
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/irq/handle.c | 7
-rw-r--r--  kernel/irq/irqdomain.c | 370
-rw-r--r--  kernel/irq/manage.c | 38
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 37
-rw-r--r--  kernel/kthread.c | 88
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/hibernate.c | 50
-rw-r--r--  kernel/power/main.c | 45
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/power/suspend.c | 3
-rw-r--r--  kernel/power/swap.c | 82
-rw-r--r--  kernel/power/user.c | 2
-rw-r--r--  kernel/power/wakelock.c | 7
-rw-r--r--  kernel/printk.c | 276
-rw-r--r--  kernel/rcupdate.c | 44
-rw-r--r--  kernel/rcutiny.c | 4
-rw-r--r--  kernel/rcutiny_plugin.h | 56
-rw-r--r--  kernel/rcutorture.c | 72
-rw-r--r--  kernel/rcutree.c | 478
-rw-r--r--  kernel/rcutree.h | 46
-rw-r--r--  kernel/rcutree_plugin.h | 223
-rw-r--r--  kernel/rcutree_trace.c | 148
-rw-r--r--  kernel/resource.c | 37
-rw-r--r--  kernel/sched/core.c | 133
-rw-r--r--  kernel/sched/cpupri.c | 10
-rw-r--r--  kernel/sched/fair.c | 153
-rw-r--r--  kernel/sched/rt.c | 13
-rw-r--r--  kernel/sched/sched.h | 31
-rw-r--r--  kernel/sched/stop_task.c | 22
-rw-r--r--  kernel/signal.c | 15
-rw-r--r--  kernel/smp.c | 20
-rw-r--r--  kernel/smpboot.h | 2
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/sys.c | 57
-rw-r--r--  kernel/sysctl.c | 69
-rw-r--r--  kernel/sysctl_binary.c | 2
-rw-r--r--  kernel/task_work.c | 95
-rw-r--r--  kernel/taskstats.c | 5
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/alarmtimer.c | 118
-rw-r--r--  kernel/time/jiffies.c | 32
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 552
-rw-r--r--  kernel/timer.c | 9
-rw-r--r--  kernel/trace/ftrace.c | 8
-rw-r--r--  kernel/trace/ring_buffer.c | 4
-rw-r--r--  kernel/trace/trace.c | 40
-rw-r--r--  kernel/trace/trace.h | 8
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_functions.c | 36
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 8
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/workqueue.c | 1208
78 files changed, 3421 insertions, 2670 deletions
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bbe..9d3118384858 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1;
 #define MAX_WORK	32768
 
 static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static ASYNC_DOMAIN(async_running);
+static LIST_HEAD(async_domains);
 static DEFINE_SPINLOCK(async_lock);
+static DEFINE_MUTEX(async_register_mutex);
 
 struct async_entry {
 	struct list_head list;
@@ -71,7 +73,7 @@ struct async_entry {
 	async_cookie_t cookie;
 	async_func_ptr *func;
 	void *data;
-	struct list_head *running;
+	struct async_domain *running;
 };
 
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
@@ -82,13 +84,12 @@ static atomic_t entry_count;
 /*
  * MUST be called with the lock held!
  */
-static async_cookie_t __lowest_in_progress(struct list_head *running)
+static async_cookie_t __lowest_in_progress(struct async_domain *running)
 {
 	struct async_entry *entry;
 
-	if (!list_empty(running)) {
-		entry = list_first_entry(running,
-			struct async_entry, list);
+	if (!list_empty(&running->domain)) {
+		entry = list_first_entry(&running->domain, typeof(*entry), list);
 		return entry->cookie;
 	}
 
@@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running)
 	return next_cookie;	/* "infinity" value */
 }
 
-static async_cookie_t lowest_in_progress(struct list_head *running)
+static async_cookie_t lowest_in_progress(struct async_domain *running)
 {
 	unsigned long flags;
 	async_cookie_t ret;
@@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
 		container_of(work, struct async_entry, work);
 	unsigned long flags;
 	ktime_t uninitialized_var(calltime), delta, rettime;
+	struct async_domain *running = entry->running;
 
 	/* 1) move self to the running queue */
 	spin_lock_irqsave(&async_lock, flags);
-	list_move_tail(&entry->list, entry->running);
+	list_move_tail(&entry->list, &running->domain);
 	spin_unlock_irqrestore(&async_lock, flags);
 
 	/* 2) run (and print duration) */
@@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work)
 	/* 3) remove self from the running queue */
 	spin_lock_irqsave(&async_lock, flags);
 	list_del(&entry->list);
+	if (running->registered && --running->count == 0)
+		list_del_init(&running->node);
 
 	/* 4) free the entry */
 	kfree(entry);
@@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
 {
 	struct async_entry *entry;
 	unsigned long flags;
@@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	spin_lock_irqsave(&async_lock, flags);
 	newcookie = entry->cookie = next_cookie++;
 	list_add_tail(&entry->list, &async_pending);
+	if (running->registered && running->count++ == 0)
+		list_add_tail(&running->node, &async_domains);
 	atomic_inc(&entry_count);
 	spin_unlock_irqrestore(&async_lock, flags);
 
@@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * Note: This function may be called from atomic or non-atomic contexts.
  */
 async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-				     struct list_head *running)
+				     struct async_domain *running)
 {
 	return __async_schedule(ptr, data, running);
 }
@@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
  */
 void async_synchronize_full(void)
 {
+	mutex_lock(&async_register_mutex);
 	do {
-		async_synchronize_cookie(next_cookie);
-	} while (!list_empty(&async_running) || !list_empty(&async_pending));
+		struct async_domain *domain = NULL;
+
+		spin_lock_irq(&async_lock);
+		if (!list_empty(&async_domains))
+			domain = list_first_entry(&async_domains, typeof(*domain), node);
+		spin_unlock_irq(&async_lock);
+
+		async_synchronize_cookie_domain(next_cookie, domain);
+	} while (!list_empty(&async_domains));
+	mutex_unlock(&async_register_mutex);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
 /**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+	mutex_lock(&async_register_mutex);
+	spin_lock_irq(&async_lock);
+	WARN_ON(!domain->registered || !list_empty(&domain->node) ||
+		!list_empty(&domain->domain));
+	domain->registered = 0;
+	spin_unlock_irq(&async_lock);
+	mutex_unlock(&async_register_mutex);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: running list to synchronize on
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by the running list @domain have been done.
  */
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
 {
-	async_synchronize_cookie_domain(next_cookie, list);
+	async_synchronize_cookie_domain(next_cookie, domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
@@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
  * @running: running list to synchronize on
 *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
+ * synchronization domain specified by running list @running submitted
 * prior to @cookie have been done.
 */
-void async_synchronize_cookie_domain(async_cookie_t cookie,
-				     struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
 {
 	ktime_t uninitialized_var(starttime), delta, endtime;
 
+	if (!running)
+		return;
+
 	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
 		starttime = ktime_get();
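The async.c hunks above replace the bare running lists with a reference-counted struct async_domain that async_synchronize_full() discovers through the new async_domains list. A minimal sketch of the resulting API, using a hypothetical driver (the domain declaration, schedule, flush and unregister calls are the ones exported by this diff; the driver itself is invented for illustration):

#include <linux/async.h>
#include <linux/init.h>
#include <linux/module.h>

/* Hypothetical driver-private domain; ASYNC_DOMAIN_EXCLUSIVE() would also
 * keep it out of async_synchronize_full(), which the new kerneldoc above
 * recommends over unregistering late. */
static ASYNC_DOMAIN(mydrv_domain);

static void mydrv_probe_one(void *data, async_cookie_t cookie)
{
	/* slow, independent per-device init would run here */
}

static int __init mydrv_init(void)
{
	int i;

	for (i = 0; i < 4; i++)
		async_schedule_domain(mydrv_probe_one, NULL, &mydrv_domain);

	/* wait only for work queued on this domain, not on every domain */
	async_synchronize_full_domain(&mydrv_domain);
	return 0;
}
module_init(mydrv_init);

static void __exit mydrv_exit(void)
{
	/* stop async_synchronize_full() from considering this domain */
	async_unregister_domain(&mydrv_domain);
}
module_exit(mydrv_exit);

MODULE_LICENSE("GPL");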
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c61416b..ea3b7b6191c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
 static void audit_printk_skb(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh = nlmsg_hdr(skb);
-	char *data = NLMSG_DATA(nlh);
+	char *data = nlmsg_data(nlh);
 
 	if (nlh->nlmsg_type != AUDIT_EOE) {
 		if (printk_ratelimit())
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
 	if (!skb)
 		return NULL;
 
-	nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
-	data = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, t, size, flags);
+	if (!nlh)
+		goto out_kfree_skb;
+	data = nlmsg_data(nlh);
 	memcpy(data, payload, size);
 	return skb;
 
-nlmsg_failure:			/* Used by NLMSG_NEW */
-	if (skb)
-		kfree_skb(skb);
+out_kfree_skb:
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &sid);
 	seq = nlh->nlmsg_seq;
-	data = NLMSG_DATA(nlh);
+	data = nlmsg_data(nlh);
 
 	switch (msg_type) {
 	case AUDIT_GET:
@@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb)
 static int __init audit_init(void)
 {
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.input	= audit_receive,
+	};
 
 	if (audit_initialized == AUDIT_DISABLED)
 		return 0;
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
-					   audit_receive, NULL, THIS_MODULE);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
+					   THIS_MODULE, &cfg);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
@@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 
 	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
 	if (!ab->skb)
-		goto nlmsg_failure;
+		goto err;
 
-	nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+	nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+	if (!nlh)
+		goto out_kfree_skb;
 
 	return ab;
 
-nlmsg_failure:			/* Used by NLMSG_NEW */
+out_kfree_skb:
 	kfree_skb(ab->skb);
 	ab->skb = NULL;
 err:
@@ -1450,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key)
 }
 
 /**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(current->audit_context, GFP_KERNEL,
+			     AUDIT_ANOM_LINK);
+	audit_log_format(ab, "op=%s action=denied", operation);
+	audit_log_format(ab, " pid=%d comm=", current->pid);
+	audit_log_untrustedstring(ab, current->comm);
+	audit_log_d_path(ab, " path=", link);
+	audit_log_format(ab, " dev=");
+	audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
+	audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
+	audit_log_end(ab);
+}
+
+/**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
  *
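The audit.c conversion above drops the NLMSG_NEW()/nlmsg_failure goto pattern in favour of checking nlmsg_put() directly. For reference, the same pattern in isolation looks roughly like this (a hypothetical message builder, not audit code; the function name and message type are made up):

#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *build_reply(u32 portid, u32 seq, int type,
				   const void *payload, size_t size)
{
	struct sk_buff *skb;
	struct nlmsghdr *nlh;

	skb = nlmsg_new(size, GFP_KERNEL);
	if (!skb)
		return NULL;

	nlh = nlmsg_put(skb, portid, seq, type, size, 0);
	if (!nlh) {			/* not enough tailroom */
		kfree_skb(skb);
		return NULL;
	}
	memcpy(nlmsg_data(nlh), payload, size);
	return skb;
}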
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e7..ed206fd88cca 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p)
 		spin_unlock(&hash_lock);
 		spin_unlock(&entry->lock);
 		fsnotify_destroy_mark(entry);
-		fsnotify_put_mark(entry);
 		goto out;
 	}
 
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p)
 
 	fsnotify_duplicate_mark(&new->mark, entry);
 	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
-		free_chunk(new);
+		fsnotify_put_mark(&new->mark);
 		goto Fallback;
 	}
 
@@ -293,7 +292,7 @@ static void untag_chunk(struct node *p)
 	spin_unlock(&hash_lock);
 	spin_unlock(&entry->lock);
 	fsnotify_destroy_mark(entry);
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(&new->mark);	/* drop initial reference */
 	goto out;
 
 Fallback:
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 
 	entry = &chunk->mark;
 	if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
-		free_chunk(chunk);
+		fsnotify_put_mark(entry);
 		return -ENOSPC;
 	}
 
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
 	spin_unlock(&entry->lock);
+	fsnotify_put_mark(entry);	/* drop initial reference */
 	return 0;
 }
 
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	fsnotify_duplicate_mark(chunk_entry, old_entry);
 	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
 		spin_unlock(&old_entry->lock);
-		free_chunk(chunk);
+		fsnotify_put_mark(chunk_entry);
 		fsnotify_put_mark(old_entry);
 		return -ENOSPC;
 	}
@@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	spin_unlock(&chunk_entry->lock);
 	spin_unlock(&old_entry->lock);
 	fsnotify_destroy_mark(old_entry);
+	fsnotify_put_mark(chunk_entry);	/* drop initial reference */
 	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
-	fsnotify_put_mark(old_entry); /* and kill it */
 	return 0;
 }
 
@@ -595,7 +595,7 @@ void audit_trim_trees(void)
 
 		root_mnt = collect_mounts(&path);
 		path_put(&path);
-		if (!root_mnt)
+		if (IS_ERR(root_mnt))
 			goto skip_it;
 
 		spin_lock(&hash_lock);
@@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
 		goto Err;
 	mnt = collect_mounts(&path);
 	path_put(&path);
-	if (!mnt) {
-		err = -ENOMEM;
+	if (IS_ERR(mnt)) {
+		err = PTR_ERR(mnt);
 		goto Err;
 	}
 
@@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
 		return err;
 	tagged = collect_mounts(&path2);
 	path_put(&path2);
-	if (!tagged)
-		return -ENOMEM;
+	if (IS_ERR(tagged))
+		return PTR_ERR(tagged);
 
 	err = kern_path(old, 0, &path1);
 	if (err) {
@@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
 
 	evict_chunk(chunk);
-	fsnotify_put_mark(entry);
+
+	/*
+	 * We are guaranteed to have at least one reference to the mark from
+	 * either the inode or the caller of fsnotify_destroy_mark().
+	 */
+	BUG_ON(atomic_read(&entry->refcnt) < 1);
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d9..3823281401b5 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 /* Get path information necessary for adding watches. */
 static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 {
-	struct nameidata nd;
-	struct dentry *d;
-	int err;
-
-	err = kern_path_parent(watch->path, &nd);
-	if (err)
-		return err;
-
-	if (nd.last_type != LAST_NORM) {
-		path_put(&nd.path);
-		return -EINVAL;
-	}
-
-	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
-	if (IS_ERR(d)) {
-		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-		path_put(&nd.path);
+	struct dentry *d = kern_path_locked(watch->path, parent);
+	if (IS_ERR(d))
 		return PTR_ERR(d);
-	}
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
 	if (d->d_inode) {
 		/* update watch filter fields */
 		watch->dev = d->d_inode->i_sb->s_dev;
 		watch->ino = d->d_inode->i_ino;
 	}
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-
-	*parent = nd.path;
 	dput(d);
 	return 0;
 }
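The audit_get_nd() rewrite leans on the then-new kern_path_locked() helper, which resolves the parent, takes the parent directory's i_mutex, and hands back the (possibly negative) dentry of the last component. A hedged sketch of that calling convention outside audit (a hypothetical helper; only kern_path_locked() itself and the lock/refcount rules visible in the hunk are taken from the kernel):

#include <linux/namei.h>
#include <linux/path.h>
#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/kernel.h>

static int peek_last_component(const char *pathname)
{
	struct path parent;
	struct dentry *d;

	d = kern_path_locked(pathname, &parent);
	if (IS_ERR(d))
		return PTR_ERR(d);	/* nothing locked, nothing to put */

	/* parent.dentry->d_inode->i_mutex is held here; d may be negative */
	if (d->d_inode)
		pr_info("ino=%lu\n", d->d_inode->i_ino);

	mutex_unlock(&parent.dentry->d_inode->i_mutex);
	dput(d);
	path_put(&parent);	/* audit keeps the parent; a one-shot caller drops it */
	return 0;
}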
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b303dfc7dce0..79818507e444 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -954,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 
 		dget(d);
 		d_delete(d);
-		simple_unlink(d->d_inode, d);
+		simple_unlink(cgrp->dentry->d_inode, d);
 		list_del_init(&cfe->node);
 		dput(d);
 
@@ -1068,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!dummytop->subsys[i]);
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-			mutex_lock(&ss->hierarchy_mutex);
 			cgrp->subsys[i] = dummytop->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
-			mutex_unlock(&ss->hierarchy_mutex);
 			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_bits) {
 			/* We're removing this subsystem */
 			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
-			mutex_lock(&ss->hierarchy_mutex);
 			if (ss->bind)
 				ss->bind(dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
 			list_move(&ss->sibling, &rootnode.subsys_list);
-			mutex_unlock(&ss->hierarchy_mutex);
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
 		} else if (bit & final_bits) {
@@ -1587,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	opts.new_root = new_root;
 
 	/* Locate an existing or new sb for this hierarchy */
-	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
+	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_drop_root(opts.new_root);
@@ -2570,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
-static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -3915,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
-{
-	/* We need to take each hierarchy_mutex in a consistent order */
-	int i;
-
-	/*
-	 * No worry about a race with rebind_subsystems that might mess up the
-	 * locking order, since both parties are under cgroup_mutex.
-	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-		if (ss == NULL)
-			continue;
-		if (ss->root == root)
-			mutex_lock(&ss->hierarchy_mutex);
-	}
-}
-
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
-{
-	int i;
-
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-		if (ss == NULL)
-			continue;
-		if (ss->root == root)
-			mutex_unlock(&ss->hierarchy_mutex);
-	}
-}
-
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
@@ -4006,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			ss->post_clone(cgrp);
 	}
 
-	cgroup_lock_hierarchy(root);
 	list_add(&cgrp->sibling, &cgrp->parent->children);
-	cgroup_unlock_hierarchy(root);
 	root->number_of_cgroups++;
 
 	err = cgroup_create_dir(cgrp, dentry, mode);
@@ -4035,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 err_remove:
 
-	cgroup_lock_hierarchy(root);
 	list_del(&cgrp->sibling);
-	cgroup_unlock_hierarchy(root);
 	root->number_of_cgroups--;
 
 err_destroy:
@@ -4245,10 +4206,8 @@ again:
 	list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 
-	cgroup_lock_hierarchy(cgrp->root);
 	/* delete this cgroup from parent->children */
 	list_del_init(&cgrp->sibling);
-	cgroup_unlock_hierarchy(cgrp->root);
 
 	list_del_init(&cgrp->allcg_node);
 
@@ -4322,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	 * need to invoke fork callbacks here. */
 	BUG_ON(!list_empty(&init_task.tasks));
 
-	mutex_init(&ss->hierarchy_mutex);
-	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
 
 	/* this function shouldn't be used with modular subsystems, since they
@@ -4450,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	}
 	write_unlock(&css_set_lock);
 
-	mutex_init(&ss->hierarchy_mutex);
-	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
 
 	/* success! */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
 
 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
 		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL);
+		build_all_zonelists(NULL, NULL);
 		mutex_unlock(&zonelists_mutex);
 	}
 #endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+	CPUSET_CPU_OFFLINE,
+	CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+	struct cpuset *cp;
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct cgroup *cont;
+
+	if (list_empty(queue))
+		return NULL;
+
+	cp = list_first_entry(queue, struct cpuset, stack_list);
+	list_del(queue->next);
+	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+		child = cgroup_cs(cont);
+		list_add_tail(&child->stack_list, queue);
+	}
+
+	return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held. We take callback_mutex to modify
  * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * before dropping down to the next. It always processes a node before
  * any of its children.
  *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
 */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
-	struct cpuset *child;	/* scans child cpusets of cp */
-	struct cgroup *cont;
 	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-	while (!list_empty(&queue)) {
-		cp = list_first_entry(&queue, struct cpuset, stack_list);
-		list_del(queue.next);
-		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-			child = cgroup_cs(cont);
-			list_add_tail(&child->stack_list, &queue);
+	switch (event) {
+	case CPUSET_CPU_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
+
+			/* Continue past cpusets with all cpus online */
+			if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+				continue;
+
+			/* Remove offline cpus from this cpuset. */
+			mutex_lock(&callback_mutex);
+			cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+				    cpu_active_mask);
+			mutex_unlock(&callback_mutex);
+
+			/* Move tasks from the empty cpuset to a parent */
+			if (cpumask_empty(cp->cpus_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_cpumask(cp, NULL);
 		}
+		break;
 
-		/* Continue past cpusets with all cpus, mems online */
-		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-			continue;
+	case CPUSET_MEM_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
 
-		oldmems = cp->mems_allowed;
+			/* Continue past cpusets with all mems online */
+			if (nodes_subset(cp->mems_allowed,
+					node_states[N_HIGH_MEMORY]))
+				continue;
 
-		/* Remove offline cpus and mems from this cpuset. */
-		mutex_lock(&callback_mutex);
-		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-			    cpu_active_mask);
-		nodes_and(cp->mems_allowed, cp->mems_allowed,
-			  node_states[N_HIGH_MEMORY]);
-		mutex_unlock(&callback_mutex);
+			oldmems = cp->mems_allowed;
+
+			/* Remove offline mems from this cpuset. */
+			mutex_lock(&callback_mutex);
+			nodes_and(cp->mems_allowed, cp->mems_allowed,
+				  node_states[N_HIGH_MEMORY]);
+			mutex_unlock(&callback_mutex);
 
-		/* Move tasks from the empty cpuset to a parent */
-		if (cpumask_empty(cp->cpus_allowed) ||
-		    nodes_empty(cp->mems_allowed))
-			remove_tasks_in_empty_cpuset(cp);
-		else {
-			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, &oldmems, NULL);
+			/* Move tasks from the empty cpuset to a parent */
+			if (nodes_empty(cp->mems_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
 }
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
  * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_active_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus(). Needs to call cgroup_lock()
 * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
 */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	mutex_unlock(&callback_mutex);
-	scan_for_empty_cpusets(&top_cpuset);
+
+	if (!cpu_online)
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
 
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	case MEM_OFFLINE:
 		/*
 		 * needn't update top_cpuset.mems_allowed explicitly because
-		 * scan_for_empty_cpusets() will update it.
+		 * scan_cpusets_upon_hotplug() will update it.
 		 */
-		scan_for_empty_cpusets(&top_cpuset);
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
 		break;
 	default:
 		break;
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff17..be7b33b73d30 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
 #include <linux/kdb.h>
 #include <linux/kdebug.h>
 #include <linux/export.h>
+#include <linux/hardirq.h>
 #include "kdb_private.h"
 #include "../debug_core.h"
 
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks)
 	if (atomic_read(&kgdb_setting_breakpoint))
 		reason = KDB_REASON_KEYBOARD;
 
+	if (in_nmi())
+		reason = KDB_REASON_NMI;
+
 	for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
 		if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
 			reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..0a69d2adc4f3 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit:
 	/* check for having reached the LINES number of printed lines */
 	if (kdb_nextline == linecount) {
 		char buf1[16] = "";
-#if defined(CONFIG_SMP)
-		char buf2[32];
-#endif
 
 		/* Watch out for recursion here. Any routine that calls
 		 * kdb_printf will come back through here. And kdb_read
@@ -732,14 +729,6 @@ kdb_printit:
 		if (moreprompt == NULL)
 			moreprompt = "more> ";
 
-#if defined(CONFIG_SMP)
-		if (strchr(moreprompt, '%')) {
-			sprintf(buf2, moreprompt, get_cpu());
-			put_cpu();
-			moreprompt = buf2;
-		}
-#endif
-
 		kdb_input_flush();
 		c = console_drivers;
 
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..31df1706b9a9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
@@ -138,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
 static char *__env[] = {
 #if defined(CONFIG_SMP)
  "PROMPT=[%d]kdb> ",
- "MOREPROMPT=[%d]more> ",
 #else
  "PROMPT=kdb> ",
- "MOREPROMPT=more> ",
 #endif
+ "MOREPROMPT=more> ",
  "RADIX=16",
  "MDCOUNT=8",			/* lines of md output */
  KDB_PLATFORM_ENV,
@@ -1235,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 		*cmdbuf = '\0';
 		*(cmd_hist[cmd_head]) = '\0';
 
-		if (KDB_FLAG(ONLY_DO_DUMP)) {
-			/* kdb is off but a catastrophic error requires a dump.
-			 * Take the dump and reboot.
-			 * Turn on logging so the kdb output appears in the log
-			 * buffer in the dump.
-			 */
-			const char *setargs[] = { "set", "LOGGING", "1" };
-			kdb_set(2, setargs);
-			kdb_reboot(0, NULL);
-			/*NOTREACHED*/
-		}
-
 do_full_getstr:
 #if defined(CONFIG_SMP)
 		snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
@@ -2040,8 +2028,15 @@ static int kdb_env(int argc, const char **argv)
 */
 static int kdb_dmesg(int argc, const char **argv)
 {
-	char *syslog_data[4], *start, *end, c = '\0', *p;
-	int diag, logging, logsize, lines = 0, adjust = 0, n;
+	int diag;
+	int logging;
+	int lines = 0;
+	int adjust = 0;
+	int n = 0;
+	int skip = 0;
+	struct kmsg_dumper dumper = { .active = 1 };
+	size_t len;
+	char buf[201];
 
 	if (argc > 2)
 		return KDB_ARGCOUNT;
@@ -2064,22 +2059,10 @@ static int kdb_dmesg(int argc, const char **argv)
 		kdb_set(2, setargs);
 	}
 
-	/* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
-	 * logical start, end+1. */
-	kdb_syslog_data(syslog_data);
-	if (syslog_data[2] == syslog_data[3])
-		return 0;
-	logsize = syslog_data[1] - syslog_data[0];
-	start = syslog_data[2];
-	end = syslog_data[3];
-#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
-	for (n = 0, p = start; p < end; ++p) {
-		c = *KDB_WRAP(p);
-		if (c == '\n')
-			++n;
-	}
-	if (c != '\n')
-		++n;
+	kmsg_dump_rewind_nolock(&dumper);
+	while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+		n++;
+
 	if (lines < 0) {
 		if (adjust >= n)
 			kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2070,11 @@ static int kdb_dmesg(int argc, const char **argv)
 		else if (adjust - lines >= n)
 			kdb_printf("buffer only contains %d lines, last %d "
 				   "lines printed\n", n, n - adjust);
-		if (adjust) {
-			for (; start < end && adjust; ++start) {
-				if (*KDB_WRAP(start) == '\n')
-					--adjust;
-			}
-			if (start < end)
-				++start;
-		}
-		for (p = start; p < end && lines; ++p) {
-			if (*KDB_WRAP(p) == '\n')
-				++lines;
-		}
-		end = p;
+		skip = adjust;
+		lines = abs(lines);
 	} else if (lines > 0) {
-		int skip = n - (adjust + lines);
+		skip = n - lines - adjust;
+		lines = abs(lines);
 		if (adjust >= n) {
 			kdb_printf("buffer only contains %d lines, "
 				   "nothing printed\n", n);
@@ -2112,35 +2085,24 @@ static int kdb_dmesg(int argc, const char **argv)
 			kdb_printf("buffer only contains %d lines, first "
 				   "%d lines printed\n", n, lines);
 		}
-		for (; start < end && skip; ++start) {
-			if (*KDB_WRAP(start) == '\n')
-				--skip;
-		}
-		for (p = start; p < end && lines; ++p) {
-			if (*KDB_WRAP(p) == '\n')
-				--lines;
-		}
-		end = p;
+	} else {
+		lines = n;
 	}
-	/* Do a line at a time (max 200 chars) to reduce protocol overhead */
-	c = '\n';
-	while (start != end) {
-		char buf[201];
-		p = buf;
-		if (KDB_FLAG(CMD_INTERRUPT))
-			return 0;
-		while (start < end && (c = *KDB_WRAP(start)) &&
-		       (p - buf) < sizeof(buf)-1) {
-			++start;
-			*p++ = c;
-			if (c == '\n')
-				break;
+
+	if (skip >= n || skip < 0)
+		return 0;
+
+	kmsg_dump_rewind_nolock(&dumper);
+	while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+		if (skip) {
+			skip--;
+			continue;
 		}
-		*p = '\0';
-		kdb_printf("%s", buf);
+		if (!lines--)
+			break;
+
+		kdb_printf("%.*s\n", (int)len - 1, buf);
 	}
-	if (c != '\n')
-		kdb_printf("\n");
 
 	return 0;
 }
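kdb_dmesg() above now walks the log through the kmsg_dumper line API instead of indexing the raw syslog buffer. Outside kdb (which must use the _nolock variants because it cannot take the log buffer lock while the machine is stopped), the same count-then-print idiom might look like this sketch (a hypothetical debugging helper; kmsg_dump_rewind()/kmsg_dump_get_line() are the locked counterparts of the calls used above):

#include <linux/kmsg_dump.h>
#include <linux/kernel.h>

static void print_last_log_line(void)
{
	struct kmsg_dumper dumper = { .active = true };
	char buf[201];
	size_t len;
	int n = 0;

	/* pass 1: count buffered lines without copying them */
	kmsg_dump_rewind(&dumper);
	while (kmsg_dump_get_line(&dumper, true, NULL, 0, NULL))
		n++;

	/* pass 2: skip n - 1 lines, print the final one */
	kmsg_dump_rewind(&dumper);
	while (kmsg_dump_get_line(&dumper, true, buf, sizeof(buf), &len)) {
		if (--n > 0)
			continue;
		/* each record ends in '\n'; trim it the way kdb does above */
		pr_info("last log line: %.*s\n", (int)len - 1, buf);
		break;
	}
}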
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
 extern int kdb_grep_leading;
 extern int kdb_grep_trailing;
 extern char *kdb_cmds[];
-extern void kdb_syslog_data(char *syslog_data[]);
 extern unsigned long kdb_task_state_string(const char *);
 extern char kdb_task_state_char (const struct task_struct *);
 extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f399..98d4597f43d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,7 +153,8 @@ put_callchain_entry(int rctx)
 	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
 }
 
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
 	int rctx;
 	struct perf_callchain_entry *entry;
@@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	}
 
 	if (regs) {
+		/*
+		 * Disallow cross-task user callchains.
+		 */
+		if (event->ctx->task && event->ctx->task != current)
+			goto exit_put;
+
 		perf_callchain_store(entry, PERF_CONTEXT_USER);
 		perf_callchain_user(entry, regs);
 	}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6ec972..b7935fcec7d9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1645 lockdep_assert_held(&ctx->mutex); 1645 lockdep_assert_held(&ctx->mutex);
1646 1646
1647 event->ctx = ctx; 1647 event->ctx = ctx;
1648 if (event->cpu != -1)
1649 event->cpu = cpu;
1648 1650
1649 if (!task) { 1651 if (!task) {
1650 /* 1652 /*
@@ -4037,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4037 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4039 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4038 int size = 1; 4040 int size = 1;
4039 4041
4040 data->callchain = perf_callchain(regs); 4042 data->callchain = perf_callchain(event, regs);
4041 4043
4042 if (data->callchain) 4044 if (data->callchain)
4043 size += data->callchain->nr; 4045 size += data->callchain->nr;
@@ -5207,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event,
5207} 5209}
5208 5210
5209void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 5211void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5210 struct pt_regs *regs, struct hlist_head *head, int rctx) 5212 struct pt_regs *regs, struct hlist_head *head, int rctx,
5213 struct task_struct *task)
5211{ 5214{
5212 struct perf_sample_data data; 5215 struct perf_sample_data data;
5213 struct perf_event *event; 5216 struct perf_event *event;
@@ -5226,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5226 perf_swevent_event(event, count, &data, regs); 5229 perf_swevent_event(event, count, &data, regs);
5227 } 5230 }
5228 5231
5232 /*
5233 * If we got specified a target task, also iterate its context and
5234 * deliver this event there too.
5235 */
5236 if (task && task != current) {
5237 struct perf_event_context *ctx;
5238 struct trace_entry *entry = record;
5239
5240 rcu_read_lock();
5241 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5242 if (!ctx)
5243 goto unlock;
5244
5245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5246 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5247 continue;
5248 if (event->attr.config != entry->type)
5249 continue;
5250 if (perf_tp_event_match(event, &data, regs))
5251 perf_swevent_event(event, count, &data, regs);
5252 }
5253unlock:
5254 rcu_read_unlock();
5255 }
5256
5229 perf_swevent_put_recursion_context(rctx); 5257 perf_swevent_put_recursion_context(rctx);
5230} 5258}
5231EXPORT_SYMBOL_GPL(perf_tp_event); 5259EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -6252,6 +6280,8 @@ SYSCALL_DEFINE5(perf_event_open,
6252 } 6280 }
6253 } 6281 }
6254 6282
6283 get_online_cpus();
6284
6255 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 6285 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6256 NULL, NULL); 6286 NULL, NULL);
6257 if (IS_ERR(event)) { 6287 if (IS_ERR(event)) {
@@ -6304,7 +6334,7 @@ SYSCALL_DEFINE5(perf_event_open,
6304 /* 6334 /*
6305 * Get the target context (task or percpu): 6335 * Get the target context (task or percpu):
6306 */ 6336 */
6307 ctx = find_get_context(pmu, task, cpu); 6337 ctx = find_get_context(pmu, task, event->cpu);
6308 if (IS_ERR(ctx)) { 6338 if (IS_ERR(ctx)) {
6309 err = PTR_ERR(ctx); 6339 err = PTR_ERR(ctx);
6310 goto err_alloc; 6340 goto err_alloc;
@@ -6377,20 +6407,23 @@ SYSCALL_DEFINE5(perf_event_open,
6377 mutex_lock(&ctx->mutex); 6407 mutex_lock(&ctx->mutex);
6378 6408
6379 if (move_group) { 6409 if (move_group) {
6380 perf_install_in_context(ctx, group_leader, cpu); 6410 synchronize_rcu();
6411 perf_install_in_context(ctx, group_leader, event->cpu);
6381 get_ctx(ctx); 6412 get_ctx(ctx);
6382 list_for_each_entry(sibling, &group_leader->sibling_list, 6413 list_for_each_entry(sibling, &group_leader->sibling_list,
6383 group_entry) { 6414 group_entry) {
6384 perf_install_in_context(ctx, sibling, cpu); 6415 perf_install_in_context(ctx, sibling, event->cpu);
6385 get_ctx(ctx); 6416 get_ctx(ctx);
6386 } 6417 }
6387 } 6418 }
6388 6419
6389 perf_install_in_context(ctx, event, cpu); 6420 perf_install_in_context(ctx, event, event->cpu);
6390 ++ctx->generation; 6421 ++ctx->generation;
6391 perf_unpin_context(ctx); 6422 perf_unpin_context(ctx);
6392 mutex_unlock(&ctx->mutex); 6423 mutex_unlock(&ctx->mutex);
6393 6424
6425 put_online_cpus();
6426
6394 event->owner = current; 6427 event->owner = current;
6395 6428
6396 mutex_lock(&current->perf_event_mutex); 6429 mutex_lock(&current->perf_event_mutex);
@@ -6419,6 +6452,7 @@ err_context:
6419err_alloc: 6452err_alloc:
6420 free_event(event); 6453 free_event(event);
6421err_task: 6454err_task:
6455 put_online_cpus();
6422 if (task) 6456 if (task)
6423 put_task_struct(task); 6457 put_task_struct(task);
6424err_group_fd: 6458err_group_fd:
@@ -6479,6 +6513,39 @@ err:
6479} 6513}
6480EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6514EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6481 6515
6516void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
6517{
6518 struct perf_event_context *src_ctx;
6519 struct perf_event_context *dst_ctx;
6520 struct perf_event *event, *tmp;
6521 LIST_HEAD(events);
6522
6523 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
6524 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
6525
6526 mutex_lock(&src_ctx->mutex);
6527 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
6528 event_entry) {
6529 perf_remove_from_context(event);
6530 put_ctx(src_ctx);
6531 list_add(&event->event_entry, &events);
6532 }
6533 mutex_unlock(&src_ctx->mutex);
6534
6535 synchronize_rcu();
6536
6537 mutex_lock(&dst_ctx->mutex);
6538 list_for_each_entry_safe(event, tmp, &events, event_entry) {
6539 list_del(&event->event_entry);
6540 if (event->state >= PERF_EVENT_STATE_OFF)
6541 event->state = PERF_EVENT_STATE_INACTIVE;
6542 perf_install_in_context(dst_ctx, event, dst_cpu);
6543 get_ctx(dst_ctx);
6544 }
6545 mutex_unlock(&dst_ctx->mutex);
6546}
6547EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
6548
6482static void sync_child_event(struct perf_event *child_event, 6549static void sync_child_event(struct perf_event *child_event,
6483 struct task_struct *child) 6550 struct task_struct *child)
6484{ 6551{
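
The new perf_pmu_migrate_context() above detaches every event from the source CPU's context and re-installs it on the destination CPU. A minimal sketch of how an uncore-style PMU driver might invoke it from a CPU-hotplug notifier is shown below; the foo_* names and the notifier itself are hypothetical and only illustrate the calling pattern, they are not part of this series.

static struct pmu foo_uncore_pmu;	/* hypothetical uncore PMU, registered elsewhere */

static int foo_uncore_cpu_notifier(struct notifier_block *nb,
				   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	unsigned int target;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* hand the dying CPU's events to any other online CPU */
		target = cpumask_any_but(cpu_online_mask, cpu);
		if (target < nr_cpu_ids)
			perf_pmu_migrate_context(&foo_uncore_pmu, cpu, target);
		break;
	}
	return NOTIFY_OK;
}
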
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..a096c19f2c2a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle,
101} 101}
102 102
103/* Callchain handling */ 103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); 104extern struct perf_callchain_entry *
105perf_callchain(struct perf_event *event, struct pt_regs *regs);
105extern int get_callchain_buffers(void); 106extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void); 107extern void put_callchain_buffers(void);
107 108
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 985be4d80fe8..c08a22d02f72 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -32,19 +32,36 @@
32#include <linux/swap.h> /* try_to_free_swap */ 32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */
35 36
36#include <linux/uprobes.h> 37#include <linux/uprobes.h>
37 38
38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) 39#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 40#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
40 41
41static struct srcu_struct uprobes_srcu;
42static struct rb_root uprobes_tree = RB_ROOT; 42static struct rb_root uprobes_tree = RB_ROOT;
43 43
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ 44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45 45
46#define UPROBES_HASH_SZ 13 46#define UPROBES_HASH_SZ 13
47 47
48/*
49 * We need separate register/unregister and mmap/munmap lock hashes because
50 * of mmap_sem nesting.
51 *
52 * uprobe_register() needs to install probes on (potentially) all processes
 53 * and thus needs to acquire multiple mmap_sems (consecutively, not
54 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
55 * for the particular process doing the mmap.
56 *
57 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
58 * because of lock order against i_mmap_mutex. This means there's a hole in
59 * the register vma iteration where a mmap() can happen.
60 *
61 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
62 * install a probe where one is already installed.
63 */
64
48/* serialize (un)register */ 65/* serialize (un)register */
49static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; 66static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
50 67
@@ -61,17 +78,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
61 */ 78 */
62static atomic_t uprobe_events = ATOMIC_INIT(0); 79static atomic_t uprobe_events = ATOMIC_INIT(0);
63 80
64/*
65 * Maintain a temporary per vma info that can be used to search if a vma
66 * has already been handled. This structure is introduced since extending
 67 * vm_area_struct wasn't recommended.
68 */
69struct vma_info {
70 struct list_head probe_list;
71 struct mm_struct *mm;
72 loff_t vaddr;
73};
74
75struct uprobe { 81struct uprobe {
76 struct rb_node rb_node; /* node in the rb tree */ 82 struct rb_node rb_node; /* node in the rb tree */
77 atomic_t ref; 83 atomic_t ref;
@@ -100,20 +106,21 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
100 if (!is_register) 106 if (!is_register)
101 return true; 107 return true;
102 108
103 if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) 109 if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
110 == (VM_READ|VM_EXEC))
104 return true; 111 return true;
105 112
106 return false; 113 return false;
107} 114}
108 115
109static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) 116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
110{ 117{
111 loff_t vaddr; 118 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
112 119}
113 vaddr = vma->vm_start + offset;
114 vaddr -= vma->vm_pgoff << PAGE_SHIFT;
115 120
116 return vaddr; 121static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
122{
123 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
117} 124}
118 125
119/** 126/**
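
offset_to_vaddr() and vaddr_to_offset() above replace the old signed vma_address() arithmetic with an exact round trip between a file offset and a user address. The user-space mock below (simplified types, made-up numbers) only demonstrates that round trip; it is not kernel code.

#include <stdio.h>

#define PAGE_SHIFT	12

struct vma { unsigned long vm_start; unsigned long vm_pgoff; };

static unsigned long offset_to_vaddr(struct vma *vma, long long offset)
{
	return vma->vm_start + offset - ((long long)vma->vm_pgoff << PAGE_SHIFT);
}

static long long vaddr_to_offset(struct vma *vma, unsigned long vaddr)
{
	return ((long long)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

int main(void)
{
	/* vma mapped at 0x400000, starting at file offset 0x2000 (pgoff 2) */
	struct vma vma = { .vm_start = 0x400000, .vm_pgoff = 2 };
	unsigned long va = offset_to_vaddr(&vma, 0x2468);

	/* prints vaddr=0x400468 offset=0x2468 */
	printf("vaddr=%#lx offset=%#llx\n", va, vaddr_to_offset(&vma, va));
	return 0;
}
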
@@ -121,41 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
121 * based on replace_page in mm/ksm.c 128 * based on replace_page in mm/ksm.c
122 * 129 *
123 * @vma: vma that holds the pte pointing to page 130 * @vma: vma that holds the pte pointing to page
131 * @addr: address the old @page is mapped at
124 * @page: the cowed page we are replacing by kpage 132 * @page: the cowed page we are replacing by kpage
125 * @kpage: the modified page we replace page by 133 * @kpage: the modified page we replace page by
126 * 134 *
127 * Returns 0 on success, -EFAULT on failure. 135 * Returns 0 on success, -EFAULT on failure.
128 */ 136 */
129static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) 137static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
138 struct page *page, struct page *kpage)
130{ 139{
131 struct mm_struct *mm = vma->vm_mm; 140 struct mm_struct *mm = vma->vm_mm;
132 pgd_t *pgd;
133 pud_t *pud;
134 pmd_t *pmd;
135 pte_t *ptep;
136 spinlock_t *ptl; 141 spinlock_t *ptl;
137 unsigned long addr; 142 pte_t *ptep;
138 int err = -EFAULT; 143 int err;
139
140 addr = page_address_in_vma(page, vma);
141 if (addr == -EFAULT)
142 goto out;
143
144 pgd = pgd_offset(mm, addr);
145 if (!pgd_present(*pgd))
146 goto out;
147
148 pud = pud_offset(pgd, addr);
149 if (!pud_present(*pud))
150 goto out;
151 144
152 pmd = pmd_offset(pud, addr); 145 /* For try_to_free_swap() and munlock_vma_page() below */
153 if (!pmd_present(*pmd)) 146 lock_page(page);
154 goto out;
155 147
156 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 148 err = -EAGAIN;
149 ptep = page_check_address(page, mm, addr, &ptl, 0);
157 if (!ptep) 150 if (!ptep)
158 goto out; 151 goto unlock;
159 152
160 get_page(kpage); 153 get_page(kpage);
161 page_add_new_anon_rmap(kpage, vma, addr); 154 page_add_new_anon_rmap(kpage, vma, addr);
@@ -172,11 +165,15 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
172 page_remove_rmap(page); 165 page_remove_rmap(page);
173 if (!page_mapped(page)) 166 if (!page_mapped(page))
174 try_to_free_swap(page); 167 try_to_free_swap(page);
175 put_page(page);
176 pte_unmap_unlock(ptep, ptl); 168 pte_unmap_unlock(ptep, ptl);
177 err = 0;
178 169
179out: 170 if (vma->vm_flags & VM_LOCKED)
171 munlock_vma_page(page);
172 put_page(page);
173
174 err = 0;
175 unlock:
176 unlock_page(page);
180 return err; 177 return err;
181} 178}
182 179
@@ -218,79 +215,46 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
218 unsigned long vaddr, uprobe_opcode_t opcode) 215 unsigned long vaddr, uprobe_opcode_t opcode)
219{ 216{
220 struct page *old_page, *new_page; 217 struct page *old_page, *new_page;
221 struct address_space *mapping;
222 void *vaddr_old, *vaddr_new; 218 void *vaddr_old, *vaddr_new;
223 struct vm_area_struct *vma; 219 struct vm_area_struct *vma;
224 struct uprobe *uprobe;
225 loff_t addr;
226 int ret; 220 int ret;
227 221
222retry:
228 /* Read the page with vaddr into memory */ 223 /* Read the page with vaddr into memory */
229 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
230 if (ret <= 0) 225 if (ret <= 0)
231 return ret; 226 return ret;
232 227
233 ret = -EINVAL;
234
235 /*
236 * We are interested in text pages only. Our pages of interest
237 * should be mapped for read and execute only. We desist from
238 * adding probes in write mapped pages since the breakpoints
239 * might end up in the file copy.
240 */
241 if (!valid_vma(vma, is_swbp_insn(&opcode)))
242 goto put_out;
243
244 uprobe = container_of(auprobe, struct uprobe, arch);
245 mapping = uprobe->inode->i_mapping;
246 if (mapping != vma->vm_file->f_mapping)
247 goto put_out;
248
249 addr = vma_address(vma, uprobe->offset);
250 if (vaddr != (unsigned long)addr)
251 goto put_out;
252
253 ret = -ENOMEM; 228 ret = -ENOMEM;
254 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
255 if (!new_page) 230 if (!new_page)
256 goto put_out; 231 goto put_old;
257 232
258 __SetPageUptodate(new_page); 233 __SetPageUptodate(new_page);
259 234
260 /*
261 * lock page will serialize against do_wp_page()'s
262 * PageAnon() handling
263 */
264 lock_page(old_page);
265 /* copy the page now that we've got it stable */ 235 /* copy the page now that we've got it stable */
266 vaddr_old = kmap_atomic(old_page); 236 vaddr_old = kmap_atomic(old_page);
267 vaddr_new = kmap_atomic(new_page); 237 vaddr_new = kmap_atomic(new_page);
268 238
269 memcpy(vaddr_new, vaddr_old, PAGE_SIZE); 239 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
270 240 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
271 /* poke the new insn in, ASSUMES we don't cross page boundary */
272 vaddr &= ~PAGE_MASK;
273 BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
274 memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
275 241
276 kunmap_atomic(vaddr_new); 242 kunmap_atomic(vaddr_new);
277 kunmap_atomic(vaddr_old); 243 kunmap_atomic(vaddr_old);
278 244
279 ret = anon_vma_prepare(vma); 245 ret = anon_vma_prepare(vma);
280 if (ret) 246 if (ret)
281 goto unlock_out; 247 goto put_new;
282 248
283 lock_page(new_page); 249 ret = __replace_page(vma, vaddr, old_page, new_page);
284 ret = __replace_page(vma, old_page, new_page);
285 unlock_page(new_page);
286 250
287unlock_out: 251put_new:
288 unlock_page(old_page);
289 page_cache_release(new_page); 252 page_cache_release(new_page);
290 253put_old:
291put_out:
292 put_page(old_page); 254 put_page(old_page);
293 255
256 if (unlikely(ret == -EAGAIN))
257 goto retry;
294 return ret; 258 return ret;
295} 259}
296 260
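
write_opcode() now loops back to its retry label whenever __replace_page() returns -EAGAIN, i.e. when page_check_address() finds that the mapping changed between get_user_pages() and the replacement. Stripped of the uprobes specifics, the control flow is the ordinary optimistic-retry idiom; a stand-alone toy version follows, with an obviously artificial failure injection.

#include <stdio.h>
#include <errno.h>

/* stand-in for __replace_page(): fail twice with -EAGAIN, then succeed */
static int try_replace(int *attempts)
{
	return (*attempts)++ < 2 ? -EAGAIN : 0;
}

static int write_opcode_like(void)
{
	int attempts = 0;
	int ret;

retry:
	/* ...pin the old page, build the modified copy... */
	ret = try_replace(&attempts);
	/* ...drop the per-attempt references... */
	if (ret == -EAGAIN)
		goto retry;
	return ret;
}

int main(void)
{
	printf("ret=%d after retries\n", write_opcode_like());	/* ret=0 */
	return 0;
}
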
@@ -312,7 +276,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
312 void *vaddr_new; 276 void *vaddr_new;
313 int ret; 277 int ret;
314 278
315 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); 279 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
316 if (ret <= 0) 280 if (ret <= 0)
317 return ret; 281 return ret;
318 282
@@ -333,10 +297,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
333 uprobe_opcode_t opcode; 297 uprobe_opcode_t opcode;
334 int result; 298 int result;
335 299
300 if (current->mm == mm) {
301 pagefault_disable();
302 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
303 sizeof(opcode));
304 pagefault_enable();
305
306 if (likely(result == 0))
307 goto out;
308 }
309
336 result = read_opcode(mm, vaddr, &opcode); 310 result = read_opcode(mm, vaddr, &opcode);
337 if (result) 311 if (result)
338 return result; 312 return result;
339 313out:
340 if (is_swbp_insn(&opcode)) 314 if (is_swbp_insn(&opcode))
341 return 1; 315 return 1;
342 316
@@ -355,7 +329,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
355int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 329int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
356{ 330{
357 int result; 331 int result;
358 332 /*
333 * See the comment near uprobes_hash().
334 */
359 result = is_swbp_at_addr(mm, vaddr); 335 result = is_swbp_at_addr(mm, vaddr);
360 if (result == 1) 336 if (result == 1)
361 return -EEXIST; 337 return -EEXIST;
@@ -520,7 +496,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
520 uprobe->inode = igrab(inode); 496 uprobe->inode = igrab(inode);
521 uprobe->offset = offset; 497 uprobe->offset = offset;
522 init_rwsem(&uprobe->consumer_rwsem); 498 init_rwsem(&uprobe->consumer_rwsem);
523 INIT_LIST_HEAD(&uprobe->pending_list);
524 499
525 /* add to uprobes_tree, sorted on inode:offset */ 500 /* add to uprobes_tree, sorted on inode:offset */
526 cur_uprobe = insert_uprobe(uprobe); 501 cur_uprobe = insert_uprobe(uprobe);
@@ -588,20 +563,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
588} 563}
589 564
590static int 565static int
591__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, 566__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
592 unsigned long nbytes, unsigned long offset) 567 unsigned long nbytes, loff_t offset)
593{ 568{
594 struct file *filp = vma->vm_file;
595 struct page *page; 569 struct page *page;
596 void *vaddr; 570 void *vaddr;
597 unsigned long off1; 571 unsigned long off;
598 unsigned long idx; 572 pgoff_t idx;
599 573
600 if (!filp) 574 if (!filp)
601 return -EINVAL; 575 return -EINVAL;
602 576
603 idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); 577 if (!mapping->a_ops->readpage)
604 off1 = offset &= ~PAGE_MASK; 578 return -EIO;
579
580 idx = offset >> PAGE_CACHE_SHIFT;
581 off = offset & ~PAGE_MASK;
605 582
606 /* 583 /*
607 * Ensure that the page that has the original instruction is 584 * Ensure that the page that has the original instruction is
@@ -612,22 +589,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins
612 return PTR_ERR(page); 589 return PTR_ERR(page);
613 590
614 vaddr = kmap_atomic(page); 591 vaddr = kmap_atomic(page);
615 memcpy(insn, vaddr + off1, nbytes); 592 memcpy(insn, vaddr + off, nbytes);
616 kunmap_atomic(vaddr); 593 kunmap_atomic(vaddr);
617 page_cache_release(page); 594 page_cache_release(page);
618 595
619 return 0; 596 return 0;
620} 597}
621 598
622static int 599static int copy_insn(struct uprobe *uprobe, struct file *filp)
623copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
624{ 600{
625 struct address_space *mapping; 601 struct address_space *mapping;
626 unsigned long nbytes; 602 unsigned long nbytes;
627 int bytes; 603 int bytes;
628 604
629 addr &= ~PAGE_MASK; 605 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
630 nbytes = PAGE_SIZE - addr;
631 mapping = uprobe->inode->i_mapping; 606 mapping = uprobe->inode->i_mapping;
632 607
633 /* Instruction at end of binary; copy only available bytes */ 608 /* Instruction at end of binary; copy only available bytes */
@@ -638,13 +613,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
638 613
639 /* Instruction at the page-boundary; copy bytes in second page */ 614 /* Instruction at the page-boundary; copy bytes in second page */
640 if (nbytes < bytes) { 615 if (nbytes < bytes) {
641 if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, 616 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
642 bytes - nbytes, uprobe->offset + nbytes)) 617 bytes - nbytes, uprobe->offset + nbytes);
643 return -ENOMEM; 618 if (err)
644 619 return err;
645 bytes = nbytes; 620 bytes = nbytes;
646 } 621 }
647 return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); 622 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
648} 623}
649 624
650/* 625/*
@@ -672,9 +647,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
672 */ 647 */
673static int 648static int
674install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, 649install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
675 struct vm_area_struct *vma, loff_t vaddr) 650 struct vm_area_struct *vma, unsigned long vaddr)
676{ 651{
677 unsigned long addr;
678 int ret; 652 int ret;
679 653
680 /* 654 /*
@@ -687,20 +661,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
687 if (!uprobe->consumers) 661 if (!uprobe->consumers)
688 return -EEXIST; 662 return -EEXIST;
689 663
690 addr = (unsigned long)vaddr;
691
692 if (!(uprobe->flags & UPROBE_COPY_INSN)) { 664 if (!(uprobe->flags & UPROBE_COPY_INSN)) {
693 ret = copy_insn(uprobe, vma, addr); 665 ret = copy_insn(uprobe, vma->vm_file);
694 if (ret) 666 if (ret)
695 return ret; 667 return ret;
696 668
697 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 669 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
698 return -EEXIST; 670 return -ENOTSUPP;
699 671
700 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); 672 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
701 if (ret) 673 if (ret)
702 return ret; 674 return ret;
703 675
676 /* write_opcode() assumes we don't cross page boundary */
677 BUG_ON((uprobe->offset & ~PAGE_MASK) +
678 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
679
704 uprobe->flags |= UPROBE_COPY_INSN; 680 uprobe->flags |= UPROBE_COPY_INSN;
705 } 681 }
706 682
@@ -713,7 +689,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
713 * Hence increment before and decrement on failure. 689 * Hence increment before and decrement on failure.
714 */ 690 */
715 atomic_inc(&mm->uprobes_state.count); 691 atomic_inc(&mm->uprobes_state.count);
716 ret = set_swbp(&uprobe->arch, mm, addr); 692 ret = set_swbp(&uprobe->arch, mm, vaddr);
717 if (ret) 693 if (ret)
718 atomic_dec(&mm->uprobes_state.count); 694 atomic_dec(&mm->uprobes_state.count);
719 695
@@ -721,27 +697,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
721} 697}
722 698
723static void 699static void
724remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) 700remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
725{ 701{
726 if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) 702 if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
727 atomic_dec(&mm->uprobes_state.count); 703 atomic_dec(&mm->uprobes_state.count);
728} 704}
729 705
730/* 706/*
731 * There could be threads that have hit the breakpoint and are entering the 707 * There could be threads that have already hit the breakpoint. They
732 * notifier code and trying to acquire the uprobes_treelock. The thread 708 * will recheck the current insn and restart if find_uprobe() fails.
733 * calling delete_uprobe() that is removing the uprobe from the rb_tree can 709 * See find_active_uprobe().
734 * race with these threads and might acquire the uprobes_treelock compared
735 * to some of the breakpoint hit threads. In such a case, the breakpoint
736 * hit threads will not find the uprobe. The current unregistering thread
737 * waits till all other threads have hit a breakpoint, to acquire the
738 * uprobes_treelock before the uprobe is removed from the rbtree.
739 */ 710 */
740static void delete_uprobe(struct uprobe *uprobe) 711static void delete_uprobe(struct uprobe *uprobe)
741{ 712{
742 unsigned long flags; 713 unsigned long flags;
743 714
744 synchronize_srcu(&uprobes_srcu);
745 spin_lock_irqsave(&uprobes_treelock, flags); 715 spin_lock_irqsave(&uprobes_treelock, flags);
746 rb_erase(&uprobe->rb_node, &uprobes_tree); 716 rb_erase(&uprobe->rb_node, &uprobes_tree);
747 spin_unlock_irqrestore(&uprobes_treelock, flags); 717 spin_unlock_irqrestore(&uprobes_treelock, flags);
@@ -750,139 +720,136 @@ static void delete_uprobe(struct uprobe *uprobe)
750 atomic_dec(&uprobe_events); 720 atomic_dec(&uprobe_events);
751} 721}
752 722
753static struct vma_info * 723struct map_info {
754__find_next_vma_info(struct address_space *mapping, struct list_head *head, 724 struct map_info *next;
755 struct vma_info *vi, loff_t offset, bool is_register) 725 struct mm_struct *mm;
726 unsigned long vaddr;
727};
728
729static inline struct map_info *free_map_info(struct map_info *info)
730{
731 struct map_info *next = info->next;
732 kfree(info);
733 return next;
734}
735
736static struct map_info *
737build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
756{ 738{
739 unsigned long pgoff = offset >> PAGE_SHIFT;
757 struct prio_tree_iter iter; 740 struct prio_tree_iter iter;
758 struct vm_area_struct *vma; 741 struct vm_area_struct *vma;
759 struct vma_info *tmpvi; 742 struct map_info *curr = NULL;
760 unsigned long pgoff; 743 struct map_info *prev = NULL;
761 int existing_vma; 744 struct map_info *info;
762 loff_t vaddr; 745 int more = 0;
763
764 pgoff = offset >> PAGE_SHIFT;
765 746
747 again:
748 mutex_lock(&mapping->i_mmap_mutex);
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 749 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (!valid_vma(vma, is_register)) 750 if (!valid_vma(vma, is_register))
768 continue; 751 continue;
769 752
770 existing_vma = 0; 753 if (!prev && !more) {
771 vaddr = vma_address(vma, offset); 754 /*
772 755 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
773 list_for_each_entry(tmpvi, head, probe_list) { 756 * reclaim. This is optimistic, no harm done if it fails.
774 if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { 757 */
775 existing_vma = 1; 758 prev = kmalloc(sizeof(struct map_info),
776 break; 759 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
777 } 760 if (prev)
761 prev->next = NULL;
778 } 762 }
779 763 if (!prev) {
780 /* 764 more++;
781 * Another vma needs a probe to be installed. However skip 765 continue;
782 * installing the probe if the vma is about to be unlinked.
783 */
784 if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
785 vi->mm = vma->vm_mm;
786 vi->vaddr = vaddr;
787 list_add(&vi->probe_list, head);
788
789 return vi;
790 } 766 }
791 }
792
793 return NULL;
794}
795 767
796/* 768 if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
797 * Iterate in the rmap prio tree and find a vma where a probe has not 769 continue;
798 * yet been inserted.
799 */
800static struct vma_info *
801find_next_vma_info(struct address_space *mapping, struct list_head *head,
802 loff_t offset, bool is_register)
803{
804 struct vma_info *vi, *retvi;
805 770
806 vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); 771 info = prev;
807 if (!vi) 772 prev = prev->next;
808 return ERR_PTR(-ENOMEM); 773 info->next = curr;
774 curr = info;
809 775
810 mutex_lock(&mapping->i_mmap_mutex); 776 info->mm = vma->vm_mm;
811 retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); 777 info->vaddr = offset_to_vaddr(vma, offset);
778 }
812 mutex_unlock(&mapping->i_mmap_mutex); 779 mutex_unlock(&mapping->i_mmap_mutex);
813 780
814 if (!retvi) 781 if (!more)
815 kfree(vi); 782 goto out;
783
784 prev = curr;
785 while (curr) {
786 mmput(curr->mm);
787 curr = curr->next;
788 }
816 789
817 return retvi; 790 do {
791 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
792 if (!info) {
793 curr = ERR_PTR(-ENOMEM);
794 goto out;
795 }
796 info->next = prev;
797 prev = info;
798 } while (--more);
799
800 goto again;
801 out:
802 while (prev)
803 prev = free_map_info(prev);
804 return curr;
818} 805}
819 806
820static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 807static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
821{ 808{
822 struct list_head try_list; 809 struct map_info *info;
823 struct vm_area_struct *vma; 810 int err = 0;
824 struct address_space *mapping;
825 struct vma_info *vi, *tmpvi;
826 struct mm_struct *mm;
827 loff_t vaddr;
828 int ret;
829
830 mapping = uprobe->inode->i_mapping;
831 INIT_LIST_HEAD(&try_list);
832 811
833 ret = 0; 812 info = build_map_info(uprobe->inode->i_mapping,
813 uprobe->offset, is_register);
814 if (IS_ERR(info))
815 return PTR_ERR(info);
834 816
835 for (;;) { 817 while (info) {
836 vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); 818 struct mm_struct *mm = info->mm;
837 if (!vi) 819 struct vm_area_struct *vma;
838 break;
839 820
840 if (IS_ERR(vi)) { 821 if (err)
841 ret = PTR_ERR(vi); 822 goto free;
842 break;
843 }
844 823
845 mm = vi->mm; 824 down_write(&mm->mmap_sem);
846 down_read(&mm->mmap_sem); 825 vma = find_vma(mm, info->vaddr);
847 vma = find_vma(mm, (unsigned long)vi->vaddr); 826 if (!vma || !valid_vma(vma, is_register) ||
848 if (!vma || !valid_vma(vma, is_register)) { 827 vma->vm_file->f_mapping->host != uprobe->inode)
849 list_del(&vi->probe_list); 828 goto unlock;
850 kfree(vi);
851 up_read(&mm->mmap_sem);
852 mmput(mm);
853 continue;
854 }
855 vaddr = vma_address(vma, uprobe->offset);
856 if (vma->vm_file->f_mapping->host != uprobe->inode ||
857 vaddr != vi->vaddr) {
858 list_del(&vi->probe_list);
859 kfree(vi);
860 up_read(&mm->mmap_sem);
861 mmput(mm);
862 continue;
863 }
864 829
865 if (is_register) 830 if (vma->vm_start > info->vaddr ||
866 ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); 831 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
867 else 832 goto unlock;
868 remove_breakpoint(uprobe, mm, vi->vaddr);
869 833
870 up_read(&mm->mmap_sem);
871 mmput(mm);
872 if (is_register) { 834 if (is_register) {
873 if (ret && ret == -EEXIST) 835 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
874 ret = 0; 836 /*
875 if (ret) 837 * We can race against uprobe_mmap(), see the
876 break; 838 * comment near uprobe_hash().
839 */
840 if (err == -EEXIST)
841 err = 0;
842 } else {
843 remove_breakpoint(uprobe, mm, info->vaddr);
877 } 844 }
845 unlock:
846 up_write(&mm->mmap_sem);
847 free:
848 mmput(mm);
849 info = free_map_info(info);
878 } 850 }
879 851
880 list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { 852 return err;
881 list_del(&vi->probe_list);
882 kfree(vi);
883 }
884
885 return ret;
886} 853}
887 854
888static int __uprobe_register(struct uprobe *uprobe) 855static int __uprobe_register(struct uprobe *uprobe)
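
build_map_info() above resolves an allocation-under-lock problem: while i_mmap_mutex is held it may only allocate with GFP_NOWAIT (a sleeping allocation could recurse through reclaim back into i_mmap_mutex), so it counts every vma it could not record, drops the lock, allocates the shortfall with GFP_KERNEL, and redoes the walk. The stand-alone model below shows only that two-pass pattern; the pool/node/collect names are illustrative, and user-space malloc() stands in for both allocation flavours.

#include <stdlib.h>

struct node { struct node *next; int val; };

static struct node *collect(const int *src, int n)
{
	struct node *pool = NULL, *out = NULL, *tmp;
	int i, missed;

again:
	missed = 0;
	out = NULL;
	/* --- "i_mmap_mutex held": only consume preallocated nodes --- */
	for (i = 0; i < n; i++) {
		if (!pool) {
			missed++;		/* remember the shortfall */
			continue;
		}
		tmp = pool;
		pool = pool->next;
		tmp->val = src[i];
		tmp->next = out;
		out = tmp;
	}
	/* --- "i_mmap_mutex dropped": safe to sleep --- */
	if (missed) {
		/* recycle what was built, top up the pool, walk again */
		while (out) {
			tmp = out->next;
			out->next = pool;
			pool = out;
			out = tmp;
		}
		while (missed--) {
			tmp = malloc(sizeof(*tmp));
			if (!tmp)
				goto fail;
			tmp->next = pool;
			pool = tmp;
		}
		goto again;
	}
	return out;		/* the pool is fully consumed at this point */

fail:
	while (pool) {
		tmp = pool->next;
		free(pool);
		pool = tmp;
	}
	return NULL;
}

int main(void)
{
	int src[] = { 1, 2, 3 };
	struct node *it, *next;

	for (it = collect(src, 3); it; it = next) {
		next = it->next;
		free(it);
	}
	return 0;
}
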
@@ -977,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
977 put_uprobe(uprobe); 944 put_uprobe(uprobe);
978} 945}
979 946
980/* 947static struct rb_node *
981 * Of all the nodes that correspond to the given inode, return the node 948find_node_in_range(struct inode *inode, loff_t min, loff_t max)
982 * with the least offset.
983 */
984static struct rb_node *find_least_offset_node(struct inode *inode)
985{ 949{
986 struct uprobe u = { .inode = inode, .offset = 0};
987 struct rb_node *n = uprobes_tree.rb_node; 950 struct rb_node *n = uprobes_tree.rb_node;
988 struct rb_node *close_node = NULL;
989 struct uprobe *uprobe;
990 int match;
991 951
992 while (n) { 952 while (n) {
993 uprobe = rb_entry(n, struct uprobe, rb_node); 953 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
994 match = match_uprobe(&u, uprobe);
995 954
996 if (uprobe->inode == inode) 955 if (inode < u->inode) {
997 close_node = n;
998
999 if (!match)
1000 return close_node;
1001
1002 if (match < 0)
1003 n = n->rb_left; 956 n = n->rb_left;
1004 else 957 } else if (inode > u->inode) {
1005 n = n->rb_right; 958 n = n->rb_right;
959 } else {
960 if (max < u->offset)
961 n = n->rb_left;
962 else if (min > u->offset)
963 n = n->rb_right;
964 else
965 break;
966 }
1006 } 967 }
1007 968
1008 return close_node; 969 return n;
1009} 970}
1010 971
1011/* 972/*
1012 * For a given inode, build a list of probes that need to be inserted. 973 * For a given range in vma, build a list of probes that need to be inserted.
1013 */ 974 */
1014static void build_probe_list(struct inode *inode, struct list_head *head) 975static void build_probe_list(struct inode *inode,
976 struct vm_area_struct *vma,
977 unsigned long start, unsigned long end,
978 struct list_head *head)
1015{ 979{
1016 struct uprobe *uprobe; 980 loff_t min, max;
1017 unsigned long flags; 981 unsigned long flags;
1018 struct rb_node *n; 982 struct rb_node *n, *t;
1019 983 struct uprobe *u;
1020 spin_lock_irqsave(&uprobes_treelock, flags);
1021
1022 n = find_least_offset_node(inode);
1023 984
1024 for (; n; n = rb_next(n)) { 985 INIT_LIST_HEAD(head);
1025 uprobe = rb_entry(n, struct uprobe, rb_node); 986 min = vaddr_to_offset(vma, start);
1026 if (uprobe->inode != inode) 987 max = min + (end - start) - 1;
1027 break;
1028 988
1029 list_add(&uprobe->pending_list, head); 989 spin_lock_irqsave(&uprobes_treelock, flags);
1030 atomic_inc(&uprobe->ref); 990 n = find_node_in_range(inode, min, max);
991 if (n) {
992 for (t = n; t; t = rb_prev(t)) {
993 u = rb_entry(t, struct uprobe, rb_node);
994 if (u->inode != inode || u->offset < min)
995 break;
996 list_add(&u->pending_list, head);
997 atomic_inc(&u->ref);
998 }
999 for (t = n; (t = rb_next(t)); ) {
1000 u = rb_entry(t, struct uprobe, rb_node);
1001 if (u->inode != inode || u->offset > max)
1002 break;
1003 list_add(&u->pending_list, head);
1004 atomic_inc(&u->ref);
1005 }
1031 } 1006 }
1032
1033 spin_unlock_irqrestore(&uprobes_treelock, flags); 1007 spin_unlock_irqrestore(&uprobes_treelock, flags);
1034} 1008}
1035 1009
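
find_node_in_range() and build_probe_list() above use the standard rbtree range-collection technique: binary-search for any node whose key lies inside [min, max], then sweep rb_prev() from it to pick up the lower part of the range and rb_next() for the upper part. The fragment below restates that sweep with a single long key standing in for the (inode, offset) pair; the probe structure is illustrative, but rb_prev()/rb_next()/rb_entry() are the real rbtree helpers.

#include <linux/rbtree.h>
#include <linux/list.h>

struct probe {
	struct rb_node		rb_node;
	long			key;		/* stands in for inode:offset */
	struct list_head	pending;
};

/*
 * 'n' is any node already known to satisfy min <= key <= max (found by
 * an ordinary rbtree search); gather every node in the range onto 'head'.
 */
static void collect_range(struct rb_node *n, long min, long max,
			  struct list_head *head)
{
	struct rb_node *t;
	struct probe *p;

	for (t = n; t; t = rb_prev(t)) {
		p = rb_entry(t, struct probe, rb_node);
		if (p->key < min)
			break;
		list_add(&p->pending, head);
	}
	for (t = n; (t = rb_next(t)); ) {
		p = rb_entry(t, struct probe, rb_node);
		if (p->key > max)
			break;
		list_add(&p->pending, head);
	}
}
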
@@ -1059,28 +1033,21 @@ int uprobe_mmap(struct vm_area_struct *vma)
1059 if (!inode) 1033 if (!inode)
1060 return 0; 1034 return 0;
1061 1035
1062 INIT_LIST_HEAD(&tmp_list);
1063 mutex_lock(uprobes_mmap_hash(inode)); 1036 mutex_lock(uprobes_mmap_hash(inode));
1064 build_probe_list(inode, &tmp_list); 1037 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1065 1038
1066 ret = 0; 1039 ret = 0;
1067 count = 0; 1040 count = 0;
1068 1041
1069 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1042 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1070 loff_t vaddr;
1071
1072 list_del(&uprobe->pending_list);
1073 if (!ret) { 1043 if (!ret) {
1074 vaddr = vma_address(vma, uprobe->offset); 1044 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1075
1076 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1077 put_uprobe(uprobe);
1078 continue;
1079 }
1080 1045
1081 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1046 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1082 1047 /*
1083 /* Ignore double add: */ 1048 * We can race against uprobe_register(), see the
1049 * comment near uprobe_hash().
1050 */
1084 if (ret == -EEXIST) { 1051 if (ret == -EEXIST) {
1085 ret = 0; 1052 ret = 0;
1086 1053
@@ -1121,6 +1088,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1121 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1088 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1122 return; 1089 return;
1123 1090
1091 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1092 return;
1093
1124 if (!atomic_read(&vma->vm_mm->uprobes_state.count)) 1094 if (!atomic_read(&vma->vm_mm->uprobes_state.count))
1125 return; 1095 return;
1126 1096
@@ -1128,24 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1128 if (!inode) 1098 if (!inode)
1129 return; 1099 return;
1130 1100
1131 INIT_LIST_HEAD(&tmp_list);
1132 mutex_lock(uprobes_mmap_hash(inode)); 1101 mutex_lock(uprobes_mmap_hash(inode));
1133 build_probe_list(inode, &tmp_list); 1102 build_probe_list(inode, vma, start, end, &tmp_list);
1134 1103
1135 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1104 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1136 loff_t vaddr; 1105 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1137 1106 /*
1138 list_del(&uprobe->pending_list); 1107 * An unregister could have removed the probe before
1139 vaddr = vma_address(vma, uprobe->offset); 1108 * unmap. So check before we decrement the count.
1140 1109 */
1141 if (vaddr >= start && vaddr < end) { 1110 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1142 /* 1111 atomic_dec(&vma->vm_mm->uprobes_state.count);
1143 * An unregister could have removed the probe before
1144 * unmap. So check before we decrement the count.
1145 */
1146 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1147 atomic_dec(&vma->vm_mm->uprobes_state.count);
1148 }
1149 put_uprobe(uprobe); 1112 put_uprobe(uprobe);
1150 } 1113 }
1151 mutex_unlock(uprobes_mmap_hash(inode)); 1114 mutex_unlock(uprobes_mmap_hash(inode));
@@ -1378,9 +1341,6 @@ void uprobe_free_utask(struct task_struct *t)
1378{ 1341{
1379 struct uprobe_task *utask = t->utask; 1342 struct uprobe_task *utask = t->utask;
1380 1343
1381 if (t->uprobe_srcu_id != -1)
1382 srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
1383
1384 if (!utask) 1344 if (!utask)
1385 return; 1345 return;
1386 1346
@@ -1398,7 +1358,6 @@ void uprobe_free_utask(struct task_struct *t)
1398void uprobe_copy_process(struct task_struct *t) 1358void uprobe_copy_process(struct task_struct *t)
1399{ 1359{
1400 t->utask = NULL; 1360 t->utask = NULL;
1401 t->uprobe_srcu_id = -1;
1402} 1361}
1403 1362
1404/* 1363/*
@@ -1417,7 +1376,6 @@ static struct uprobe_task *add_utask(void)
1417 if (unlikely(!utask)) 1376 if (unlikely(!utask))
1418 return NULL; 1377 return NULL;
1419 1378
1420 utask->active_uprobe = NULL;
1421 current->utask = utask; 1379 current->utask = utask;
1422 return utask; 1380 return utask;
1423} 1381}
@@ -1479,41 +1437,61 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1479 return false; 1437 return false;
1480} 1438}
1481 1439
1440static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1441{
1442 struct mm_struct *mm = current->mm;
1443 struct uprobe *uprobe = NULL;
1444 struct vm_area_struct *vma;
1445
1446 down_read(&mm->mmap_sem);
1447 vma = find_vma(mm, bp_vaddr);
1448 if (vma && vma->vm_start <= bp_vaddr) {
1449 if (valid_vma(vma, false)) {
1450 struct inode *inode = vma->vm_file->f_mapping->host;
1451 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1452
1453 uprobe = find_uprobe(inode, offset);
1454 }
1455
1456 if (!uprobe)
1457 *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
1458 } else {
1459 *is_swbp = -EFAULT;
1460 }
1461 up_read(&mm->mmap_sem);
1462
1463 return uprobe;
1464}
1465
1482/* 1466/*
1483 * Run handler and ask thread to singlestep. 1467 * Run handler and ask thread to singlestep.
1484 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1468 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1485 */ 1469 */
1486static void handle_swbp(struct pt_regs *regs) 1470static void handle_swbp(struct pt_regs *regs)
1487{ 1471{
1488 struct vm_area_struct *vma;
1489 struct uprobe_task *utask; 1472 struct uprobe_task *utask;
1490 struct uprobe *uprobe; 1473 struct uprobe *uprobe;
1491 struct mm_struct *mm;
1492 unsigned long bp_vaddr; 1474 unsigned long bp_vaddr;
1475 int uninitialized_var(is_swbp);
1493 1476
1494 uprobe = NULL;
1495 bp_vaddr = uprobe_get_swbp_addr(regs); 1477 bp_vaddr = uprobe_get_swbp_addr(regs);
1496 mm = current->mm; 1478 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1497 down_read(&mm->mmap_sem);
1498 vma = find_vma(mm, bp_vaddr);
1499
1500 if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
1501 struct inode *inode;
1502 loff_t offset;
1503
1504 inode = vma->vm_file->f_mapping->host;
1505 offset = bp_vaddr - vma->vm_start;
1506 offset += (vma->vm_pgoff << PAGE_SHIFT);
1507 uprobe = find_uprobe(inode, offset);
1508 }
1509
1510 srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
1511 current->uprobe_srcu_id = -1;
1512 up_read(&mm->mmap_sem);
1513 1479
1514 if (!uprobe) { 1480 if (!uprobe) {
1515 /* No matching uprobe; signal SIGTRAP. */ 1481 if (is_swbp > 0) {
1516 send_sig(SIGTRAP, current, 0); 1482 /* No matching uprobe; signal SIGTRAP. */
1483 send_sig(SIGTRAP, current, 0);
1484 } else {
1485 /*
1486 * Either we raced with uprobe_unregister() or we can't
1487 * access this memory. The latter is only possible if
1488 * another thread plays with our ->mm. In both cases
1489 * we can simply restart. If this vma was unmapped we
1490 * can pretend this insn was not executed yet and get
1491 * the (correct) SIGSEGV after restart.
1492 */
1493 instruction_pointer_set(regs, bp_vaddr);
1494 }
1517 return; 1495 return;
1518 } 1496 }
1519 1497
@@ -1620,7 +1598,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1620 utask->state = UTASK_BP_HIT; 1598 utask->state = UTASK_BP_HIT;
1621 1599
1622 set_thread_flag(TIF_UPROBE); 1600 set_thread_flag(TIF_UPROBE);
1623 current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
1624 1601
1625 return 1; 1602 return 1;
1626} 1603}
@@ -1655,7 +1632,6 @@ static int __init init_uprobes(void)
1655 mutex_init(&uprobes_mutex[i]); 1632 mutex_init(&uprobes_mutex[i]);
1656 mutex_init(&uprobes_mmap_mutex[i]); 1633 mutex_init(&uprobes_mmap_mutex[i]);
1657 } 1634 }
1658 init_srcu_struct(&uprobes_srcu);
1659 1635
1660 return register_die_notifier(&uprobe_exception_nb); 1636 return register_die_notifier(&uprobe_exception_nb);
1661} 1637}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f59cc334516..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
483 rcu_read_unlock(); 483 rcu_read_unlock();
484 for (;;) { 484 for (;;) {
485 unsigned long set; 485 unsigned long set;
486 i = j * __NFDBITS; 486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds) 487 if (i >= fdt->max_fds)
488 break; 488 break;
489 set = fdt->open_fds[j++]; 489 set = fdt->open_fds[j++];
@@ -953,14 +953,11 @@ void do_exit(long code)
953 exit_signals(tsk); /* sets PF_EXITING */ 953 exit_signals(tsk); /* sets PF_EXITING */
954 /* 954 /*
955 * tsk->flags are checked in the futex code to protect against 955 * tsk->flags are checked in the futex code to protect against
956 * an exiting task cleaning up the robust pi futexes, and in 956 * an exiting task cleaning up the robust pi futexes.
957 * task_work_add() to avoid the race with exit_task_work().
958 */ 957 */
959 smp_mb(); 958 smp_mb();
960 raw_spin_unlock_wait(&tsk->pi_lock); 959 raw_spin_unlock_wait(&tsk->pi_lock);
961 960
962 exit_task_work(tsk);
963
964 if (unlikely(in_atomic())) 961 if (unlikely(in_atomic()))
965 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 962 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
966 current->comm, task_pid_nr(current), 963 current->comm, task_pid_nr(current),
@@ -995,6 +992,7 @@ void do_exit(long code)
995 exit_shm(tsk); 992 exit_shm(tsk);
996 exit_files(tsk); 993 exit_files(tsk);
997 exit_fs(tsk); 994 exit_fs(tsk);
995 exit_task_work(tsk);
998 check_stack_usage(); 996 check_stack_usage();
999 exit_thread(); 997 exit_thread();
1000 998
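
The do_exit() reshuffle above runs exit_task_work() from the normal teardown sequence, right after exit_files()/exit_fs(), and drops the old comment about racing with task_work_add(). For orientation, the API whose flush point is being moved is used roughly like this; the my_* names are made up, and the callback_head-based form shown is the one this merge window converts task_works to.

static void my_twork_func(struct callback_head *twork)
{
	/* runs in the target task's own context, e.g. on its way out
	 * of the kernel or, as of this change, late in do_exit() */
}

static struct callback_head my_twork;

static int my_queue_work(struct task_struct *task)
{
	init_task_work(&my_twork, my_twork_func);
	return task_work_add(task, &my_twork, true);	/* true: notify the task */
}
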
diff --git a/kernel/fork.c b/kernel/fork.c
index f00e319d8376..2c8857e12855 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
114 return total; 114 return total;
115} 115}
116 116
117void __weak arch_release_task_struct(struct task_struct *tsk)
118{
119}
120
117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
118static struct kmem_cache *task_struct_cachep; 122static struct kmem_cache *task_struct_cachep;
119 123
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); 126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123} 127}
124 128
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk) 129static inline void free_task_struct(struct task_struct *tsk)
128{ 130{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk); 131 kmem_cache_free(task_struct_cachep, tsk);
131} 132}
132#endif 133#endif
133 134
135void __weak arch_release_thread_info(struct thread_info *ti)
136{
137}
138
134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136 140
137/* 141/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
150 154
151static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
152{ 156{
153 arch_release_thread_info(ti);
154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
155} 158}
156# else 159# else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164 167
165static void free_thread_info(struct thread_info *ti) 168static void free_thread_info(struct thread_info *ti)
166{ 169{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti); 170 kmem_cache_free(thread_info_cache, ti);
169} 171}
170 172
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
205void free_task(struct task_struct *tsk) 207void free_task(struct task_struct *tsk)
206{ 208{
207 account_kernel_stack(tsk->stack, -1); 209 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
208 free_thread_info(tsk->stack); 211 free_thread_info(tsk->stack);
209 rt_mutex_debug_task_free(tsk); 212 rt_mutex_debug_task_free(tsk);
210 ftrace_graph_exit_task(tsk); 213 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk); 214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
212 free_task_struct(tsk); 216 free_task_struct(tsk);
213} 217}
214EXPORT_SYMBOL(free_task); 218EXPORT_SYMBOL(free_task);
@@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
298 return NULL; 302 return NULL;
299 303
300 ti = alloc_thread_info_node(tsk, node); 304 ti = alloc_thread_info_node(tsk, node);
301 if (!ti) { 305 if (!ti)
302 free_task_struct(tsk); 306 goto free_tsk;
303 return NULL;
304 }
305 307
306 err = arch_dup_task_struct(tsk, orig); 308 err = arch_dup_task_struct(tsk, orig);
309 if (err)
310 goto free_ti;
307 311
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
312 tsk->stack = ti; 312 tsk->stack = ti;
313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317 313
314 setup_thread_stack(tsk, orig);
318 clear_user_return_notifier(tsk); 315 clear_user_return_notifier(tsk);
319 clear_tsk_need_resched(tsk); 316 clear_tsk_need_resched(tsk);
320 stackend = end_of_stack(tsk); 317 stackend = end_of_stack(tsk);
@@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
338 335
339 return tsk; 336 return tsk;
340 337
341out: 338free_ti:
342 free_thread_info(ti); 339 free_thread_info(ti);
340free_tsk:
343 free_task_struct(tsk); 341 free_task_struct(tsk);
344 return NULL; 342 return NULL;
345} 343}
@@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 struct file *file; 381 struct file *file;
384 382
385 if (mpnt->vm_flags & VM_DONTCOPY) { 383 if (mpnt->vm_flags & VM_DONTCOPY) {
386 long pages = vma_pages(mpnt);
387 mm->total_vm -= pages;
388 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 384 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
389 -pages); 385 -vma_pages(mpnt));
390 continue; 386 continue;
391 } 387 }
392 charge = 0; 388 charge = 0;
393 if (mpnt->vm_flags & VM_ACCOUNT) { 389 if (mpnt->vm_flags & VM_ACCOUNT) {
394 unsigned long len; 390 unsigned long len = vma_pages(mpnt);
395 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 391
396 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 392 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
397 goto fail_nomem; 393 goto fail_nomem;
398 charge = len; 394 charge = len;
@@ -459,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
459 if (retval) 455 if (retval)
460 goto out; 456 goto out;
461 457
462 if (file && uprobe_mmap(tmp)) 458 if (file)
463 goto out; 459 uprobe_mmap(tmp);
464 } 460 }
465 /* a new mm has just been created */ 461 /* a new mm has just been created */
466 arch_dup_mmap(oldmm, mm); 462 arch_dup_mmap(oldmm, mm);
@@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#ifdef CONFIG_DEBUG_MUTEXES 1306#ifdef CONFIG_DEBUG_MUTEXES
1311 p->blocked_on = NULL; /* not blocked yet */ 1307 p->blocked_on = NULL; /* not blocked yet */
1312#endif 1308#endif
1313#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1309#ifdef CONFIG_MEMCG
1314 p->memcg_batch.do_batch = 0; 1310 p->memcg_batch.do_batch = 0;
1315 p->memcg_batch.memcg = NULL; 1311 p->memcg_batch.memcg = NULL;
1316#endif 1312#endif
@@ -1420,7 +1416,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1420 */ 1416 */
1421 p->group_leader = p; 1417 p->group_leader = p;
1422 INIT_LIST_HEAD(&p->thread_group); 1418 INIT_LIST_HEAD(&p->thread_group);
1423 INIT_HLIST_HEAD(&p->task_works); 1419 p->task_works = NULL;
1424 1420
1425 /* Now that the task is set up, run cgroup callbacks if 1421 /* Now that the task is set up, run cgroup callbacks if
1426 * necessary. We need to run them before the task is visible 1422 * necessary. We need to run them before the task is visible
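
fork.c now provides __weak no-op defaults for arch_release_task_struct() and arch_release_thread_info() and calls them once from free_task(), instead of hiding them inside each allocator-specific free helper. An architecture overrides a hook simply by providing a non-weak definition; the sketch below is hypothetical, and the coproc_state field is invented purely for illustration.

/* arch/foo/kernel/process.c (hypothetical) */
void arch_release_task_struct(struct task_struct *tsk)
{
	/* free per-task state the generic code knows nothing about;
	 * 'coproc_state' is a made-up field for the example */
	kfree(tsk->thread.coproc_state);
	tsk->thread.coproc_state = NULL;
}
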
diff --git a/kernel/futex.c b/kernel/futex.c
index e2b0fb9a0b3b..3717e7b306e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2231 * @uaddr2: the pi futex we will take prior to returning to user-space 2231 * @uaddr2: the pi futex we will take prior to returning to user-space
2232 * 2232 *
2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2234 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and 2234 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
2235 * complete the acquisition of the rt_mutex prior to returning to userspace. 2235 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2236 * This ensures the rt_mutex maintains an owner when it has waiters; without 2236 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
2237 * one, the pi logic wouldn't know which task to boost/deboost, if there was a 2237 * without one, the pi logic would not know which task to boost/deboost, if
2238 * need to. 2238 * there was a need to.
2239 * 2239 *
2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2241 * via the following: 2241 * via the following:
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2272 struct futex_q q = futex_q_init; 2272 struct futex_q q = futex_q_init;
2273 int res, ret; 2273 int res, ret;
2274 2274
2275 if (uaddr == uaddr2)
2276 return -EINVAL;
2277
2275 if (!bitset) 2278 if (!bitset)
2276 return -EINVAL; 2279 return -EINVAL;
2277 2280
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2343 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2346 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2344 * the pi_state. 2347 * the pi_state.
2345 */ 2348 */
2346 WARN_ON(!&q.pi_state); 2349 WARN_ON(!q.pi_state);
2347 pi_mutex = &q.pi_state->pi_mutex; 2350 pi_mutex = &q.pi_state->pi_mutex;
2348 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2351 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2349 debug_rt_mutex_free_waiter(&rt_waiter); 2352 debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2370 * fault, unlock the rt_mutex and return the fault to userspace. 2373 * fault, unlock the rt_mutex and return the fault to userspace.
2371 */ 2374 */
2372 if (ret == -EFAULT) { 2375 if (ret == -EFAULT) {
2373 if (rt_mutex_owner(pi_mutex) == current) 2376 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2374 rt_mutex_unlock(pi_mutex); 2377 rt_mutex_unlock(pi_mutex);
2375 } else if (ret == -EINTR) { 2378 } else if (ret == -EINTR) {
2376 /* 2379 /*
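
Two of the futex fixes above are defensive: the WARN_ON(!&q.pi_state) check could never fire, because the address of a struct member is never NULL, and the pi_mutex unlock path gains an explicit NULL check. A tiny stand-alone illustration of the first pitfall:

#include <stdio.h>

struct futex_q_like { void *pi_state; };

int main(void)
{
	struct futex_q_like q = { .pi_state = NULL };

	printf("!&q.pi_state = %d\n", !&q.pi_state);	/* 0: can never warn */
	printf("!q.pi_state  = %d\n", !q.pi_state);	/* 1: the intended check */
	return 0;
}
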
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bdb180325551..131ca176b497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 134{
135 irqreturn_t retval = IRQ_NONE; 135 irqreturn_t retval = IRQ_NONE;
136 unsigned int random = 0, irq = desc->irq_data.irq; 136 unsigned int flags = 0, irq = desc->irq_data.irq;
137 137
138 do { 138 do {
139 irqreturn_t res; 139 irqreturn_t res;
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 161
162 /* Fall through to add to randomness */ 162 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 163 case IRQ_HANDLED:
164 random |= action->flags; 164 flags |= action->flags;
165 break; 165 break;
166 166
167 default: 167 default:
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 172 action = action->next;
173 } while (action); 173 } while (action);
174 174
175 if (random & IRQF_SAMPLE_RANDOM) 175 add_interrupt_randomness(irq, flags);
176 add_interrupt_randomness(irq);
177 176
178 if (!noirqdebug) 177 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 178 note_interrupt(irq, desc, retval);
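
handle_irq_event_percpu() now just accumulates the handlers' action flags and passes them to add_interrupt_randomness(), so a driver no longer needs to tag its interrupt with IRQF_SAMPLE_RANDOM for it to feed the entropy pool. A hypothetical driver's request therefore stays plain; everything prefixed foo_ below is made up.

static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
	/* ...acknowledge and handle the device... */
	return IRQ_HANDLED;
}

static int foo_setup_irq(int irq, void *foo_dev)
{
	/* entropy accounting happens in handle_irq_event_percpu() now,
	 * no IRQF_SAMPLE_RANDOM needed here */
	return request_irq(irq, foo_interrupt, IRQF_SHARED, "foo", foo_dev);
}
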
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 41c1564103f1..49a77727db42 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
10#include <linux/mutex.h> 10#include <linux/mutex.h>
11#include <linux/of.h> 11#include <linux/of.h>
12#include <linux/of_address.h> 12#include <linux/of_address.h>
13#include <linux/topology.h>
13#include <linux/seq_file.h> 14#include <linux/seq_file.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
45{ 46{
46 struct irq_domain *domain; 47 struct irq_domain *domain;
47 48
48 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL,
50 of_node_to_nid(of_node));
49 if (WARN_ON(!domain)) 51 if (WARN_ON(!domain))
50 return NULL; 52 return NULL;
51 53
@@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
138} 140}
139 141
140/** 142/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain
147 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer
149 *
 150 * Allocates a legacy irq_domain if first_irq is positive or a linear
151 * domain otherwise.
152 *
153 * This is intended to implement the expected behaviour for most
154 * interrupt controllers which is that a linear mapping should
155 * normally be used unless the system requires a legacy mapping in
156 * order to support supplying interrupt numbers during non-DT
157 * registration of devices.
158 */
159struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
160 unsigned int size,
161 unsigned int first_irq,
162 const struct irq_domain_ops *ops,
163 void *host_data)
164{
165 if (first_irq > 0)
166 return irq_domain_add_legacy(of_node, size, first_irq, 0,
167 ops, host_data);
168 else
169 return irq_domain_add_linear(of_node, size, ops, host_data);
170}
171
172/**
141 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 173 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
142 * @of_node: pointer to interrupt controller's device tree node. 174 * @of_node: pointer to interrupt controller's device tree node.
143 * @size: total number of irqs in legacy mapping 175 * @size: total number of irqs in legacy mapping
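
irq_domain_add_simple() gives interrupt-controller drivers a single call that picks between a legacy and a linear revmap based on first_irq. A hypothetical platform driver would use it roughly as follows; irq_domain_simple_ops is the generic ops structure exported by irqdomain.c, and everything prefixed foo_ is made up.

static int foo_intc_probe(struct platform_device *pdev)
{
	struct irq_domain *domain;

	/*
	 * first_irq == 0: a linear domain is created; a positive
	 * first_irq would request a legacy mapping instead.
	 */
	domain = irq_domain_add_simple(pdev->dev.of_node, 32, 0,
				       &irq_domain_simple_ops, NULL);
	if (!domain)
		return -ENOMEM;

	platform_set_drvdata(pdev, domain);
	return 0;
}
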
@@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
203 * one can then use irq_create_mapping() to 235 * one can then use irq_create_mapping() to
204 * explicitly change them 236 * explicitly change them
205 */ 237 */
206 ops->map(domain, irq, hwirq); 238 if (ops->map)
239 ops->map(domain, irq, hwirq);
207 240
208 /* Clear norequest flags */ 241 /* Clear norequest flags */
209 irq_clear_status_flags(irq, IRQ_NOREQUEST); 242 irq_clear_status_flags(irq, IRQ_NOREQUEST);
@@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
215EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 248EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
216 249
217/** 250/**
218 * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. 251 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
219 * @of_node: pointer to interrupt controller's device tree node. 252 * @of_node: pointer to interrupt controller's device tree node.
220 * @size: Number of interrupts in the domain. 253 * @size: Number of interrupts in the domain.
221 * @ops: map/unmap domain callbacks 254 * @ops: map/unmap domain callbacks
@@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
229 struct irq_domain *domain; 262 struct irq_domain *domain;
230 unsigned int *revmap; 263 unsigned int *revmap;
231 264
232 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); 265 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
266 of_node_to_nid(of_node));
233 if (WARN_ON(!revmap)) 267 if (WARN_ON(!revmap))
234 return NULL; 268 return NULL;
235 269
@@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain)
330} 364}
331EXPORT_SYMBOL_GPL(irq_set_default_host); 365EXPORT_SYMBOL_GPL(irq_set_default_host);
332 366
333static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, 367static void irq_domain_disassociate_many(struct irq_domain *domain,
334 irq_hw_number_t hwirq) 368 unsigned int irq_base, int count)
335{ 369{
336 struct irq_data *irq_data = irq_get_irq_data(virq); 370 /*
371 * disassociate in reverse order;
372 * not strictly necessary, but nice for unwinding
373 */
374 while (count--) {
375 int irq = irq_base + count;
376 struct irq_data *irq_data = irq_get_irq_data(irq);
377 irq_hw_number_t hwirq = irq_data->hwirq;
378
379 if (WARN_ON(!irq_data || irq_data->domain != domain))
380 continue;
381
382 irq_set_status_flags(irq, IRQ_NOREQUEST);
383
384 /* remove chip and handler */
385 irq_set_chip_and_handler(irq, NULL, NULL);
386
387 /* Make sure it's completed */
388 synchronize_irq(irq);
389
390 /* Tell the PIC about it */
391 if (domain->ops->unmap)
392 domain->ops->unmap(domain, irq);
393 smp_mb();
337 394
338 irq_data->hwirq = hwirq;
339 irq_data->domain = domain;
340 if (domain->ops->map(domain, virq, hwirq)) {
341 pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq);
342 irq_data->domain = NULL; 395 irq_data->domain = NULL;
343 irq_data->hwirq = 0; 396 irq_data->hwirq = 0;
344 return -1; 397
398 /* Clear reverse map */
399 switch(domain->revmap_type) {
400 case IRQ_DOMAIN_MAP_LINEAR:
401 if (hwirq < domain->revmap_data.linear.size)
402 domain->revmap_data.linear.revmap[hwirq] = 0;
403 break;
404 case IRQ_DOMAIN_MAP_TREE:
405 mutex_lock(&revmap_trees_mutex);
406 radix_tree_delete(&domain->revmap_data.tree, hwirq);
407 mutex_unlock(&revmap_trees_mutex);
408 break;
409 }
345 } 410 }
411}
412
413int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
414 irq_hw_number_t hwirq_base, int count)
415{
416 unsigned int virq = irq_base;
417 irq_hw_number_t hwirq = hwirq_base;
418 int i, ret;
419
420 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
421 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
422
423 for (i = 0; i < count; i++) {
424 struct irq_data *irq_data = irq_get_irq_data(virq + i);
425
426 if (WARN(!irq_data, "error: irq_desc not allocated; "
427 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
428 return -EINVAL;
429 if (WARN(irq_data->domain, "error: irq_desc already associated; "
430 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
431 return -EINVAL;
432 };
433
434 for (i = 0; i < count; i++, virq++, hwirq++) {
435 struct irq_data *irq_data = irq_get_irq_data(virq);
436
437 irq_data->hwirq = hwirq;
438 irq_data->domain = domain;
439 if (domain->ops->map) {
440 ret = domain->ops->map(domain, virq, hwirq);
441 if (ret != 0) {
442 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
443 virq, hwirq, ret);
444 WARN_ON(1);
445 irq_data->domain = NULL;
446 irq_data->hwirq = 0;
447 goto err_unmap;
448 }
449 }
346 450
347 irq_clear_status_flags(virq, IRQ_NOREQUEST); 451 switch (domain->revmap_type) {
452 case IRQ_DOMAIN_MAP_LINEAR:
453 if (hwirq < domain->revmap_data.linear.size)
454 domain->revmap_data.linear.revmap[hwirq] = virq;
455 break;
456 case IRQ_DOMAIN_MAP_TREE:
457 mutex_lock(&revmap_trees_mutex);
458 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
459 mutex_unlock(&revmap_trees_mutex);
460 break;
461 }
462
463 irq_clear_status_flags(virq, IRQ_NOREQUEST);
464 }
348 465
349 return 0; 466 return 0;
467
468 err_unmap:
469 irq_domain_disassociate_many(domain, irq_base, i);
470 return -EINVAL;
350} 471}
472EXPORT_SYMBOL_GPL(irq_domain_associate_many);
351 473
352/** 474/**
353 * irq_create_direct_mapping() - Allocate an irq for direct mapping 475 * irq_create_direct_mapping() - Allocate an irq for direct mapping
@@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
364 if (domain == NULL) 486 if (domain == NULL)
365 domain = irq_default_domain; 487 domain = irq_default_domain;
366 488
367 BUG_ON(domain == NULL); 489 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
368 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); 490 return 0;
369 491
370 virq = irq_alloc_desc_from(1, 0); 492 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
371 if (!virq) { 493 if (!virq) {
372 pr_debug("create_direct virq allocation failed\n"); 494 pr_debug("create_direct virq allocation failed\n");
373 return 0; 495 return 0;
@@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
380 } 502 }
381 pr_debug("create_direct obtained virq %d\n", virq); 503 pr_debug("create_direct obtained virq %d\n", virq);
382 504
383 if (irq_setup_virq(domain, virq, virq)) { 505 if (irq_domain_associate(domain, virq, virq)) {
384 irq_free_desc(virq); 506 irq_free_desc(virq);
385 return 0; 507 return 0;
386 } 508 }
@@ -433,27 +555,64 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
433 hint = hwirq % nr_irqs; 555 hint = hwirq % nr_irqs;
434 if (hint == 0) 556 if (hint == 0)
435 hint++; 557 hint++;
436 virq = irq_alloc_desc_from(hint, 0); 558 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
437 if (virq <= 0) 559 if (virq <= 0)
438 virq = irq_alloc_desc_from(1, 0); 560 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
439 if (virq <= 0) { 561 if (virq <= 0) {
440 pr_debug("-> virq allocation failed\n"); 562 pr_debug("-> virq allocation failed\n");
441 return 0; 563 return 0;
442 } 564 }
443 565
444 if (irq_setup_virq(domain, virq, hwirq)) { 566 if (irq_domain_associate(domain, virq, hwirq)) {
445 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) 567 irq_free_desc(virq);
446 irq_free_desc(virq);
447 return 0; 568 return 0;
448 } 569 }
449 570
450 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", 571 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
451 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); 572 hwirq, of_node_full_name(domain->of_node), virq);
452 573
453 return virq; 574 return virq;
454} 575}
455EXPORT_SYMBOL_GPL(irq_create_mapping); 576EXPORT_SYMBOL_GPL(irq_create_mapping);
456 577
578/**
579 * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
580 * @domain: domain owning the interrupt range
581 * @irq_base: beginning of linux IRQ range
582 * @hwirq_base: beginning of hardware IRQ range
583 * @count: Number of interrupts to map
584 *
585 * This routine is used for allocating and mapping a range of hardware
586 * irqs to linux irqs where the linux irq numbers are at pre-defined
587 * locations. For use by controllers that already have static mappings
 588 * to insert into the domain.
589 *
590 * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
591 * domain insertion.
592 *
593 * 0 is returned upon success, while any failure to establish a static
594 * mapping is treated as an error.
595 */
596int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
597 irq_hw_number_t hwirq_base, int count)
598{
599 int ret;
600
601 ret = irq_alloc_descs(irq_base, irq_base, count,
602 of_node_to_nid(domain->of_node));
603 if (unlikely(ret < 0))
604 return ret;
605
606 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count);
607 if (unlikely(ret < 0)) {
608 irq_free_descs(irq_base, count);
609 return ret;
610 }
611
612 return 0;
613}
614EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
615
457unsigned int irq_create_of_mapping(struct device_node *controller, 616unsigned int irq_create_of_mapping(struct device_node *controller,
458 const u32 *intspec, unsigned int intsize) 617 const u32 *intspec, unsigned int intsize)
459{ 618{
@@ -477,7 +636,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
477 return intspec[0]; 636 return intspec[0];
478#endif 637#endif
479 pr_warning("no irq domain found for %s !\n", 638 pr_warning("no irq domain found for %s !\n",
480 controller->full_name); 639 of_node_full_name(controller));
481 return 0; 640 return 0;
482 } 641 }
483 642
@@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq)
511{ 670{
512 struct irq_data *irq_data = irq_get_irq_data(virq); 671 struct irq_data *irq_data = irq_get_irq_data(virq);
513 struct irq_domain *domain; 672 struct irq_domain *domain;
514 irq_hw_number_t hwirq;
515 673
516 if (!virq || !irq_data) 674 if (!virq || !irq_data)
517 return; 675 return;
@@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq)
524 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 682 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
525 return; 683 return;
526 684
527 irq_set_status_flags(virq, IRQ_NOREQUEST); 685 irq_domain_disassociate_many(domain, virq, 1);
528
529 /* remove chip and handler */
530 irq_set_chip_and_handler(virq, NULL, NULL);
531
532 /* Make sure it's completed */
533 synchronize_irq(virq);
534
535 /* Tell the PIC about it */
536 if (domain->ops->unmap)
537 domain->ops->unmap(domain, virq);
538 smp_mb();
539
540 /* Clear reverse map */
541 hwirq = irq_data->hwirq;
542 switch(domain->revmap_type) {
543 case IRQ_DOMAIN_MAP_LINEAR:
544 if (hwirq < domain->revmap_data.linear.size)
545 domain->revmap_data.linear.revmap[hwirq] = 0;
546 break;
547 case IRQ_DOMAIN_MAP_TREE:
548 mutex_lock(&revmap_trees_mutex);
549 radix_tree_delete(&domain->revmap_data.tree, hwirq);
550 mutex_unlock(&revmap_trees_mutex);
551 break;
552 }
553
554 irq_free_desc(virq); 686 irq_free_desc(virq);
555} 687}
556EXPORT_SYMBOL_GPL(irq_dispose_mapping); 688EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping);
559 * irq_find_mapping() - Find a linux irq from an hw irq number. 691 * irq_find_mapping() - Find a linux irq from an hw irq number.
560 * @domain: domain owning this hardware interrupt 692 * @domain: domain owning this hardware interrupt
561 * @hwirq: hardware irq number in that domain space 693 * @hwirq: hardware irq number in that domain space
562 *
563 * This is a slow path, for use by generic code. It's expected that an
564 * irq controller implementation directly calls the appropriate low level
565 * mapping function.
566 */ 694 */
567unsigned int irq_find_mapping(struct irq_domain *domain, 695unsigned int irq_find_mapping(struct irq_domain *domain,
568 irq_hw_number_t hwirq) 696 irq_hw_number_t hwirq)
569{ 697{
570 unsigned int i; 698 struct irq_data *data;
571 unsigned int hint = hwirq % nr_irqs;
572 699
 573 /* Look for default domain if necessary */ 700
574 if (domain == NULL) 701 if (domain == NULL)
@@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
576 if (domain == NULL) 703 if (domain == NULL)
577 return 0; 704 return 0;
578 705
579 /* legacy -> bail early */ 706 switch (domain->revmap_type) {
580 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 707 case IRQ_DOMAIN_MAP_LEGACY:
581 return irq_domain_legacy_revmap(domain, hwirq); 708 return irq_domain_legacy_revmap(domain, hwirq);
582 709 case IRQ_DOMAIN_MAP_LINEAR:
583 /* Slow path does a linear search of the map */ 710 return irq_linear_revmap(domain, hwirq);
584 if (hint == 0) 711 case IRQ_DOMAIN_MAP_TREE:
585 hint = 1; 712 rcu_read_lock();
586 i = hint; 713 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
587 do { 714 rcu_read_unlock();
588 struct irq_data *data = irq_get_irq_data(i); 715 if (data)
716 return data->irq;
717 break;
718 case IRQ_DOMAIN_MAP_NOMAP:
719 data = irq_get_irq_data(hwirq);
589 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 720 if (data && (data->domain == domain) && (data->hwirq == hwirq))
590 return i; 721 return hwirq;
591 i++; 722 break;
592 if (i >= nr_irqs) 723 }
593 i = 1; 724
594 } while(i != hint);
595 return 0; 725 return 0;
596} 726}
597EXPORT_SYMBOL_GPL(irq_find_mapping); 727EXPORT_SYMBOL_GPL(irq_find_mapping);
598 728
599/** 729/**
600 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
601 * @domain: domain owning this hardware interrupt
602 * @hwirq: hardware irq number in that domain space
603 *
604 * This is a fast path, for use by irq controller code that uses radix tree
605 * revmaps
606 */
607unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
608 irq_hw_number_t hwirq)
609{
610 struct irq_data *irq_data;
611
612 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
613 return irq_find_mapping(domain, hwirq);
614
615 /*
616 * Freeing an irq can delete nodes along the path to
617 * do the lookup via call_rcu.
618 */
619 rcu_read_lock();
620 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
621 rcu_read_unlock();
622
623 /*
624 * If found in radix tree, then fine.
625 * Else fallback to linear lookup - this should not happen in practice
626 * as it means that we failed to insert the node in the radix tree.
627 */
628 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
629}
630EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup);
631
632/**
633 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
634 * @domain: domain owning this hardware interrupt
635 * @virq: linux irq number
636 * @hwirq: hardware irq number in that domain space
637 *
638 * This is for use by irq controllers that use a radix tree reverse
639 * mapping for fast lookup.
640 */
641void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
642 irq_hw_number_t hwirq)
643{
644 struct irq_data *irq_data = irq_get_irq_data(virq);
645
646 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
647 return;
648
649 if (virq) {
650 mutex_lock(&revmap_trees_mutex);
651 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
652 mutex_unlock(&revmap_trees_mutex);
653 }
654}
655EXPORT_SYMBOL_GPL(irq_radix_revmap_insert);
656
657/**
658 * irq_linear_revmap() - Find a linux irq from a hw irq number. 730 * irq_linear_revmap() - Find a linux irq from a hw irq number.
659 * @domain: domain owning this hardware interrupt 731 * @domain: domain owning this hardware interrupt
660 * @hwirq: hardware irq number in that domain space 732 * @hwirq: hardware irq number in that domain space
661 * 733 *
662 * This is a fast path, for use by irq controller code that uses linear 734 * This is a fast path that can be called directly by irq controller code to
663 * revmaps. It does fallback to the slow path if the revmap doesn't exist 735 * save a handful of instructions.
664 * yet and will create the revmap entry with appropriate locking
665 */ 736 */
666unsigned int irq_linear_revmap(struct irq_domain *domain, 737unsigned int irq_linear_revmap(struct irq_domain *domain,
667 irq_hw_number_t hwirq) 738 irq_hw_number_t hwirq)
668{ 739{
669 unsigned int *revmap; 740 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
670
671 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
672 return irq_find_mapping(domain, hwirq);
673 741
674 /* Check revmap bounds */ 742 /* Check revmap bounds; complain if exceeded */
675 if (unlikely(hwirq >= domain->revmap_data.linear.size)) 743 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
676 return irq_find_mapping(domain, hwirq); 744 return 0;
677
678 /* Check if revmap was allocated */
679 revmap = domain->revmap_data.linear.revmap;
680 if (unlikely(revmap == NULL))
681 return irq_find_mapping(domain, hwirq);
682
683 /* Fill up revmap with slow path if no mapping found */
684 if (unlikely(!revmap[hwirq]))
685 revmap[hwirq] = irq_find_mapping(domain, hwirq);
686 745
687 return revmap[hwirq]; 746 return domain->revmap_data.linear.revmap[hwirq];
688} 747}
689EXPORT_SYMBOL_GPL(irq_linear_revmap); 748EXPORT_SYMBOL_GPL(irq_linear_revmap);
690 749
@@ -725,8 +784,8 @@ static int virq_debug_show(struct seq_file *m, void *private)
725 data = irq_desc_get_chip_data(desc); 784 data = irq_desc_get_chip_data(desc);
726 seq_printf(m, data ? "0x%p " : " %p ", data); 785 seq_printf(m, data ? "0x%p " : " %p ", data);
727 786
728 if (desc->irq_data.domain && desc->irq_data.domain->of_node) 787 if (desc->irq_data.domain)
729 p = desc->irq_data.domain->of_node->full_name; 788 p = of_node_full_name(desc->irq_data.domain->of_node);
730 else 789 else
731 p = none; 790 p = none;
732 seq_printf(m, "%s\n", p); 791 seq_printf(m, "%s\n", p);
@@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void)
761__initcall(irq_debugfs_init); 820__initcall(irq_debugfs_init);
762#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ 821#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
763 822
764static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
765 irq_hw_number_t hwirq)
766{
767 return 0;
768}
769
770/** 823/**
771 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings 824 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
772 * 825 *
@@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
829EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); 882EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
830 883
831const struct irq_domain_ops irq_domain_simple_ops = { 884const struct irq_domain_ops irq_domain_simple_ops = {
832 .map = irq_domain_simple_map,
833 .xlate = irq_domain_xlate_onetwocell, 885 .xlate = irq_domain_xlate_onetwocell,
834}; 886};
835EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 887EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
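
The irqdomain.c hunks above fold the old irq_setup_virq() into irq_domain_associate_many() (with unwinding via irq_domain_disassociate_many()), make irq_find_mapping() dispatch on revmap_type, and reduce irq_linear_revmap() to a bounds-checked array lookup with no slow-path fallback. A rough sketch of how a controller driver might use the linear revmap API after this change; the foo_* names and the 32-line controller are hypothetical, not part of the patch:

#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define FOO_NR_IRQS 32                      /* hypothetical controller */

static struct irq_domain *foo_domain;

/* Called for each hwirq when a mapping is created; an error here is now
 * propagated by irq_domain_associate_many() and unwound cleanly. */
static int foo_irq_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
    irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
    return 0;
}

static const struct irq_domain_ops foo_irq_ops = {
    .map   = foo_irq_map,
    .xlate = irq_domain_xlate_onecell,
};

static int foo_init_irq(struct device_node *np)
{
    unsigned int virq;

    foo_domain = irq_domain_add_linear(np, FOO_NR_IRQS, &foo_irq_ops, NULL);
    if (!foo_domain)
        return -ENOMEM;

    /* Lazily map hwirq 5 ... */
    virq = irq_create_mapping(foo_domain, 5);
    if (!virq)
        return -ENXIO;

    /* ... after which the fast path is a plain array lookup. */
    WARN_ON(irq_linear_revmap(foo_domain, 5) != virq);
    return 0;
}

The irq_create_strict_mappings() helper added above covers the complementary case where the linux irq numbers are fixed in advance.
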
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8c548232ba39..4c69326aa773 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc)
781 wake_up(&desc->wait_for_threads); 781 wake_up(&desc->wait_for_threads);
782} 782}
783 783
784static void irq_thread_dtor(struct task_work *unused) 784static void irq_thread_dtor(struct callback_head *unused)
785{ 785{
786 struct task_struct *tsk = current; 786 struct task_struct *tsk = current;
787 struct irq_desc *desc; 787 struct irq_desc *desc;
@@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused)
813 */ 813 */
814static int irq_thread(void *data) 814static int irq_thread(void *data)
815{ 815{
816 struct task_work on_exit_work; 816 struct callback_head on_exit_work;
817 static const struct sched_param param = { 817 static const struct sched_param param = {
818 .sched_priority = MAX_USER_RT_PRIO/2, 818 .sched_priority = MAX_USER_RT_PRIO/2,
819 }; 819 };
@@ -830,7 +830,7 @@ static int irq_thread(void *data)
830 830
831 sched_setscheduler(current, SCHED_FIFO, &param); 831 sched_setscheduler(current, SCHED_FIFO, &param);
832 832
833 init_task_work(&on_exit_work, irq_thread_dtor, NULL); 833 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 834 task_work_add(current, &on_exit_work, false);
835 835
836 while (!irq_wait_for_interrupt(action)) { 836 while (!irq_wait_for_interrupt(action)) {
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
893 return -ENOSYS; 893 return -ENOSYS;
894 if (!try_module_get(desc->owner)) 894 if (!try_module_get(desc->owner))
895 return -ENODEV; 895 return -ENODEV;
896 /*
897 * Some drivers like serial.c use request_irq() heavily,
898 * so we have to be careful not to interfere with a
899 * running system.
900 */
901 if (new->flags & IRQF_SAMPLE_RANDOM) {
902 /*
903 * This function might sleep, we want to call it first,
904 * outside of the atomic block.
905 * Yes, this might clear the entropy pool if the wrong
906 * driver is attempted to be loaded, without actually
907 * installing a new handler, but is this really a problem,
908 * only the sysadmin is able to do this.
909 */
910 rand_initialize_irq(irq);
911 }
912 896
913 /* 897 /*
914 * Check whether the interrupt nests into another interrupt 898 * Check whether the interrupt nests into another interrupt
@@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
960 } 944 }
961 945
962 /* 946 /*
947 * Drivers are often written to work w/o knowledge about the
948 * underlying irq chip implementation, so a request for a
949 * threaded irq without a primary hard irq context handler
950 * requires the ONESHOT flag to be set. Some irq chips like
951 * MSI based interrupts are per se one shot safe. Check the
952 * chip flags, so we can avoid the unmask dance at the end of
953 * the threaded handler for those.
954 */
955 if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
956 new->flags &= ~IRQF_ONESHOT;
957
958 /*
963 * The following block of code has to be executed atomically 959 * The following block of code has to be executed atomically
964 */ 960 */
965 raw_spin_lock_irqsave(&desc->lock, flags); 961 raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1033 */ 1029 */
1034 new->thread_mask = 1 << ffz(thread_mask); 1030 new->thread_mask = 1 << ffz(thread_mask);
1035 1031
1036 } else if (new->handler == irq_default_primary_handler) { 1032 } else if (new->handler == irq_default_primary_handler &&
1033 !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
1037 /* 1034 /*
1038 * The interrupt was requested with handler = NULL, so 1035 * The interrupt was requested with handler = NULL, so
1039 * we use the default primary handler for it. But it 1036 * we use the default primary handler for it. But it
@@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq);
1354 * Flags: 1351 * Flags:
1355 * 1352 *
1356 * IRQF_SHARED Interrupt is shared 1353 * IRQF_SHARED Interrupt is shared
1357 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1358 * IRQF_TRIGGER_* Specify active edge(s) or level 1354 * IRQF_TRIGGER_* Specify active edge(s) or level
1359 * 1355 *
1360 */ 1356 */
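
The __setup_irq() changes above tie the IRQF_ONESHOT requirement to the new IRQCHIP_ONESHOT_SAFE chip flag: a threaded request without a primary handler still needs ONESHOT, but on one-shot-safe chips (e.g. MSI) the flag is stripped again so the unmask dance at the end of the threaded handler is skipped. A sketch of the request pattern being policed here; the foo device is hypothetical:

#include <linux/interrupt.h>

struct foo_dev {
    int irq;                        /* hypothetical slow-bus device */
};

static irqreturn_t foo_irq_thread(int irq, void *data)
{
    /* sleeping work (bus transfers etc.) runs in thread context */
    return IRQ_HANDLED;
}

static int foo_request_irq(struct foo_dev *foo)
{
    /*
     * No primary handler, so the core installs its default one and
     * IRQF_ONESHOT is mandatory; on IRQCHIP_ONESHOT_SAFE chips the
     * core now clears the flag again behind our back.
     */
    return request_threaded_irq(foo->irq, NULL, foo_irq_thread,
                                IRQF_ONESHOT | IRQF_TRIGGER_LOW,
                                "foo", foo);
}
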
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6aeb..0668d58d6413 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
1424 1424
1425void crash_save_vmcoreinfo(void) 1425void crash_save_vmcoreinfo(void)
1426{ 1426{
1427 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1427 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1428 update_vmcoreinfo_note(); 1428 update_vmcoreinfo_note();
1429} 1429}
1430 1430
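
The kexec.c one-liner adds the newline that CRASHTIME= was missing: the VMCOREINFO note is a sequence of newline-terminated key=value pairs, so a key without its terminator runs into whatever is appended next. A small userspace illustration of that line-oriented format; the sample values are made up:

#include <stdio.h>
#include <string.h>

int main(void)
{
    /* with the fix applied, every key is newline-terminated */
    char note[] = "OSRELEASE=3.6.0\nCRASHTIME=1349810405\nPAGESIZE=4096\n";
    char *line;

    for (line = strtok(note, "\n"); line; line = strtok(NULL, "\n"))
        if (!strncmp(line, "CRASHTIME=", 10))
            printf("crash time: %s\n", line + 10);
    return 0;
}
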
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d77..6f99aead66c6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
45 45
46static struct workqueue_struct *khelper_wq; 46static struct workqueue_struct *khelper_wq;
47 47
48/*
49 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
50 * locking to protect this global - it is private to the singleton khelper
51 * thread and should only ever be modified by that thread.
52 */
53static const struct task_struct *kmod_thread_locker;
54
48#define CAP_BSET (void *)1 55#define CAP_BSET (void *)1
49#define CAP_PI (void *)2 56#define CAP_PI (void *)2
50 57
@@ -221,6 +228,13 @@ fail:
221 return 0; 228 return 0;
222} 229}
223 230
231static int call_helper(void *data)
232{
233 /* Worker thread started blocking khelper thread. */
234 kmod_thread_locker = current;
235 return ____call_usermodehelper(data);
236}
237
224static void call_usermodehelper_freeinfo(struct subprocess_info *info) 238static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 239{
226 if (info->cleanup) 240 if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
295 if (wait == UMH_WAIT_PROC) 309 if (wait == UMH_WAIT_PROC)
296 pid = kernel_thread(wait_for_helper, sub_info, 310 pid = kernel_thread(wait_for_helper, sub_info,
297 CLONE_FS | CLONE_FILES | SIGCHLD); 311 CLONE_FS | CLONE_FILES | SIGCHLD);
298 else 312 else {
299 pid = kernel_thread(____call_usermodehelper, sub_info, 313 pid = kernel_thread(call_helper, sub_info,
300 CLONE_VFORK | SIGCHLD); 314 CLONE_VFORK | SIGCHLD);
315 /* Worker thread stopped blocking khelper thread. */
316 kmod_thread_locker = NULL;
317 }
301 318
302 switch (wait) { 319 switch (wait) {
303 case UMH_NO_WAIT: 320 case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
548 retval = -EBUSY; 565 retval = -EBUSY;
549 goto out; 566 goto out;
550 } 567 }
568 /*
 569 * A worker thread must not wait for khelper in the
 570 * wait_for_completion() below if it was created with the CLONE_VFORK
 571 * flag, because khelper is already waiting for that thread in
 572 * wait_for_completion() in do_fork().
573 */
574 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
575 retval = -EBUSY;
576 goto out;
577 }
551 578
552 sub_info->complete = &done; 579 sub_info->complete = &done;
553 sub_info->wait = wait; 580 sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
577 return retval; 604 return retval;
578} 605}
579 606
607/*
608 * call_usermodehelper_fns() will not run the caller-provided cleanup function
609 * if a memory allocation failure is experienced. So the caller might need to
610 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
 611 * the necessary cleanup within the caller.
612 */
580int call_usermodehelper_fns( 613int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait, 614 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new), 615 int (*init)(struct subprocess_info *info, struct cred *new),
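
Two kmod.c changes above: khelper now refuses (-EBUSY) a synchronous usermode-helper request issued from the CLONE_VFORK'd helper thread it is itself waiting on, and the new comment warns that call_usermodehelper_fns() does not run the caller's cleanup function when the initial allocation fails with -ENOMEM. A sketch of a caller honouring that rule; the foo_* helper and its context structure are invented for illustration:

#include <linux/kmod.h>
#include <linux/slab.h>

struct foo_ctx {
    char *arg;
};

static void foo_cleanup(struct subprocess_info *info)
{
    struct foo_ctx *ctx = info->data;

    kfree(ctx->arg);
    kfree(ctx);
}

static int foo_run_helper(char *path, char **argv, char **envp,
                          struct foo_ctx *ctx)
{
    int ret;

    ret = call_usermodehelper_fns(path, argv, envp, UMH_WAIT_PROC,
                                  NULL, foo_cleanup, ctx);
    if (ret == -ENOMEM) {
        /* no subprocess_info was allocated, so foo_cleanup() never ran */
        kfree(ctx->arg);
        kfree(ctx);
    }
    return ret;
}
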
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
360 struct kthread_work, node); 360 struct kthread_work, node);
361 list_del_init(&work->node); 361 list_del_init(&work->node);
362 } 362 }
363 worker->current_work = work;
363 spin_unlock_irq(&worker->lock); 364 spin_unlock_irq(&worker->lock);
364 365
365 if (work) { 366 if (work) {
366 __set_current_state(TASK_RUNNING); 367 __set_current_state(TASK_RUNNING);
367 work->func(work); 368 work->func(work);
368 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
369 work->done_seq = work->queue_seq;
370 smp_mb(); /* mb worker-b1 paired with flush-b0 */
371 if (atomic_read(&work->flushing))
372 wake_up_all(&work->done);
373 } else if (!freezing(current)) 369 } else if (!freezing(current))
374 schedule(); 370 schedule();
375 371
@@ -378,6 +374,19 @@ repeat:
378} 374}
379EXPORT_SYMBOL_GPL(kthread_worker_fn); 375EXPORT_SYMBOL_GPL(kthread_worker_fn);
380 376
377/* insert @work before @pos in @worker */
378static void insert_kthread_work(struct kthread_worker *worker,
379 struct kthread_work *work,
380 struct list_head *pos)
381{
382 lockdep_assert_held(&worker->lock);
383
384 list_add_tail(&work->node, pos);
385 work->worker = worker;
386 if (likely(worker->task))
387 wake_up_process(worker->task);
388}
389
381/** 390/**
382 * queue_kthread_work - queue a kthread_work 391 * queue_kthread_work - queue a kthread_work
383 * @worker: target kthread_worker 392 * @worker: target kthread_worker
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
395 404
396 spin_lock_irqsave(&worker->lock, flags); 405 spin_lock_irqsave(&worker->lock, flags);
397 if (list_empty(&work->node)) { 406 if (list_empty(&work->node)) {
398 list_add_tail(&work->node, &worker->work_list); 407 insert_kthread_work(worker, work, &worker->work_list);
399 work->queue_seq++;
400 if (likely(worker->task))
401 wake_up_process(worker->task);
402 ret = true; 408 ret = true;
403 } 409 }
404 spin_unlock_irqrestore(&worker->lock, flags); 410 spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
406} 412}
407EXPORT_SYMBOL_GPL(queue_kthread_work); 413EXPORT_SYMBOL_GPL(queue_kthread_work);
408 414
415struct kthread_flush_work {
416 struct kthread_work work;
417 struct completion done;
418};
419
420static void kthread_flush_work_fn(struct kthread_work *work)
421{
422 struct kthread_flush_work *fwork =
423 container_of(work, struct kthread_flush_work, work);
424 complete(&fwork->done);
425}
426
409/** 427/**
410 * flush_kthread_work - flush a kthread_work 428 * flush_kthread_work - flush a kthread_work
411 * @work: work to flush 429 * @work: work to flush
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
414 */ 432 */
415void flush_kthread_work(struct kthread_work *work) 433void flush_kthread_work(struct kthread_work *work)
416{ 434{
417 int seq = work->queue_seq; 435 struct kthread_flush_work fwork = {
418 436 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
419 atomic_inc(&work->flushing); 437 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
438 };
439 struct kthread_worker *worker;
440 bool noop = false;
420 441
421 /* 442retry:
422 * mb flush-b0 paired with worker-b1, to make sure either 443 worker = work->worker;
423 * worker sees the above increment or we see done_seq update. 444 if (!worker)
424 */ 445 return;
425 smp_mb__after_atomic_inc();
426 446
427 /* A - B <= 0 tests whether B is in front of A regardless of overflow */ 447 spin_lock_irq(&worker->lock);
428 wait_event(work->done, seq - work->done_seq <= 0); 448 if (work->worker != worker) {
429 atomic_dec(&work->flushing); 449 spin_unlock_irq(&worker->lock);
450 goto retry;
451 }
430 452
431 /* 453 if (!list_empty(&work->node))
432 * rmb flush-b1 paired with worker-b0, to make sure our caller 454 insert_kthread_work(worker, &fwork.work, work->node.next);
433 * sees every change made by work->func(). 455 else if (worker->current_work == work)
434 */ 456 insert_kthread_work(worker, &fwork.work, worker->work_list.next);
435 smp_mb__after_atomic_dec(); 457 else
436} 458 noop = true;
437EXPORT_SYMBOL_GPL(flush_kthread_work);
438 459
439struct kthread_flush_work { 460 spin_unlock_irq(&worker->lock);
440 struct kthread_work work;
441 struct completion done;
442};
443 461
444static void kthread_flush_work_fn(struct kthread_work *work) 462 if (!noop)
445{ 463 wait_for_completion(&fwork.done);
446 struct kthread_flush_work *fwork =
447 container_of(work, struct kthread_flush_work, work);
448 complete(&fwork->done);
449} 464}
465EXPORT_SYMBOL_GPL(flush_kthread_work);
450 466
451/** 467/**
452 * flush_kthread_worker - flush all current works on a kthread_worker 468 * flush_kthread_worker - flush all current works on a kthread_worker
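
flush_kthread_work() above is rebuilt around insert_kthread_work(): instead of the old queue_seq/done_seq counters, a completion-carrying flush work is queued right behind the work being flushed (or behind the currently running work) and the caller simply waits for it. From the user's side the API is unchanged; a minimal sketch with hypothetical names:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

static struct kthread_worker foo_worker;
static struct task_struct *foo_task;
static struct kthread_work foo_work;

static void foo_work_fn(struct kthread_work *work)
{
    pr_info("foo: deferred work ran\n");
}

static int foo_start(void)
{
    init_kthread_worker(&foo_worker);
    foo_task = kthread_run(kthread_worker_fn, &foo_worker, "foo_worker");
    if (IS_ERR(foo_task))
        return PTR_ERR(foo_task);

    init_kthread_work(&foo_work, foo_work_fn);
    queue_kthread_work(&foo_worker, &foo_work);

    /* returns once foo_work_fn() has finished, even if it was running */
    flush_kthread_work(&foo_work);
    return 0;
}

static void foo_stop(void)
{
    flush_kthread_worker(&foo_worker);
    kthread_stop(foo_task);
}
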
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6dd..e1b2822fff97 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
75 int state = 0; 75 int state = 0;
76 76
77 /* 77 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
78 * It's possible to come here directly from a panic-assertion and 86 * It's possible to come here directly from a panic-assertion and
79 * not have preempt disabled. Some functions called from here want 87 * not have preempt disabled. Some functions called from here want
80 * preempt to be disabled. No point enabling it later though... 88 * preempt to be disabled. No point enabling it later though...
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8f9b4eb974e0..a70518c9d82f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -175,7 +175,7 @@ config PM_TEST_SUSPEND
175 You probably want to have your system's RTC driver statically 175 You probably want to have your system's RTC driver statically
176 linked, ensuring that it's available when this test runs. 176 linked, ensuring that it's available when this test runs.
177 177
178config CAN_PM_TRACE 178config PM_SLEEP_DEBUG
179 def_bool y 179 def_bool y
180 depends on PM_DEBUG && PM_SLEEP 180 depends on PM_DEBUG && PM_SLEEP
181 181
@@ -196,7 +196,7 @@ config PM_TRACE
196 196
197config PM_TRACE_RTC 197config PM_TRACE_RTC
198 bool "Suspend/resume event tracing" 198 bool "Suspend/resume event tracing"
199 depends on CAN_PM_TRACE 199 depends on PM_SLEEP_DEBUG
200 depends on X86 200 depends on X86
201 select PM_TRACE 201 select PM_TRACE
202 ---help--- 202 ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..b26f5f1e773e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
8 * 9 *
9 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
10 */ 11 */
@@ -27,7 +28,6 @@
27#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
29#include <linux/genhd.h> 30#include <linux/genhd.h>
30#include <scsi/scsi_scan.h>
31 31
32#include "power.h" 32#include "power.h"
33 33
@@ -46,6 +46,9 @@ enum {
46 HIBERNATION_PLATFORM, 46 HIBERNATION_PLATFORM,
47 HIBERNATION_SHUTDOWN, 47 HIBERNATION_SHUTDOWN,
48 HIBERNATION_REBOOT, 48 HIBERNATION_REBOOT,
49#ifdef CONFIG_SUSPEND
50 HIBERNATION_SUSPEND,
51#endif
49 /* keep last */ 52 /* keep last */
50 __HIBERNATION_AFTER_LAST 53 __HIBERNATION_AFTER_LAST
51}; 54};
@@ -354,6 +357,7 @@ int hibernation_snapshot(int platform_mode)
354 } 357 }
355 358
356 suspend_console(); 359 suspend_console();
360 ftrace_stop();
357 pm_restrict_gfp_mask(); 361 pm_restrict_gfp_mask();
358 362
359 error = dpm_suspend(PMSG_FREEZE); 363 error = dpm_suspend(PMSG_FREEZE);
@@ -379,6 +383,7 @@ int hibernation_snapshot(int platform_mode)
379 if (error || !in_suspend) 383 if (error || !in_suspend)
380 pm_restore_gfp_mask(); 384 pm_restore_gfp_mask();
381 385
386 ftrace_start();
382 resume_console(); 387 resume_console();
383 dpm_complete(msg); 388 dpm_complete(msg);
384 389
@@ -481,6 +486,7 @@ int hibernation_restore(int platform_mode)
481 486
482 pm_prepare_console(); 487 pm_prepare_console();
483 suspend_console(); 488 suspend_console();
489 ftrace_stop();
484 pm_restrict_gfp_mask(); 490 pm_restrict_gfp_mask();
485 error = dpm_suspend_start(PMSG_QUIESCE); 491 error = dpm_suspend_start(PMSG_QUIESCE);
486 if (!error) { 492 if (!error) {
@@ -488,6 +494,7 @@ int hibernation_restore(int platform_mode)
488 dpm_resume_end(PMSG_RECOVER); 494 dpm_resume_end(PMSG_RECOVER);
489 } 495 }
490 pm_restore_gfp_mask(); 496 pm_restore_gfp_mask();
497 ftrace_start();
491 resume_console(); 498 resume_console();
492 pm_restore_console(); 499 pm_restore_console();
493 return error; 500 return error;
@@ -514,6 +521,7 @@ int hibernation_platform_enter(void)
514 521
515 entering_platform_hibernation = true; 522 entering_platform_hibernation = true;
516 suspend_console(); 523 suspend_console();
524 ftrace_stop();
517 error = dpm_suspend_start(PMSG_HIBERNATE); 525 error = dpm_suspend_start(PMSG_HIBERNATE);
518 if (error) { 526 if (error) {
519 if (hibernation_ops->recover) 527 if (hibernation_ops->recover)
@@ -557,6 +565,7 @@ int hibernation_platform_enter(void)
557 Resume_devices: 565 Resume_devices:
558 entering_platform_hibernation = false; 566 entering_platform_hibernation = false;
559 dpm_resume_end(PMSG_RESTORE); 567 dpm_resume_end(PMSG_RESTORE);
568 ftrace_start();
560 resume_console(); 569 resume_console();
561 570
562 Close: 571 Close:
@@ -574,6 +583,10 @@ int hibernation_platform_enter(void)
574 */ 583 */
575static void power_down(void) 584static void power_down(void)
576{ 585{
586#ifdef CONFIG_SUSPEND
587 int error;
588#endif
589
577 switch (hibernation_mode) { 590 switch (hibernation_mode) {
578 case HIBERNATION_REBOOT: 591 case HIBERNATION_REBOOT:
579 kernel_restart(NULL); 592 kernel_restart(NULL);
@@ -583,6 +596,25 @@ static void power_down(void)
583 case HIBERNATION_SHUTDOWN: 596 case HIBERNATION_SHUTDOWN:
584 kernel_power_off(); 597 kernel_power_off();
585 break; 598 break;
599#ifdef CONFIG_SUSPEND
600 case HIBERNATION_SUSPEND:
601 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
602 if (error) {
603 if (hibernation_ops)
604 hibernation_mode = HIBERNATION_PLATFORM;
605 else
606 hibernation_mode = HIBERNATION_SHUTDOWN;
607 power_down();
608 }
609 /*
610 * Restore swap signature.
611 */
612 error = swsusp_unmark();
613 if (error)
614 printk(KERN_ERR "PM: Swap will be unusable! "
615 "Try swapon -a.\n");
616 return;
617#endif
586 } 618 }
587 kernel_halt(); 619 kernel_halt();
588 /* 620 /*
@@ -748,13 +780,6 @@ static int software_resume(void)
748 async_synchronize_full(); 780 async_synchronize_full();
749 } 781 }
750 782
751 /*
752 * We can't depend on SCSI devices being available after loading
753 * one of their modules until scsi_complete_async_scans() is
754 * called and the resume device usually is a SCSI one.
755 */
756 scsi_complete_async_scans();
757
758 swsusp_resume_device = name_to_dev_t(resume_file); 783 swsusp_resume_device = name_to_dev_t(resume_file);
759 if (!swsusp_resume_device) { 784 if (!swsusp_resume_device) {
760 error = -ENODEV; 785 error = -ENODEV;
@@ -827,6 +852,9 @@ static const char * const hibernation_modes[] = {
827 [HIBERNATION_PLATFORM] = "platform", 852 [HIBERNATION_PLATFORM] = "platform",
828 [HIBERNATION_SHUTDOWN] = "shutdown", 853 [HIBERNATION_SHUTDOWN] = "shutdown",
829 [HIBERNATION_REBOOT] = "reboot", 854 [HIBERNATION_REBOOT] = "reboot",
855#ifdef CONFIG_SUSPEND
856 [HIBERNATION_SUSPEND] = "suspend",
857#endif
830}; 858};
831 859
832/* 860/*
@@ -867,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
867 switch (i) { 895 switch (i) {
868 case HIBERNATION_SHUTDOWN: 896 case HIBERNATION_SHUTDOWN:
869 case HIBERNATION_REBOOT: 897 case HIBERNATION_REBOOT:
898#ifdef CONFIG_SUSPEND
899 case HIBERNATION_SUSPEND:
900#endif
870 break; 901 break;
871 case HIBERNATION_PLATFORM: 902 case HIBERNATION_PLATFORM:
872 if (hibernation_ops) 903 if (hibernation_ops)
@@ -907,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
907 switch (mode) { 938 switch (mode) {
908 case HIBERNATION_SHUTDOWN: 939 case HIBERNATION_SHUTDOWN:
909 case HIBERNATION_REBOOT: 940 case HIBERNATION_REBOOT:
941#ifdef CONFIG_SUSPEND
942 case HIBERNATION_SUSPEND:
943#endif
910 hibernation_mode = mode; 944 hibernation_mode = mode;
911 break; 945 break;
912 case HIBERNATION_PLATFORM: 946 case HIBERNATION_PLATFORM:
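
hibernate.c gains a HIBERNATION_SUSPEND mode, selectable as "suspend" in /sys/power/disk: after the image is written the machine suspends to RAM, and if it wakes from RAM rather than from the image, swsusp_unmark() restores the swap signature so the swap device stays usable. A small userspace sketch that drives this hybrid mode through the sysfs files touched by this series (root required; error handling kept minimal):

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int write_sysfs(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");

    if (!f) {
        fprintf(stderr, "%s: %s\n", path, strerror(errno));
        return -1;
    }
    if (fputs(val, f) == EOF) {
        fprintf(stderr, "%s: write failed\n", path);
        fclose(f);
        return -1;
    }
    return fclose(f) == 0 ? 0 : -1;
}

int main(void)
{
    /* select the new hybrid mode, then trigger hibernation */
    if (write_sysfs("/sys/power/disk", "suspend"))
        return 1;
    return write_sysfs("/sys/power/state", "disk") ? 1 : 0;
}
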
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 428f8a034e96..f458238109cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init);
235 235
236#endif /* CONFIG_PM_SLEEP */ 236#endif /* CONFIG_PM_SLEEP */
237 237
238#ifdef CONFIG_PM_SLEEP_DEBUG
239/*
240 * pm_print_times: print time taken by devices to suspend and resume.
241 *
242 * show() returns whether printing of suspend and resume times is enabled.
243 * store() accepts 0 or 1. 0 disables printing and 1 enables it.
244 */
245bool pm_print_times_enabled;
246
247static ssize_t pm_print_times_show(struct kobject *kobj,
248 struct kobj_attribute *attr, char *buf)
249{
250 return sprintf(buf, "%d\n", pm_print_times_enabled);
251}
252
253static ssize_t pm_print_times_store(struct kobject *kobj,
254 struct kobj_attribute *attr,
255 const char *buf, size_t n)
256{
257 unsigned long val;
258
259 if (kstrtoul(buf, 10, &val))
260 return -EINVAL;
261
262 if (val > 1)
263 return -EINVAL;
264
265 pm_print_times_enabled = !!val;
266 return n;
267}
268
269power_attr(pm_print_times);
270
271static inline void pm_print_times_init(void)
272{
273 pm_print_times_enabled = !!initcall_debug;
274}
 275#else /* !CONFIG_PM_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */
278
238struct kobject *power_kobj; 279struct kobject *power_kobj;
239 280
240/** 281/**
@@ -531,6 +572,9 @@ static struct attribute * g[] = {
531#ifdef CONFIG_PM_DEBUG 572#ifdef CONFIG_PM_DEBUG
532 &pm_test_attr.attr, 573 &pm_test_attr.attr,
533#endif 574#endif
575#ifdef CONFIG_PM_SLEEP_DEBUG
576 &pm_print_times_attr.attr,
577#endif
534#endif 578#endif
535 NULL, 579 NULL,
536}; 580};
@@ -566,6 +610,7 @@ static int __init pm_init(void)
566 error = sysfs_create_group(power_kobj, &attr_group); 610 error = sysfs_create_group(power_kobj, &attr_group);
567 if (error) 611 if (error)
568 return error; 612 return error;
613 pm_print_times_init();
569 return pm_autosleep_init(); 614 return pm_autosleep_init();
570} 615}
571 616
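
main.c adds /sys/power/pm_print_times under the renamed PM_SLEEP_DEBUG option; it accepts 0 or 1, defaults to the value of initcall_debug, and enables per-device suspend/resume timing output. A minimal userspace sketch that turns it on:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/sys/power/pm_print_times", "w");

    if (!f) {
        perror("pm_print_times");
        return 1;
    }
    /* 1 = log the time each device takes to suspend and to resume */
    fputs("1\n", f);
    return fclose(f) == 0 ? 0 : 1;
}
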
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b0bd4beaebfe..7d4b7ffb3c1d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -156,6 +156,9 @@ extern void swsusp_free(void);
156extern int swsusp_read(unsigned int *flags_p); 156extern int swsusp_read(unsigned int *flags_p);
157extern int swsusp_write(unsigned int flags); 157extern int swsusp_write(unsigned int flags);
158extern void swsusp_close(fmode_t); 158extern void swsusp_close(fmode_t);
159#ifdef CONFIG_SUSPEND
160extern int swsusp_unmark(void);
161#endif
159 162
160/* kernel/power/block_io.c */ 163/* kernel/power/block_io.c */
161extern struct block_device *hib_resume_bdev; 164extern struct block_device *hib_resume_bdev;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 396d262b8fd0..c8b7446b27df 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -24,6 +24,7 @@
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/suspend.h> 25#include <linux/suspend.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <linux/ftrace.h>
27#include <trace/events/power.h> 28#include <trace/events/power.h>
28 29
29#include "power.h" 30#include "power.h"
@@ -212,6 +213,7 @@ int suspend_devices_and_enter(suspend_state_t state)
212 goto Close; 213 goto Close;
213 } 214 }
214 suspend_console(); 215 suspend_console();
216 ftrace_stop();
215 suspend_test_start(); 217 suspend_test_start();
216 error = dpm_suspend_start(PMSG_SUSPEND); 218 error = dpm_suspend_start(PMSG_SUSPEND);
217 if (error) { 219 if (error) {
@@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state)
231 suspend_test_start(); 233 suspend_test_start();
232 dpm_resume_end(PMSG_RESUME); 234 dpm_resume_end(PMSG_RESUME);
233 suspend_test_finish("resume devices"); 235 suspend_test_finish("resume devices");
236 ftrace_start();
234 resume_console(); 237 resume_console();
235 Close: 238 Close:
236 if (suspend_ops->end) 239 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11e22c068e8b..3c9d764eb0d8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle,
448 struct timeval start; 448 struct timeval start;
449 struct timeval stop; 449 struct timeval stop;
450 450
451 printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", 451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 452 nr_to_write);
453 m = nr_to_write / 100; 453 m = nr_to_write / 10;
454 if (!m) 454 if (!m)
455 m = 1; 455 m = 1;
456 nr_pages = 0; 456 nr_pages = 0;
@@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle,
464 if (ret) 464 if (ret)
465 break; 465 break;
466 if (!(nr_pages % m)) 466 if (!(nr_pages % m))
467 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 467 printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
468 nr_pages / m * 10);
468 nr_pages++; 469 nr_pages++;
469 } 470 }
470 err2 = hib_wait_on_bio_chain(&bio); 471 err2 = hib_wait_on_bio_chain(&bio);
@@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle,
472 if (!ret) 473 if (!ret)
473 ret = err2; 474 ret = err2;
474 if (!ret) 475 if (!ret)
475 printk(KERN_CONT "\b\b\b\bdone\n"); 476 printk(KERN_INFO "PM: Image saving done.\n");
476 else
477 printk(KERN_CONT "\n");
478 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
479 return ret; 478 return ret;
480} 479}
@@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
668 667
669 printk(KERN_INFO 668 printk(KERN_INFO
670 "PM: Using %u thread(s) for compression.\n" 669 "PM: Using %u thread(s) for compression.\n"
671 "PM: Compressing and saving image data (%u pages) ... ", 670 "PM: Compressing and saving image data (%u pages)...\n",
672 nr_threads, nr_to_write); 671 nr_threads, nr_to_write);
673 m = nr_to_write / 100; 672 m = nr_to_write / 10;
674 if (!m) 673 if (!m)
675 m = 1; 674 m = 1;
676 nr_pages = 0; 675 nr_pages = 0;
@@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle,
690 data_of(*snapshot), PAGE_SIZE); 689 data_of(*snapshot), PAGE_SIZE);
691 690
692 if (!(nr_pages % m)) 691 if (!(nr_pages % m))
693 printk(KERN_CONT "\b\b\b\b%3d%%", 692 printk(KERN_INFO
694 nr_pages / m); 693 "PM: Image saving progress: "
694 "%3d%%\n",
695 nr_pages / m * 10);
695 nr_pages++; 696 nr_pages++;
696 } 697 }
697 if (!off) 698 if (!off)
@@ -761,11 +762,8 @@ out_finish:
761 do_gettimeofday(&stop); 762 do_gettimeofday(&stop);
762 if (!ret) 763 if (!ret)
763 ret = err2; 764 ret = err2;
764 if (!ret) { 765 if (!ret)
765 printk(KERN_CONT "\b\b\b\bdone\n"); 766 printk(KERN_INFO "PM: Image saving done.\n");
766 } else {
767 printk(KERN_CONT "\n");
768 }
769 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
770out_clean: 768out_clean:
771 if (crc) { 769 if (crc) {
@@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle,
973 int err2; 971 int err2;
974 unsigned nr_pages; 972 unsigned nr_pages;
975 973
976 printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", 974 printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
977 nr_to_read); 975 nr_to_read);
978 m = nr_to_read / 100; 976 m = nr_to_read / 10;
979 if (!m) 977 if (!m)
980 m = 1; 978 m = 1;
981 nr_pages = 0; 979 nr_pages = 0;
@@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle,
993 if (ret) 991 if (ret)
994 break; 992 break;
995 if (!(nr_pages % m)) 993 if (!(nr_pages % m))
996 printk("\b\b\b\b%3d%%", nr_pages / m); 994 printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
995 nr_pages / m * 10);
997 nr_pages++; 996 nr_pages++;
998 } 997 }
999 err2 = hib_wait_on_bio_chain(&bio); 998 err2 = hib_wait_on_bio_chain(&bio);
@@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle,
1001 if (!ret) 1000 if (!ret)
1002 ret = err2; 1001 ret = err2;
1003 if (!ret) { 1002 if (!ret) {
1004 printk("\b\b\b\bdone\n"); 1003 printk(KERN_INFO "PM: Image loading done.\n");
1005 snapshot_write_finalize(snapshot); 1004 snapshot_write_finalize(snapshot);
1006 if (!snapshot_image_loaded(snapshot)) 1005 if (!snapshot_image_loaded(snapshot))
1007 ret = -ENODATA; 1006 ret = -ENODATA;
1008 } else 1007 }
1009 printk("\n");
1010 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1011 return ret; 1009 return ret;
1012} 1010}
@@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
1185 1183
1186 printk(KERN_INFO 1184 printk(KERN_INFO
1187 "PM: Using %u thread(s) for decompression.\n" 1185 "PM: Using %u thread(s) for decompression.\n"
1188 "PM: Loading and decompressing image data (%u pages) ... ", 1186 "PM: Loading and decompressing image data (%u pages)...\n",
1189 nr_threads, nr_to_read); 1187 nr_threads, nr_to_read);
1190 m = nr_to_read / 100; 1188 m = nr_to_read / 10;
1191 if (!m) 1189 if (!m)
1192 m = 1; 1190 m = 1;
1193 nr_pages = 0; 1191 nr_pages = 0;
@@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle,
1319 data[thr].unc + off, PAGE_SIZE); 1317 data[thr].unc + off, PAGE_SIZE);
1320 1318
1321 if (!(nr_pages % m)) 1319 if (!(nr_pages % m))
1322 printk("\b\b\b\b%3d%%", nr_pages / m); 1320 printk(KERN_INFO
1321 "PM: Image loading progress: "
1322 "%3d%%\n",
1323 nr_pages / m * 10);
1323 nr_pages++; 1324 nr_pages++;
1324 1325
1325 ret = snapshot_write_next(snapshot); 1326 ret = snapshot_write_next(snapshot);
@@ -1344,7 +1345,7 @@ out_finish:
1344 } 1345 }
1345 do_gettimeofday(&stop); 1346 do_gettimeofday(&stop);
1346 if (!ret) { 1347 if (!ret) {
1347 printk("\b\b\b\bdone\n"); 1348 printk(KERN_INFO "PM: Image loading done.\n");
1348 snapshot_write_finalize(snapshot); 1349 snapshot_write_finalize(snapshot);
1349 if (!snapshot_image_loaded(snapshot)) 1350 if (!snapshot_image_loaded(snapshot))
1350 ret = -ENODATA; 1351 ret = -ENODATA;
@@ -1357,8 +1358,7 @@ out_finish:
1357 } 1358 }
1358 } 1359 }
1359 } 1360 }
1360 } else 1361 }
1361 printk("\n");
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1363out_clean: 1363out_clean:
1364 for (i = 0; i < ring_size; i++) 1364 for (i = 0; i < ring_size; i++)
@@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode)
1472 blkdev_put(hib_resume_bdev, mode); 1472 blkdev_put(hib_resume_bdev, mode);
1473} 1473}
1474 1474
1475/**
1476 * swsusp_unmark - Unmark swsusp signature in the resume device
1477 */
1478
1479#ifdef CONFIG_SUSPEND
1480int swsusp_unmark(void)
1481{
1482 int error;
1483
1484 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
1484 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
1485 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
1486 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
1487 error = hib_bio_write_page(swsusp_resume_block,
1488 swsusp_header, NULL);
1489 } else {
1490 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
1491 error = -ENODEV;
1492 }
1493
1494 /*
1495 * We just returned from suspend; we don't need the image any more.
1496 */
1497 free_all_swap_pages(root_swap);
1498
1499 return error;
1500}
1501#endif
1502
1475static int swsusp_header_init(void) 1503static int swsusp_header_init(void)
1476{ 1504{
1477 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); 1505 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
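
The save/load loops in swap.c drop the backspace-driven percentage display, which no longer renders usefully with the record-based printk, and instead emit one KERN_INFO line per 10% step: m is a tenth of the page count (at least 1) and nr_pages / m * 10 is the reported percentage. A tiny userspace model of that arithmetic; the page count is arbitrary:

#include <stdio.h>

int main(void)
{
    unsigned int nr_to_write = 4273;    /* arbitrary image size in pages */
    unsigned int m = nr_to_write / 10;
    unsigned int nr_pages;

    if (!m)
        m = 1;
    for (nr_pages = 0; nr_pages < nr_to_write; nr_pages++)
        if (!(nr_pages % m))
            printf("PM: Image saving progress: %3u%%\n",
                   nr_pages / m * 10);
    printf("PM: Image saving done.\n");
    return 0;
}
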
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30 29
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
84 * appear. 83 * appear.
85 */ 84 */
86 wait_for_device_probe(); 85 wait_for_device_probe();
87 scsi_complete_async_scans();
88 86
89 data->swap = -1; 87 data->swap = -1;
90 data->mode = O_WRONLY; 88 data->mode = O_WRONLY;
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index c8fba3380076..8f50de394d22 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -9,6 +9,7 @@
9 * manipulate wakelocks on Android. 9 * manipulate wakelocks on Android.
10 */ 10 */
11 11
12#include <linux/capability.h>
12#include <linux/ctype.h> 13#include <linux/ctype.h>
13#include <linux/device.h> 14#include <linux/device.h>
14#include <linux/err.h> 15#include <linux/err.h>
@@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf)
188 size_t len; 189 size_t len;
189 int ret = 0; 190 int ret = 0;
190 191
192 if (!capable(CAP_BLOCK_SUSPEND))
193 return -EPERM;
194
191 while (*str && !isspace(*str)) 195 while (*str && !isspace(*str))
192 str++; 196 str++;
193 197
@@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf)
231 size_t len; 235 size_t len;
232 int ret = 0; 236 int ret = 0;
233 237
238 if (!capable(CAP_BLOCK_SUSPEND))
239 return -EPERM;
240
234 len = strlen(buf); 241 len = strlen(buf);
235 if (!len) 242 if (!len)
236 return -EINVAL; 243 return -EINVAL;
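
pm_wake_lock() and pm_wake_unlock() now require CAP_BLOCK_SUSPEND, so unprivileged writes to /sys/power/wake_lock and /sys/power/wake_unlock fail with EPERM. A short userspace probe of that behaviour; the lock name and timeout are arbitrary:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char req[] = "demo 5000000000";  /* lock "demo", 5 s timeout in ns */
    int fd = open("/sys/power/wake_lock", O_WRONLY);

    if (fd < 0) {
        perror("open /sys/power/wake_lock");
        return 1;
    }
    if (write(fd, req, strlen(req)) < 0)
        fprintf(stderr, "wake_lock: %s%s\n", strerror(errno),
                errno == EPERM ? " (need CAP_BLOCK_SUSPEND)" : "");
    close(fd);
    return 0;
}
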
diff --git a/kernel/printk.c b/kernel/printk.c
index 177fa49357a5..66a2ea37b576 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -216,6 +216,7 @@ struct log {
216 */ 216 */
217static DEFINE_RAW_SPINLOCK(logbuf_lock); 217static DEFINE_RAW_SPINLOCK(logbuf_lock);
218 218
219#ifdef CONFIG_PRINTK
219/* the next printk record to read by syslog(READ) or /proc/kmsg */ 220/* the next printk record to read by syslog(READ) or /proc/kmsg */
220static u64 syslog_seq; 221static u64 syslog_seq;
221static u32 syslog_idx; 222static u32 syslog_idx;
@@ -228,14 +229,19 @@ static u32 log_first_idx;
228 229
229/* index and sequence number of the next record to store in the buffer */ 230/* index and sequence number of the next record to store in the buffer */
230static u64 log_next_seq; 231static u64 log_next_seq;
231#ifdef CONFIG_PRINTK
232static u32 log_next_idx; 232static u32 log_next_idx;
233 233
234/* the next printk record to write to the console */
235static u64 console_seq;
236static u32 console_idx;
237static enum log_flags console_prev;
238
234/* the next printk record to read after the last 'clear' command */ 239/* the next printk record to read after the last 'clear' command */
235static u64 clear_seq; 240static u64 clear_seq;
236static u32 clear_idx; 241static u32 clear_idx;
237 242
238#define LOG_LINE_MAX 1024 243#define PREFIX_MAX 32
244#define LOG_LINE_MAX 1024 - PREFIX_MAX
239 245
240/* record buffer */ 246/* record buffer */
241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 247#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -360,6 +366,7 @@ static void log_store(int facility, int level,
360struct devkmsg_user { 366struct devkmsg_user {
361 u64 seq; 367 u64 seq;
362 u32 idx; 368 u32 idx;
369 enum log_flags prev;
363 struct mutex lock; 370 struct mutex lock;
364 char buf[8192]; 371 char buf[8192];
365}; 372};
@@ -382,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
382 389
383 line = buf; 390 line = buf;
384 for (i = 0; i < count; i++) { 391 for (i = 0; i < count; i++) {
385 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) 392 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
393 ret = -EFAULT;
386 goto out; 394 goto out;
395 }
387 line += iv[i].iov_len; 396 line += iv[i].iov_len;
388 } 397 }
389 398
@@ -425,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
425 struct log *msg; 434 struct log *msg;
426 u64 ts_usec; 435 u64 ts_usec;
427 size_t i; 436 size_t i;
437 char cont = '-';
428 size_t len; 438 size_t len;
429 ssize_t ret; 439 ssize_t ret;
430 440
@@ -462,8 +472,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
462 msg = log_from_idx(user->idx); 472 msg = log_from_idx(user->idx);
463 ts_usec = msg->ts_nsec; 473 ts_usec = msg->ts_nsec;
464 do_div(ts_usec, 1000); 474 do_div(ts_usec, 1000);
465 len = sprintf(user->buf, "%u,%llu,%llu;", 475
466 (msg->facility << 3) | msg->level, user->seq, ts_usec); 476 /*
477 * If we couldn't merge continuation line fragments during the print,
478 * export the stored flags to allow an optional external merge of the
 479 * records. Merging the records isn't always correct, for example
 480 * when we hit a race during printing. In most cases though, it produces
 481 * more readable output. A 'c' in the record flags marks the first
 482 * fragment of a line, '+' the following fragments.
483 */
484 if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
485 cont = 'c';
486 else if ((msg->flags & LOG_CONT) ||
487 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
488 cont = '+';
489
490 len = sprintf(user->buf, "%u,%llu,%llu,%c;",
491 (msg->facility << 3) | msg->level,
492 user->seq, ts_usec, cont);
493 user->prev = msg->flags;
467 494
468 /* escape non-printable characters */ 495 /* escape non-printable characters */
469 for (i = 0; i < msg->text_len; i++) { 496 for (i = 0; i < msg->text_len; i++) {
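
devkmsg_read() above adds a fourth header field to each /dev/kmsg record: a continuation marker that is '-' for a normal record, 'c' for the first fragment of a line the kernel could not merge, and '+' for the following fragments. A rough userspace sketch that reads records and splits the new "prio,seq,ts_usec,cont;" header:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    char buf[8192];
    ssize_t len;
    int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

    if (fd < 0) {
        perror("/dev/kmsg");
        return 1;
    }
    /* each read() returns one record: "prio,seq,ts_usec,cont;text\n..." */
    while ((len = read(fd, buf, sizeof(buf) - 1)) > 0) {
        unsigned int prio;
        unsigned long long seq, ts;
        char cont;

        buf[len] = '\0';
        if (sscanf(buf, "%u,%llu,%llu,%c;", &prio, &seq, &ts, &cont) == 4)
            printf("seq=%llu cont=%c %s", seq, cont, strchr(buf, ';') + 1);
    }
    close(fd);
    return 0;
}
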
@@ -646,6 +673,15 @@ void log_buf_kexec_setup(void)
646 VMCOREINFO_SYMBOL(log_buf_len); 673 VMCOREINFO_SYMBOL(log_buf_len);
647 VMCOREINFO_SYMBOL(log_first_idx); 674 VMCOREINFO_SYMBOL(log_first_idx);
648 VMCOREINFO_SYMBOL(log_next_idx); 675 VMCOREINFO_SYMBOL(log_next_idx);
676 /*
677 * Export struct log size and field offsets. User space tools can
678 * parse it and detect any changes to structure down the line.
679 */
680 VMCOREINFO_STRUCT_SIZE(log);
681 VMCOREINFO_OFFSET(log, ts_nsec);
682 VMCOREINFO_OFFSET(log, len);
683 VMCOREINFO_OFFSET(log, text_len);
684 VMCOREINFO_OFFSET(log, dict_len);
649} 685}
650#endif 686#endif
651 687
@@ -876,7 +912,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
876 912
877 if (buf) { 913 if (buf) {
878 if (print_prefix(msg, syslog, NULL) + 914 if (print_prefix(msg, syslog, NULL) +
879 text_len + 1>= size - len) 915 text_len + 1 >= size - len)
880 break; 916 break;
881 917
882 if (prefix) 918 if (prefix)
@@ -907,7 +943,7 @@ static int syslog_print(char __user *buf, int size)
907 struct log *msg; 943 struct log *msg;
908 int len = 0; 944 int len = 0;
909 945
910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 946 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
911 if (!text) 947 if (!text)
912 return -ENOMEM; 948 return -ENOMEM;
913 949
@@ -930,7 +966,8 @@ static int syslog_print(char __user *buf, int size)
930 966
931 skip = syslog_partial; 967 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx); 968 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); 969 n = msg_print_text(msg, syslog_prev, true, text,
970 LOG_LINE_MAX + PREFIX_MAX);
934 if (n - syslog_partial <= size) { 971 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */ 972 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx); 973 syslog_idx = log_next(syslog_idx);
@@ -969,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
969 char *text; 1006 char *text;
970 int len = 0; 1007 int len = 0;
971 1008
972 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 1009 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
973 if (!text) 1010 if (!text)
974 return -ENOMEM; 1011 return -ENOMEM;
975 1012
@@ -997,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
997 struct log *msg = log_from_idx(idx); 1034 struct log *msg = log_from_idx(idx);
998 1035
999 len += msg_print_text(msg, prev, true, NULL, 0); 1036 len += msg_print_text(msg, prev, true, NULL, 0);
1037 prev = msg->flags;
1000 idx = log_next(idx); 1038 idx = log_next(idx);
1001 seq++; 1039 seq++;
1002 } 1040 }
@@ -1009,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1009 struct log *msg = log_from_idx(idx); 1047 struct log *msg = log_from_idx(idx);
1010 1048
1011 len -= msg_print_text(msg, prev, true, NULL, 0); 1049 len -= msg_print_text(msg, prev, true, NULL, 0);
1050 prev = msg->flags;
1012 idx = log_next(idx); 1051 idx = log_next(idx);
1013 seq++; 1052 seq++;
1014 } 1053 }
@@ -1022,7 +1061,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1022 struct log *msg = log_from_idx(idx); 1061 struct log *msg = log_from_idx(idx);
1023 int textlen; 1062 int textlen;
1024 1063
1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); 1064 textlen = msg_print_text(msg, prev, true, text,
1065 LOG_LINE_MAX + PREFIX_MAX);
1026 if (textlen < 0) { 1066 if (textlen < 0) {
1027 len = textlen; 1067 len = textlen;
1028 break; 1068 break;
@@ -1192,21 +1232,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1192 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1232 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1193} 1233}
1194 1234
1195#ifdef CONFIG_KGDB_KDB
1196/* kdb dmesg command needs access to the syslog buffer. do_syslog()
1197 * uses locks so it cannot be used during debugging. Just tell kdb
1198 * where the start and end of the physical and logical logs are. This
1199 * is equivalent to do_syslog(3).
1200 */
1201void kdb_syslog_data(char *syslog_data[4])
1202{
1203 syslog_data[0] = log_buf;
1204 syslog_data[1] = log_buf + log_buf_len;
1205 syslog_data[2] = log_buf + log_first_idx;
1206 syslog_data[3] = log_buf + log_next_idx;
1207}
1208#endif /* CONFIG_KGDB_KDB */
1209
1210static bool __read_mostly ignore_loglevel; 1235static bool __read_mostly ignore_loglevel;
1211 1236
1212static int __init ignore_loglevel_setup(char *str) 1237static int __init ignore_loglevel_setup(char *str)
@@ -1364,20 +1389,36 @@ static struct cont {
1364 u64 ts_nsec; /* time of first print */ 1389 u64 ts_nsec; /* time of first print */
1365 u8 level; /* log level of first message */ 1390 u8 level; /* log level of first message */
1366 u8 facility; /* log level of first message */ 1391 u8 facility; /* log level of first message */
1392 enum log_flags flags; /* prefix, newline flags */
1367 bool flushed:1; /* buffer sealed and committed */ 1393 bool flushed:1; /* buffer sealed and committed */
1368} cont; 1394} cont;
1369 1395
1370static void cont_flush(void) 1396static void cont_flush(enum log_flags flags)
1371{ 1397{
1372 if (cont.flushed) 1398 if (cont.flushed)
1373 return; 1399 return;
1374 if (cont.len == 0) 1400 if (cont.len == 0)
1375 return; 1401 return;
1376 1402
1377 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, 1403 if (cont.cons) {
1378 NULL, 0, cont.buf, cont.len); 1404 /*
1379 1405 * If a fragment of this line was directly flushed to the
1380 cont.flushed = true; 1406 * console; wait for the console to pick up the rest of the
1407 * line. LOG_NOCONS suppresses a duplicated output.
1408 */
1409 log_store(cont.facility, cont.level, flags | LOG_NOCONS,
1410 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1411 cont.flags = flags;
1412 cont.flushed = true;
1413 } else {
1414 /*
1415 * If no fragment of this line ever reached the console,
1416 * just submit it to the store and free the buffer.
1417 */
1418 log_store(cont.facility, cont.level, flags, 0,
1419 NULL, 0, cont.buf, cont.len);
1420 cont.len = 0;
1421 }
1381} 1422}
1382 1423
1383static bool cont_add(int facility, int level, const char *text, size_t len) 1424static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1386,7 +1427,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1386 return false; 1427 return false;
1387 1428
1388 if (cont.len + len > sizeof(cont.buf)) { 1429 if (cont.len + len > sizeof(cont.buf)) {
1389 cont_flush(); 1430 /* the line gets too long, split it up in separate records */
1431 cont_flush(LOG_CONT);
1390 return false; 1432 return false;
1391 } 1433 }
1392 1434
@@ -1395,12 +1437,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1395 cont.level = level; 1437 cont.level = level;
1396 cont.owner = current; 1438 cont.owner = current;
1397 cont.ts_nsec = local_clock(); 1439 cont.ts_nsec = local_clock();
1440 cont.flags = 0;
1398 cont.cons = 0; 1441 cont.cons = 0;
1399 cont.flushed = false; 1442 cont.flushed = false;
1400 } 1443 }
1401 1444
1402 memcpy(cont.buf + cont.len, text, len); 1445 memcpy(cont.buf + cont.len, text, len);
1403 cont.len += len; 1446 cont.len += len;
1447
1448 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1449 cont_flush(LOG_CONT);
1450
1404 return true; 1451 return true;
1405} 1452}
1406 1453
@@ -1409,7 +1456,7 @@ static size_t cont_print_text(char *text, size_t size)
1409 size_t textlen = 0; 1456 size_t textlen = 0;
1410 size_t len; 1457 size_t len;
1411 1458
1412 if (cont.cons == 0) { 1459 if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
1413 textlen += print_time(cont.ts_nsec, text); 1460 textlen += print_time(cont.ts_nsec, text);
1414 size -= textlen; 1461 size -= textlen;
1415 } 1462 }
@@ -1424,7 +1471,8 @@ static size_t cont_print_text(char *text, size_t size)
1424 } 1471 }
1425 1472
1426 if (cont.flushed) { 1473 if (cont.flushed) {
1427 text[textlen++] = '\n'; 1474 if (cont.flags & LOG_NEWLINE)
1475 text[textlen++] = '\n';
1428 /* got everything, release buffer */ 1476 /* got everything, release buffer */
1429 cont.len = 0; 1477 cont.len = 0;
1430 } 1478 }
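
The continuation handling in the hunks above reduces to a fixed buffer that fragments are appended to until it is roughly 80% full, or until a flush is forced, at which point the accumulated text is committed as a single record (with LOG_CONT when a line had to be split). A minimal user-space sketch of that policy; the buffer size, store_record() and the flag values are illustrative stand-ins, not the kernel's API:

#include <stdio.h>
#include <string.h>

#define CONT_BUF_SIZE 256          /* illustrative; the kernel buffer differs */
#define FLAG_CONT     0x1          /* "more of this line follows" marker */
#define FLAG_NEWLINE  0x2          /* line is complete */

static char   cont_buf[CONT_BUF_SIZE];
static size_t cont_len;

/* stand-in for log_store(): just print the committed record */
static void store_record(const char *text, size_t len, int flags)
{
        printf("record(flags=%d): %.*s\n", flags, (int)len, text);
}

static void cont_flush(int flags)
{
        if (cont_len == 0)
                return;
        store_record(cont_buf, cont_len, flags);
        cont_len = 0;
}

/* append a fragment; flush early once the buffer is ~80% full */
static int cont_add(const char *text, size_t len)
{
        if (cont_len + len > sizeof(cont_buf)) {
                /* fragment does not fit: split the line into separate records */
                cont_flush(FLAG_CONT);
                return 0;
        }
        memcpy(cont_buf + cont_len, text, len);
        cont_len += len;
        if (cont_len > (sizeof(cont_buf) * 80) / 100)
                cont_flush(FLAG_CONT);
        return 1;
}

int main(void)
{
        cont_add("partial ", 8);
        cont_add("line", 4);
        cont_flush(FLAG_NEWLINE);  /* caller saw the terminating newline */
        return 0;
}
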
@@ -1496,17 +1544,23 @@ asmlinkage int vprintk_emit(int facility, int level,
1496 lflags |= LOG_NEWLINE; 1544 lflags |= LOG_NEWLINE;
1497 } 1545 }
1498 1546
1499 /* strip syslog prefix and extract log level or control flags */ 1547 /* strip kernel syslog prefix and extract log level or control flags */
1500 if (text[0] == '<' && text[1] && text[2] == '>') { 1548 if (facility == 0) {
1501 switch (text[1]) { 1549 int kern_level = printk_get_level(text);
1502 case '0' ... '7': 1550
1503 if (level == -1) 1551 if (kern_level) {
1504 level = text[1] - '0'; 1552 const char *end_of_header = printk_skip_level(text);
1505 case 'd': /* KERN_DEFAULT */ 1553 switch (kern_level) {
1506 lflags |= LOG_PREFIX; 1554 case '0' ... '7':
1507 case 'c': /* KERN_CONT */ 1555 if (level == -1)
1508 text += 3; 1556 level = kern_level - '0';
1509 text_len -= 3; 1557 case 'd': /* KERN_DEFAULT */
1558 lflags |= LOG_PREFIX;
1559 case 'c': /* KERN_CONT */
1560 break;
1561 }
1562 text_len -= end_of_header - text;
1563 text = (char *)end_of_header;
1510 } 1564 }
1511 } 1565 }
1512 1566
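
vprintk_emit() now delegates the "<N>" header handling to printk_get_level()/printk_skip_level() instead of open-coding the three-character scan. A rough user-space illustration of the kind of parsing those helpers centralize; get_level() and skip_level() are toy reimplementations written for this sketch, not the kernel functions:

#include <stdio.h>

/* return the level character of a "<X>" header, or 0 if there is none */
static char get_level(const char *buf)
{
        if (buf[0] == '<' && buf[1] && buf[2] == '>')
                return buf[1];
        return 0;
}

/* skip past the "<X>" header, if any */
static const char *skip_level(const char *buf)
{
        return get_level(buf) ? buf + 3 : buf;
}

int main(void)
{
        const char *msg = "<4>disk error";
        char lvl = get_level(msg);

        if (lvl >= '0' && lvl <= '7')
                printf("level %c, text \"%s\"\n", lvl, skip_level(msg));
        else
                printf("no level, text \"%s\"\n", msg);
        return 0;
}
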
@@ -1522,7 +1576,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1522 * or another task also prints continuation lines. 1576 * or another task also prints continuation lines.
1523 */ 1577 */
1524 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) 1578 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1525 cont_flush(); 1579 cont_flush(LOG_NEWLINE);
1526 1580
1527 /* buffer line if possible, otherwise store it right away */ 1581 /* buffer line if possible, otherwise store it right away */
1528 if (!cont_add(facility, level, text, text_len)) 1582 if (!cont_add(facility, level, text, text_len))
@@ -1540,7 +1594,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1540 if (cont.len && cont.owner == current) { 1594 if (cont.len && cont.owner == current) {
1541 if (!(lflags & LOG_PREFIX)) 1595 if (!(lflags & LOG_PREFIX))
1542 stored = cont_add(facility, level, text, text_len); 1596 stored = cont_add(facility, level, text, text_len);
1543 cont_flush(); 1597 cont_flush(LOG_NEWLINE);
1544 } 1598 }
1545 1599
1546 if (!stored) 1600 if (!stored)
@@ -1631,9 +1685,20 @@ asmlinkage int printk(const char *fmt, ...)
1631} 1685}
1632EXPORT_SYMBOL(printk); 1686EXPORT_SYMBOL(printk);
1633 1687
1634#else 1688#else /* CONFIG_PRINTK */
1635 1689
1690#define LOG_LINE_MAX 0
1691#define PREFIX_MAX 0
1636#define LOG_LINE_MAX 0 1692#define LOG_LINE_MAX 0
1693static u64 syslog_seq;
1694static u32 syslog_idx;
1695static u64 console_seq;
1696static u32 console_idx;
1697static enum log_flags syslog_prev;
1698static u64 log_first_seq;
1699static u32 log_first_idx;
1700static u64 log_next_seq;
1701static enum log_flags console_prev;
1637static struct cont { 1702static struct cont {
1638 size_t len; 1703 size_t len;
1639 size_t cons; 1704 size_t cons;
@@ -1917,10 +1982,34 @@ void wake_up_klogd(void)
1917 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1982 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1918} 1983}
1919 1984
1920/* the next printk record to write to the console */ 1985static void console_cont_flush(char *text, size_t size)
1921static u64 console_seq; 1986{
1922static u32 console_idx; 1987 unsigned long flags;
1923static enum log_flags console_prev; 1988 size_t len;
1989
1990 raw_spin_lock_irqsave(&logbuf_lock, flags);
1991
1992 if (!cont.len)
1993 goto out;
1994
1995 /*
1996 * We still queue earlier records, likely because the console was
1997 * busy. The earlier ones need to be printed before this one, we
1998 * did not flush any fragment so far, so just let it queue up.
1999 */
2000 if (console_seq < log_next_seq && !cont.cons)
2001 goto out;
2002
2003 len = cont_print_text(text, size);
2004 raw_spin_unlock(&logbuf_lock);
2005 stop_critical_timings();
2006 call_console_drivers(cont.level, text, len);
2007 start_critical_timings();
2008 local_irq_restore(flags);
2009 return;
2010out:
2011 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2012}
1924 2013
1925/** 2014/**
1926 * console_unlock - unlock the console system 2015 * console_unlock - unlock the console system
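
console_cont_flush() follows the usual printk shape: snapshot the pending text while holding logbuf_lock, then drop the lock before calling the potentially slow console drivers. A compressed pthread-based sketch of that copy-then-print-unlocked pattern, assuming an ordinary mutex in place of the kernel's raw spinlock and IRQ handling:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static char   pending[128];
static size_t pending_len;

/* stand-in for call_console_drivers(): may be slow, must run unlocked */
static void write_console(const char *text, size_t len)
{
        fwrite(text, 1, len, stdout);
}

static void console_flush_pending(void)
{
        char text[128];
        size_t len;

        pthread_mutex_lock(&buf_lock);
        if (pending_len == 0) {
                pthread_mutex_unlock(&buf_lock);
                return;
        }
        len = pending_len;
        memcpy(text, pending, len);  /* snapshot under the lock */
        pending_len = 0;
        pthread_mutex_unlock(&buf_lock);

        write_console(text, len);    /* slow work with the lock dropped */
}

int main(void)
{
        pthread_mutex_lock(&buf_lock);
        pending_len = (size_t)snprintf(pending, sizeof(pending), "buffered fragment\n");
        pthread_mutex_unlock(&buf_lock);

        console_flush_pending();
        return 0;
}
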
@@ -1938,7 +2027,7 @@ static enum log_flags console_prev;
1938 */ 2027 */
1939void console_unlock(void) 2028void console_unlock(void)
1940{ 2029{
1941 static char text[LOG_LINE_MAX]; 2030 static char text[LOG_LINE_MAX + PREFIX_MAX];
1942 static u64 seen_seq; 2031 static u64 seen_seq;
1943 unsigned long flags; 2032 unsigned long flags;
1944 bool wake_klogd = false; 2033 bool wake_klogd = false;
@@ -1952,19 +2041,7 @@ void console_unlock(void)
1952 console_may_schedule = 0; 2041 console_may_schedule = 0;
1953 2042
1954 /* flush buffered message fragment immediately to console */ 2043 /* flush buffered message fragment immediately to console */
1955 raw_spin_lock_irqsave(&logbuf_lock, flags); 2044 console_cont_flush(text, sizeof(text));
1956 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1957 size_t len;
1958
1959 len = cont_print_text(text, sizeof(text));
1960 raw_spin_unlock(&logbuf_lock);
1961 stop_critical_timings();
1962 call_console_drivers(cont.level, text, len);
1963 start_critical_timings();
1964 local_irq_restore(flags);
1965 } else
1966 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1967
1968again: 2045again:
1969 for (;;) { 2046 for (;;) {
1970 struct log *msg; 2047 struct log *msg;
@@ -2001,6 +2078,7 @@ skip:
2001 * will properly dump everything later. 2078 * will properly dump everything later.
2002 */ 2079 */
2003 msg->flags &= ~LOG_NOCONS; 2080 msg->flags &= ~LOG_NOCONS;
2081 console_prev = msg->flags;
2004 goto skip; 2082 goto skip;
2005 } 2083 }
2006 2084
@@ -2525,7 +2603,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2525} 2603}
2526 2604
2527/** 2605/**
2528 * kmsg_dump_get_line - retrieve one kmsg log line 2606 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2529 * @dumper: registered kmsg dumper 2607 * @dumper: registered kmsg dumper
2530 * @syslog: include the "<4>" prefixes 2608 * @syslog: include the "<4>" prefixes
2531 * @line: buffer to copy the line to 2609 * @line: buffer to copy the line to
@@ -2540,11 +2618,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2540 * 2618 *
2541 * A return value of FALSE indicates that there are no more records to 2619 * A return value of FALSE indicates that there are no more records to
2542 * read. 2620 * read.
2621 *
2622 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2543 */ 2623 */
2544bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, 2624bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2545 char *line, size_t size, size_t *len) 2625 char *line, size_t size, size_t *len)
2546{ 2626{
2547 unsigned long flags;
2548 struct log *msg; 2627 struct log *msg;
2549 size_t l = 0; 2628 size_t l = 0;
2550 bool ret = false; 2629 bool ret = false;
@@ -2552,7 +2631,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2552 if (!dumper->active) 2631 if (!dumper->active)
2553 goto out; 2632 goto out;
2554 2633
2555 raw_spin_lock_irqsave(&logbuf_lock, flags);
2556 if (dumper->cur_seq < log_first_seq) { 2634 if (dumper->cur_seq < log_first_seq) {
2557 /* messages are gone, move to first available one */ 2635 /* messages are gone, move to first available one */
2558 dumper->cur_seq = log_first_seq; 2636 dumper->cur_seq = log_first_seq;
@@ -2560,10 +2638,8 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2560 } 2638 }
2561 2639
2562 /* last entry */ 2640 /* last entry */
2563 if (dumper->cur_seq >= log_next_seq) { 2641 if (dumper->cur_seq >= log_next_seq)
2564 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2565 goto out; 2642 goto out;
2566 }
2567 2643
2568 msg = log_from_idx(dumper->cur_idx); 2644 msg = log_from_idx(dumper->cur_idx);
2569 l = msg_print_text(msg, 0, syslog, line, size); 2645 l = msg_print_text(msg, 0, syslog, line, size);
@@ -2571,12 +2647,41 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2571 dumper->cur_idx = log_next(dumper->cur_idx); 2647 dumper->cur_idx = log_next(dumper->cur_idx);
2572 dumper->cur_seq++; 2648 dumper->cur_seq++;
2573 ret = true; 2649 ret = true;
2574 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2575out: 2650out:
2576 if (len) 2651 if (len)
2577 *len = l; 2652 *len = l;
2578 return ret; 2653 return ret;
2579} 2654}
2655
2656/**
2657 * kmsg_dump_get_line - retrieve one kmsg log line
2658 * @dumper: registered kmsg dumper
2659 * @syslog: include the "<4>" prefixes
2660 * @line: buffer to copy the line to
2661 * @size: maximum size of the buffer
2662 * @len: length of line placed into buffer
2663 *
2664 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2665 * record, and copy one record into the provided buffer.
2666 *
2667 * Consecutive calls will return the next available record moving
2668 * towards the end of the buffer with the youngest messages.
2669 *
2670 * A return value of FALSE indicates that there are no more records to
2671 * read.
2672 */
2673bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2674 char *line, size_t size, size_t *len)
2675{
2676 unsigned long flags;
2677 bool ret;
2678
2679 raw_spin_lock_irqsave(&logbuf_lock, flags);
2680 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2681 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2682
2683 return ret;
2684}
2580EXPORT_SYMBOL_GPL(kmsg_dump_get_line); 2685EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2581 2686
2582/** 2687/**
@@ -2679,6 +2784,24 @@ out:
2679EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); 2784EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2680 2785
2681/** 2786/**
 2787 * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
2788 * @dumper: registered kmsg dumper
2789 *
2790 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2791 * kmsg_dump_get_buffer() can be called again and used multiple
2792 * times within the same dumper.dump() callback.
2793 *
2794 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2795 */
2796void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2797{
2798 dumper->cur_seq = clear_seq;
2799 dumper->cur_idx = clear_idx;
2800 dumper->next_seq = log_next_seq;
2801 dumper->next_idx = log_next_idx;
2802}
2803
2804/**
 2682 * kmsg_dump_rewind - reset the iterator 2805
2683 * @dumper: registered kmsg dumper 2806 * @dumper: registered kmsg dumper
2684 * 2807 *
@@ -2691,10 +2814,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2691 unsigned long flags; 2814 unsigned long flags;
2692 2815
2693 raw_spin_lock_irqsave(&logbuf_lock, flags); 2816 raw_spin_lock_irqsave(&logbuf_lock, flags);
2694 dumper->cur_seq = clear_seq; 2817 kmsg_dump_rewind_nolock(dumper);
2695 dumper->cur_idx = clear_idx;
2696 dumper->next_seq = log_next_seq;
2697 dumper->next_idx = log_next_idx;
2698 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2818 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2699} 2819}
2700EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2820EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
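
The kmsg_dump_get_line()/_nolock() and kmsg_dump_rewind()/_nolock() split above is the standard factoring where the real work lives in an unlocked helper, so a caller that already holds logbuf_lock (here kdb) can reuse it, while the exported function remains a thin lock-taking wrapper. A generic sketch of that shape, assuming a pthread mutex and a toy record iterator:

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
static int next_record;                    /* toy iterator state */

/* does the real work; caller must already hold log_lock */
static bool get_line_nolock(char *line, size_t size)
{
        if (next_record >= 3)
                return false;              /* no more records */
        snprintf(line, size, "record %d", next_record++);
        return true;
}

/* exported variant: takes the lock around the unlocked helper */
static bool get_line(char *line, size_t size)
{
        bool ret;

        pthread_mutex_lock(&log_lock);
        ret = get_line_nolock(line, size);
        pthread_mutex_unlock(&log_lock);
        return ret;
}

int main(void)
{
        char line[32];

        while (get_line(line, sizeof(line)))
                puts(line);
        return 0;
}
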
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 95cba41ce1e9..4e6a61b15e86 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -54,6 +54,50 @@
54#ifdef CONFIG_PREEMPT_RCU 54#ifdef CONFIG_PREEMPT_RCU
55 55
56/* 56/*
57 * Preemptible RCU implementation for rcu_read_lock().
58 * Just increment ->rcu_read_lock_nesting, shared state will be updated
59 * if we block.
60 */
61void __rcu_read_lock(void)
62{
63 current->rcu_read_lock_nesting++;
64 barrier(); /* critical section after entry code. */
65}
66EXPORT_SYMBOL_GPL(__rcu_read_lock);
67
68/*
69 * Preemptible RCU implementation for rcu_read_unlock().
70 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
71 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
72 * invoke rcu_read_unlock_special() to clean up after a context switch
73 * in an RCU read-side critical section and other special cases.
74 */
75void __rcu_read_unlock(void)
76{
77 struct task_struct *t = current;
78
79 if (t->rcu_read_lock_nesting != 1) {
80 --t->rcu_read_lock_nesting;
81 } else {
82 barrier(); /* critical section before exit code. */
83 t->rcu_read_lock_nesting = INT_MIN;
84 barrier(); /* assign before ->rcu_read_unlock_special load */
85 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
86 rcu_read_unlock_special(t);
87 barrier(); /* ->rcu_read_unlock_special load before assign */
88 t->rcu_read_lock_nesting = 0;
89 }
90#ifdef CONFIG_PROVE_LOCKING
91 {
92 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
93
94 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
95 }
96#endif /* #ifdef CONFIG_PROVE_LOCKING */
97}
98EXPORT_SYMBOL_GPL(__rcu_read_unlock);
99
100/*
57 * Check for a task exiting while in a preemptible-RCU read-side 101 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings, 102 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep 103 * as debug_check_no_locks_held() already does this if lockdep
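
__rcu_read_lock()/__rcu_read_unlock() are moved here into common CONFIG_PREEMPT_RCU code (the Tiny copies are deleted further down); the essence is a per-task nesting counter, with the expensive special-case work deferred to the outermost unlock. A thread-local user-space caricature of that nesting logic; it models none of the real RCU semantics, and the INT_MIN/barrier() dance from the kernel version is deliberately omitted:

#include <stdio.h>

static __thread int read_lock_nesting;     /* per-task counter in the kernel */
static __thread int unlock_special;        /* "something happened while read-side" */

static void read_unlock_special(void)
{
        /* kernel: report quiescent states, unboost, etc.; here: just note it */
        printf("outermost unlock with special work pending\n");
        unlock_special = 0;
}

static void my_read_lock(void)
{
        read_lock_nesting++;               /* nesting only, no atomics needed */
}

static void my_read_unlock(void)
{
        if (read_lock_nesting != 1) {
                --read_lock_nesting;       /* still nested, nothing special */
        } else {
                if (unlock_special)
                        read_unlock_special(); /* only the outermost unlock pays */
                read_lock_nesting = 0;
        }
}

int main(void)
{
        my_read_lock();
        my_read_lock();                    /* nested section */
        unlock_special = 1;                /* pretend we were preempted */
        my_read_unlock();                  /* inner: just decrements */
        my_read_unlock();                  /* outer: handles the special case */
        return 0;
}
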
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 37a5444204d2..547b1fe5b052 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -172,7 +172,7 @@ void rcu_irq_enter(void)
172 local_irq_restore(flags); 172 local_irq_restore(flags);
173} 173}
174 174
175#ifdef CONFIG_PROVE_RCU 175#ifdef CONFIG_DEBUG_LOCK_ALLOC
176 176
177/* 177/*
178 * Test whether RCU thinks that the current CPU is idle. 178 * Test whether RCU thinks that the current CPU is idle.
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void)
183} 183}
184EXPORT_SYMBOL(rcu_is_cpu_idle); 184EXPORT_SYMBOL(rcu_is_cpu_idle);
185 185
186#endif /* #ifdef CONFIG_PROVE_RCU */ 186#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
187 187
188/* 188/*
189 * Test whether the current CPU was interrupted from idle. Nested 189 * Test whether the current CPU was interrupted from idle. Nested
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index fc31a2d65100..918fd1e8509c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
137static void rcu_report_exp_done(void); 136static void rcu_report_exp_done(void);
138 137
@@ -351,8 +350,9 @@ static int rcu_initiate_boost(void)
351 rcu_preempt_ctrlblk.boost_tasks = 350 rcu_preempt_ctrlblk.boost_tasks =
352 rcu_preempt_ctrlblk.gp_tasks; 351 rcu_preempt_ctrlblk.gp_tasks;
353 invoke_rcu_callbacks(); 352 invoke_rcu_callbacks();
354 } else 353 } else {
355 RCU_TRACE(rcu_initiate_boost_trace()); 354 RCU_TRACE(rcu_initiate_boost_trace());
355 }
356 return 1; 356 return 1;
357} 357}
358 358
@@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void)
527} 527}
528 528
529/* 529/*
530 * Tiny-preemptible RCU implementation for rcu_read_lock().
531 * Just increment ->rcu_read_lock_nesting, shared state will be updated
532 * if we block.
533 */
534void __rcu_read_lock(void)
535{
536 current->rcu_read_lock_nesting++;
537 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
538}
539EXPORT_SYMBOL_GPL(__rcu_read_lock);
540
541/*
542 * Handle special cases during rcu_read_unlock(), such as needing to 530 * Handle special cases during rcu_read_unlock(), such as needing to
543 * notify RCU core processing or task having blocked during the RCU 531 * notify RCU core processing or task having blocked during the RCU
544 * read-side critical section. 532 * read-side critical section.
545 */ 533 */
546static noinline void rcu_read_unlock_special(struct task_struct *t) 534void rcu_read_unlock_special(struct task_struct *t)
547{ 535{
548 int empty; 536 int empty;
549 int empty_exp; 537 int empty_exp;
@@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
627} 615}
628 616
629/* 617/*
630 * Tiny-preemptible RCU implementation for rcu_read_unlock().
631 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
632 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
633 * invoke rcu_read_unlock_special() to clean up after a context switch
634 * in an RCU read-side critical section and other special cases.
635 */
636void __rcu_read_unlock(void)
637{
638 struct task_struct *t = current;
639
640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
641 if (t->rcu_read_lock_nesting != 1)
642 --t->rcu_read_lock_nesting;
643 else {
644 t->rcu_read_lock_nesting = INT_MIN;
645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
651#ifdef CONFIG_PROVE_LOCKING
652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
657#endif /* #ifdef CONFIG_PROVE_LOCKING */
658}
659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
660
661/*
662 * Check for a quiescent state from the current CPU. When a task blocks, 618 * Check for a quiescent state from the current CPU. When a task blocks,
663 * the task is recorded in the rcu_preempt_ctrlblk structure, which is 619 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
664 * checked elsewhere. This is called from the scheduling-clock interrupt. 620 * checked elsewhere. This is called from the scheduling-clock interrupt.
@@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void)
823 rpcp->exp_tasks = NULL; 779 rpcp->exp_tasks = NULL;
824 780
825 /* Wait for tail of ->blkd_tasks list to drain. */ 781 /* Wait for tail of ->blkd_tasks list to drain. */
826 if (!rcu_preempted_readers_exp()) 782 if (!rcu_preempted_readers_exp()) {
827 local_irq_restore(flags); 783 local_irq_restore(flags);
828 else { 784 } else {
829 rcu_initiate_boost(); 785 rcu_initiate_boost();
830 local_irq_restore(flags); 786 local_irq_restore(flags);
831 wait_event(sync_rcu_preempt_exp_wq, 787 wait_event(sync_rcu_preempt_exp_wq,
@@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
846 */ 802 */
847int rcu_preempt_needs_cpu(void) 803int rcu_preempt_needs_cpu(void)
848{ 804{
849 if (!rcu_preempt_running_reader())
850 rcu_preempt_cpu_qs();
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 805 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 806}
853 807
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e66b34ab7555..25b15033c61f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,8 +49,7 @@
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
53 "Josh Triplett <josh@freedesktop.org>");
54 53
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 55static int nfakewriters = 4; /* # fake writer threads */
@@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */
206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 205DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
207 /* and boost task create/destroy. */ 206 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ 207static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
208static bool barrier_phase; /* Test phase. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ 209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ 210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); 211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
@@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p)
407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
408 rp->rtort_mbtest = 0; 408 rp->rtort_mbtest = 0;
409 rcu_torture_free(rp); 409 rcu_torture_free(rp);
410 } else 410 } else {
411 cur_ops->deferred_free(rp); 411 cur_ops->deferred_free(rp);
412 }
412} 413}
413 414
414static int rcu_no_completed(void) 415static int rcu_no_completed(void)
@@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void)
635 synchronize_srcu(&srcu_ctl); 636 synchronize_srcu(&srcu_ctl);
636} 637}
637 638
639static void srcu_torture_call(struct rcu_head *head,
640 void (*func)(struct rcu_head *head))
641{
642 call_srcu(&srcu_ctl, head, func);
643}
644
645static void srcu_torture_barrier(void)
646{
647 srcu_barrier(&srcu_ctl);
648}
649
638static int srcu_torture_stats(char *page) 650static int srcu_torture_stats(char *page)
639{ 651{
640 int cnt = 0; 652 int cnt = 0;
@@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = {
661 .completed = srcu_torture_completed, 673 .completed = srcu_torture_completed,
662 .deferred_free = srcu_torture_deferred_free, 674 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 675 .sync = srcu_torture_synchronize,
664 .call = NULL, 676 .call = srcu_torture_call,
665 .cb_barrier = NULL, 677 .cb_barrier = srcu_torture_barrier,
666 .stats = srcu_torture_stats, 678 .stats = srcu_torture_stats,
667 .name = "srcu" 679 .name = "srcu"
668}; 680};
@@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg)
1013 do { 1025 do {
1014 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 1026 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
1015 udelay(rcu_random(&rand) & 0x3ff); 1027 udelay(rcu_random(&rand) & 0x3ff);
1016 cur_ops->sync(); 1028 if (cur_ops->cb_barrier != NULL &&
1029 rcu_random(&rand) % (nfakewriters * 8) == 0)
1030 cur_ops->cb_barrier();
1031 else
1032 cur_ops->sync();
1017 rcu_stutter_wait("rcu_torture_fakewriter"); 1033 rcu_stutter_wait("rcu_torture_fakewriter");
1018 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1034 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1019 1035
@@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page)
1183 } 1199 }
1184 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1200 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1185 cnt += sprintf(&page[cnt], 1201 cnt += sprintf(&page[cnt],
1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1202 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1188 "rtbf: %ld rtb: %ld nt: %ld "
1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1191 rcu_torture_current, 1203 rcu_torture_current,
1192 rcu_torture_current_version, 1204 rcu_torture_current_version,
1193 list_empty(&rcu_torture_freelist), 1205 list_empty(&rcu_torture_freelist),
1194 atomic_read(&n_rcu_torture_alloc), 1206 atomic_read(&n_rcu_torture_alloc),
1195 atomic_read(&n_rcu_torture_alloc_fail), 1207 atomic_read(&n_rcu_torture_alloc_fail),
1196 atomic_read(&n_rcu_torture_free), 1208 atomic_read(&n_rcu_torture_free));
1209 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
1197 atomic_read(&n_rcu_torture_mberror), 1210 atomic_read(&n_rcu_torture_mberror),
1198 n_rcu_torture_boost_ktrerror, 1211 n_rcu_torture_boost_ktrerror,
1199 n_rcu_torture_boost_rterror, 1212 n_rcu_torture_boost_rterror);
1213 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
1200 n_rcu_torture_boost_failure, 1214 n_rcu_torture_boost_failure,
1201 n_rcu_torture_boosts, 1215 n_rcu_torture_boosts,
1202 n_rcu_torture_timers, 1216 n_rcu_torture_timers);
1217 cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ",
1203 n_online_successes, 1218 n_online_successes,
1204 n_online_attempts, 1219 n_online_attempts,
1205 n_offline_successes, 1220 n_offline_successes,
1206 n_offline_attempts, 1221 n_offline_attempts);
1222 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1207 n_barrier_successes, 1223 n_barrier_successes,
1208 n_barrier_attempts, 1224 n_barrier_attempts,
1209 n_rcu_torture_barrier_error); 1225 n_rcu_torture_barrier_error);
@@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg)
1445 delta = shutdown_time - jiffies_snap; 1461 delta = shutdown_time - jiffies_snap;
1446 if (verbose) 1462 if (verbose)
1447 printk(KERN_ALERT "%s" TORTURE_FLAG 1463 printk(KERN_ALERT "%s" TORTURE_FLAG
1448 "rcu_torture_shutdown task: %lu " 1464 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1449 "jiffies remaining\n",
1450 torture_type, delta); 1465 torture_type, delta);
1451 schedule_timeout_interruptible(delta); 1466 schedule_timeout_interruptible(delta);
1452 jiffies_snap = ACCESS_ONCE(jiffies); 1467 jiffies_snap = ACCESS_ONCE(jiffies);
@@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg)
1498 if (cpu_down(cpu) == 0) { 1513 if (cpu_down(cpu) == 0) {
1499 if (verbose) 1514 if (verbose)
1500 printk(KERN_ALERT "%s" TORTURE_FLAG 1515 printk(KERN_ALERT "%s" TORTURE_FLAG
1501 "rcu_torture_onoff task: " 1516 "rcu_torture_onoff task: offlined %d\n",
1502 "offlined %d\n",
1503 torture_type, cpu); 1517 torture_type, cpu);
1504 n_offline_successes++; 1518 n_offline_successes++;
1505 } 1519 }
@@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg)
1512 if (cpu_up(cpu) == 0) { 1526 if (cpu_up(cpu) == 0) {
1513 if (verbose) 1527 if (verbose)
1514 printk(KERN_ALERT "%s" TORTURE_FLAG 1528 printk(KERN_ALERT "%s" TORTURE_FLAG
1515 "rcu_torture_onoff task: " 1529 "rcu_torture_onoff task: onlined %d\n",
1516 "onlined %d\n",
1517 torture_type, cpu); 1530 torture_type, cpu);
1518 n_online_successes++; 1531 n_online_successes++;
1519 } 1532 }
@@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1631static int rcu_torture_barrier_cbs(void *arg) 1644static int rcu_torture_barrier_cbs(void *arg)
1632{ 1645{
1633 long myid = (long)arg; 1646 long myid = (long)arg;
1647 bool lastphase = 0;
1634 struct rcu_head rcu; 1648 struct rcu_head rcu;
1635 1649
1636 init_rcu_head_on_stack(&rcu); 1650 init_rcu_head_on_stack(&rcu);
@@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg)
1638 set_user_nice(current, 19); 1652 set_user_nice(current, 19);
1639 do { 1653 do {
1640 wait_event(barrier_cbs_wq[myid], 1654 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs || 1655 barrier_phase != lastphase ||
1642 kthread_should_stop() || 1656 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP); 1657 fullstop != FULLSTOP_DONTSTOP);
1658 lastphase = barrier_phase;
1659 smp_mb(); /* ensure barrier_phase load before ->call(). */
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1660 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break; 1661 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1662 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
@@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg)
1665 do { 1681 do {
1666 atomic_set(&barrier_cbs_invoked, 0); 1682 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs); 1683 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */ 1684 smp_mb(); /* Ensure barrier_phase after prior assignments. */
1685 barrier_phase = !barrier_phase;
1669 for (i = 0; i < n_barrier_cbs; i++) 1686 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]); 1687 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq, 1688 wait_event(barrier_wq,
@@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg)
1684 schedule_timeout_interruptible(HZ / 10); 1701 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1702 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1703 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); 1704 rcutorture_shutdown_absorb("rcu_torture_barrier");
1688 while (!kthread_should_stop()) 1705 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1); 1706 schedule_timeout_interruptible(1);
1690 return 0; 1707 return 0;
@@ -1908,8 +1925,8 @@ rcu_torture_init(void)
1908 static struct rcu_torture_ops *torture_ops[] = 1925 static struct rcu_torture_ops *torture_ops[] =
1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1926 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1927 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, 1928 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops, 1929 &srcu_raw_ops, &srcu_raw_sync_ops,
1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1930 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1914 1931
1915 mutex_lock(&fullstop_mutex); 1932 mutex_lock(&fullstop_mutex);
@@ -1931,8 +1948,7 @@ rcu_torture_init(void)
1931 return -EINVAL; 1948 return -EINVAL;
1932 } 1949 }
1933 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1950 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1934 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " 1951 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1935 "fqs_duration, fqs disabled.\n");
1936 fqs_duration = 0; 1952 fqs_duration = 0;
1937 } 1953 }
1938 if (cur_ops->init) 1954 if (cur_ops->init)
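
The rcutorture hunks above replace the wakeup condition based on barrier_cbs_count with a barrier_phase flag that the coordinator flips once per round, so every callback thread posts exactly one callback per round. A rough pthread sketch of that phase-flip handshake; the mutex provides the ordering that the kernel gets from its explicit smp_mb() calls:

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool barrier_phase;                 /* flipped once per test round */
static int  cbs_posted;

static void *cb_thread(void *arg)
{
        bool lastphase = false;
        int rounds;

        for (rounds = 0; rounds < 3; rounds++) {
                pthread_mutex_lock(&lock);
                while (barrier_phase == lastphase)   /* wait for a new phase */
                        pthread_cond_wait(&cond, &lock);
                lastphase = barrier_phase;
                cbs_posted++;                        /* "post one callback" */
                pthread_cond_broadcast(&cond);
                pthread_mutex_unlock(&lock);
        }
        return arg;
}

int main(void)
{
        pthread_t tid;
        int round;

        pthread_create(&tid, NULL, cb_thread, NULL);
        for (round = 0; round < 3; round++) {
                pthread_mutex_lock(&lock);
                barrier_phase = !barrier_phase;      /* start a new round */
                pthread_cond_broadcast(&cond);
                while (cbs_posted < round + 1)       /* wait for the round to finish */
                        pthread_cond_wait(&cond, &lock);
                pthread_mutex_unlock(&lock);
                printf("round %d complete\n", round);
        }
        pthread_join(tid, NULL);
        return 0;
}
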
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4b97bba7396e..f280e542e3e9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
60 60
61/* Data structures. */ 61/* Data structures. */
62 62
63static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 63static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
64 64
65#define RCU_STATE_INITIALIZER(structname) { \ 65#define RCU_STATE_INITIALIZER(sname, cr) { \
66 .level = { &structname##_state.node[0] }, \ 66 .level = { &sname##_state.node[0] }, \
67 .levelcnt = { \ 67 .call = cr, \
68 NUM_RCU_LVL_0, /* root of hierarchy. */ \
69 NUM_RCU_LVL_1, \
70 NUM_RCU_LVL_2, \
71 NUM_RCU_LVL_3, \
72 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
73 }, \
74 .fqs_state = RCU_GP_IDLE, \ 68 .fqs_state = RCU_GP_IDLE, \
75 .gpnum = -300, \ 69 .gpnum = -300, \
76 .completed = -300, \ 70 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 71 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \ 72 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \ 73 .orphan_donetail = &sname##_state.orphan_donelist, \
80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 74 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
81 .n_force_qs = 0, \ 75 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
82 .n_force_qs_ngp = 0, \ 76 .name = #sname, \
83 .name = #structname, \
84} 77}
85 78
86struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); 79struct rcu_state rcu_sched_state =
80 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
88 82
89struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); 83struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
90DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
91 85
92static struct rcu_state *rcu_state; 86static struct rcu_state *rcu_state;
87LIST_HEAD(rcu_struct_flavors);
88
89/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
90static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
91module_param(rcu_fanout_leaf, int, 0);
92int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
93static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
94 NUM_RCU_LVL_0,
95 NUM_RCU_LVL_1,
96 NUM_RCU_LVL_2,
97 NUM_RCU_LVL_3,
98 NUM_RCU_LVL_4,
99};
100int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
93 101
94/* 102/*
95 * The rcu_scheduler_active variable transitions from zero to one just 103 * The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
147unsigned long rcutorture_testseq; 155unsigned long rcutorture_testseq;
148unsigned long rcutorture_vernum; 156unsigned long rcutorture_vernum;
149 157
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
157/* 158/*
158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 159 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
159 * permit this function to be invoked without holding the root rcu_node 160 * permit this function to be invoked without holding the root rcu_node
@@ -358,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
358 struct task_struct *idle = idle_task(smp_processor_id()); 359 struct task_struct *idle = idle_task(smp_processor_id());
359 360
360 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 361 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
361 ftrace_dump(DUMP_ALL); 362 ftrace_dump(DUMP_ORIG);
362 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 363 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
363 current->pid, current->comm, 364 current->pid, current->comm,
364 idle->pid, idle->comm); /* must be idle task! */ 365 idle->pid, idle->comm); /* must be idle task! */
@@ -468,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
468 469
469 trace_rcu_dyntick("Error on exit: not idle task", 470 trace_rcu_dyntick("Error on exit: not idle task",
470 oldval, rdtp->dynticks_nesting); 471 oldval, rdtp->dynticks_nesting);
471 ftrace_dump(DUMP_ALL); 472 ftrace_dump(DUMP_ORIG);
472 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 473 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
473 current->pid, current->comm, 474 current->pid, current->comm,
474 idle->pid, idle->comm); /* must be idle task! */ 475 idle->pid, idle->comm); /* must be idle task! */
@@ -585,8 +586,6 @@ void rcu_nmi_exit(void)
585 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 586 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
586} 587}
587 588
588#ifdef CONFIG_PROVE_RCU
589
590/** 589/**
591 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 590 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
592 * 591 *
@@ -604,7 +603,7 @@ int rcu_is_cpu_idle(void)
604} 603}
605EXPORT_SYMBOL(rcu_is_cpu_idle); 604EXPORT_SYMBOL(rcu_is_cpu_idle);
606 605
607#ifdef CONFIG_HOTPLUG_CPU 606#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
608 607
609/* 608/*
610 * Is the current CPU online? Disable preemption to avoid false positives 609 * Is the current CPU online? Disable preemption to avoid false positives
@@ -645,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
645} 644}
646EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); 645EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
647 646
648#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 647#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
649
650#endif /* #ifdef CONFIG_PROVE_RCU */
651 648
652/** 649/**
653 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 650 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -733,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
733 int cpu; 730 int cpu;
734 long delta; 731 long delta;
735 unsigned long flags; 732 unsigned long flags;
736 int ndetected; 733 int ndetected = 0;
737 struct rcu_node *rnp = rcu_get_root(rsp); 734 struct rcu_node *rnp = rcu_get_root(rsp);
738 735
739 /* Only let one CPU complain about others per time interval. */ 736 /* Only let one CPU complain about others per time interval. */
@@ -774,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
774 */ 771 */
775 rnp = rcu_get_root(rsp); 772 rnp = rcu_get_root(rsp);
776 raw_spin_lock_irqsave(&rnp->lock, flags); 773 raw_spin_lock_irqsave(&rnp->lock, flags);
777 ndetected = rcu_print_task_stall(rnp); 774 ndetected += rcu_print_task_stall(rnp);
778 raw_spin_unlock_irqrestore(&rnp->lock, flags); 775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
779 776
780 print_cpu_stall_info_end(); 777 print_cpu_stall_info_end();
@@ -860,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
860 */ 857 */
861void rcu_cpu_stall_reset(void) 858void rcu_cpu_stall_reset(void)
862{ 859{
863 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; 860 struct rcu_state *rsp;
864 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; 861
865 rcu_preempt_stall_reset(); 862 for_each_rcu_flavor(rsp)
863 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
866} 864}
867 865
868static struct notifier_block rcu_panic_block = { 866static struct notifier_block rcu_panic_block = {
@@ -894,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
894 if (rnp->qsmask & rdp->grpmask) { 892 if (rnp->qsmask & rdp->grpmask) {
895 rdp->qs_pending = 1; 893 rdp->qs_pending = 1;
896 rdp->passed_quiesce = 0; 894 rdp->passed_quiesce = 0;
897 } else 895 } else {
898 rdp->qs_pending = 0; 896 rdp->qs_pending = 0;
897 }
899 zero_cpu_stall_ticks(rdp); 898 zero_cpu_stall_ticks(rdp);
900 } 899 }
901} 900}
@@ -937,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
937} 936}
938 937
939/* 938/*
939 * Initialize the specified rcu_data structure's callback list to empty.
940 */
941static void init_callback_list(struct rcu_data *rdp)
942{
943 int i;
944
945 rdp->nxtlist = NULL;
946 for (i = 0; i < RCU_NEXT_SIZE; i++)
947 rdp->nxttail[i] = &rdp->nxtlist;
948}
949
950/*
940 * Advance this CPU's callbacks, but only if the current grace period 951 * Advance this CPU's callbacks, but only if the current grace period
941 * has ended. This may be called only from the CPU to whom the rdp 952 * has ended. This may be called only from the CPU to whom the rdp
942 * belongs. In addition, the corresponding leaf rcu_node structure's 953 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1328,8 +1339,6 @@ static void
1328rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1339rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1329 struct rcu_node *rnp, struct rcu_data *rdp) 1340 struct rcu_node *rnp, struct rcu_data *rdp)
1330{ 1341{
1331 int i;
1332
1333 /* 1342 /*
1334 * Orphan the callbacks. First adjust the counts. This is safe 1343 * Orphan the callbacks. First adjust the counts. This is safe
1335 * because ->onofflock excludes _rcu_barrier()'s adoption of 1344 * because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1340,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1340 rsp->qlen += rdp->qlen; 1349 rsp->qlen += rdp->qlen;
1341 rdp->n_cbs_orphaned += rdp->qlen; 1350 rdp->n_cbs_orphaned += rdp->qlen;
1342 rdp->qlen_lazy = 0; 1351 rdp->qlen_lazy = 0;
1343 rdp->qlen = 0; 1352 ACCESS_ONCE(rdp->qlen) = 0;
1344 } 1353 }
1345 1354
1346 /* 1355 /*
@@ -1369,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1369 } 1378 }
1370 1379
1371 /* Finally, initialize the rcu_data structure's list to empty. */ 1380 /* Finally, initialize the rcu_data structure's list to empty. */
1372 rdp->nxtlist = NULL; 1381 init_callback_list(rdp);
1373 for (i = 0; i < RCU_NEXT_SIZE; i++)
1374 rdp->nxttail[i] = &rdp->nxtlist;
1375} 1382}
1376 1383
1377/* 1384/*
@@ -1505,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1505 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1512 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1506 if (need_report & RCU_OFL_TASKS_EXP_GP) 1513 if (need_report & RCU_OFL_TASKS_EXP_GP)
1507 rcu_report_exp_rnp(rsp, rnp, true); 1514 rcu_report_exp_rnp(rsp, rnp, true);
1515 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1516 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1517 cpu, rdp->qlen, rdp->nxtlist);
1508} 1518}
1509 1519
1510#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1520#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1592,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1592 } 1602 }
1593 smp_mb(); /* List handling before counting for rcu_barrier(). */ 1603 smp_mb(); /* List handling before counting for rcu_barrier(). */
1594 rdp->qlen_lazy -= count_lazy; 1604 rdp->qlen_lazy -= count_lazy;
1595 rdp->qlen -= count; 1605 ACCESS_ONCE(rdp->qlen) -= count;
1596 rdp->n_cbs_invoked += count; 1606 rdp->n_cbs_invoked += count;
1597 1607
1598 /* Reinstate batch limit if we have worked down the excess. */ 1608 /* Reinstate batch limit if we have worked down the excess. */
@@ -1605,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1605 rdp->n_force_qs_snap = rsp->n_force_qs; 1615 rdp->n_force_qs_snap = rsp->n_force_qs;
1606 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 1616 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1607 rdp->qlen_last_fqs_check = rdp->qlen; 1617 rdp->qlen_last_fqs_check = rdp->qlen;
1618 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
1608 1619
1609 local_irq_restore(flags); 1620 local_irq_restore(flags);
1610 1621
@@ -1745,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1745 break; /* grace period idle or initializing, ignore. */ 1756 break; /* grace period idle or initializing, ignore. */
1746 1757
1747 case RCU_SAVE_DYNTICK: 1758 case RCU_SAVE_DYNTICK:
1748 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1749 break; /* So gcc recognizes the dead code. */
1750 1759
1751 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1752 1761
@@ -1788,9 +1797,10 @@ unlock_fqs_ret:
1788 * whom the rdp belongs. 1797 * whom the rdp belongs.
1789 */ 1798 */
1790static void 1799static void
1791__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1800__rcu_process_callbacks(struct rcu_state *rsp)
1792{ 1801{
1793 unsigned long flags; 1802 unsigned long flags;
1803 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1794 1804
1795 WARN_ON_ONCE(rdp->beenonline == 0); 1805 WARN_ON_ONCE(rdp->beenonline == 0);
1796 1806
@@ -1826,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1826 */ 1836 */
1827static void rcu_process_callbacks(struct softirq_action *unused) 1837static void rcu_process_callbacks(struct softirq_action *unused)
1828{ 1838{
1839 struct rcu_state *rsp;
1840
1829 trace_rcu_utilization("Start RCU core"); 1841 trace_rcu_utilization("Start RCU core");
1830 __rcu_process_callbacks(&rcu_sched_state, 1842 for_each_rcu_flavor(rsp)
1831 &__get_cpu_var(rcu_sched_data)); 1843 __rcu_process_callbacks(rsp);
1832 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1833 rcu_preempt_process_callbacks();
1834 trace_rcu_utilization("End RCU core"); 1844 trace_rcu_utilization("End RCU core");
1835} 1845}
1836 1846
@@ -1857,6 +1867,56 @@ static void invoke_rcu_core(void)
1857 raise_softirq(RCU_SOFTIRQ); 1867 raise_softirq(RCU_SOFTIRQ);
1858} 1868}
1859 1869
1870/*
1871 * Handle any core-RCU processing required by a call_rcu() invocation.
1872 */
1873static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
1874 struct rcu_head *head, unsigned long flags)
1875{
1876 /*
1877 * If called from an extended quiescent state, invoke the RCU
1878 * core in order to force a re-evaluation of RCU's idleness.
1879 */
1880 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
1881 invoke_rcu_core();
1882
1883 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
1884 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
1885 return;
1886
1887 /*
1888 * Force the grace period if too many callbacks or too long waiting.
1889 * Enforce hysteresis, and don't invoke force_quiescent_state()
1890 * if some other CPU has recently done so. Also, don't bother
1891 * invoking force_quiescent_state() if the newly enqueued callback
1892 * is the only one waiting for a grace period to complete.
1893 */
1894 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1895
1896 /* Are we ignoring a completed grace period? */
1897 rcu_process_gp_end(rsp, rdp);
1898 check_for_new_grace_period(rsp, rdp);
1899
1900 /* Start a new grace period if one not already started. */
1901 if (!rcu_gp_in_progress(rsp)) {
1902 unsigned long nestflag;
1903 struct rcu_node *rnp_root = rcu_get_root(rsp);
1904
1905 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1906 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1907 } else {
1908 /* Give the grace period a kick. */
1909 rdp->blimit = LONG_MAX;
1910 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1911 *rdp->nxttail[RCU_DONE_TAIL] != head)
1912 force_quiescent_state(rsp, 0);
1913 rdp->n_force_qs_snap = rsp->n_force_qs;
1914 rdp->qlen_last_fqs_check = rdp->qlen;
1915 }
1916 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1917 force_quiescent_state(rsp, 1);
1918}
1919
1860static void 1920static void
1861__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1921__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1862 struct rcu_state *rsp, bool lazy) 1922 struct rcu_state *rsp, bool lazy)
@@ -1881,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1881 rdp = this_cpu_ptr(rsp->rda); 1941 rdp = this_cpu_ptr(rsp->rda);
1882 1942
1883 /* Add the callback to our list. */ 1943 /* Add the callback to our list. */
1884 rdp->qlen++; 1944 ACCESS_ONCE(rdp->qlen)++;
1885 if (lazy) 1945 if (lazy)
1886 rdp->qlen_lazy++; 1946 rdp->qlen_lazy++;
1887 else 1947 else
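
The ->qlen updates are wrapped in ACCESS_ONCE() because the counter is now sampled locklessly (for example by the barrier and tracing paths), and the wrapper stops the compiler from caching, refetching or otherwise merging the plain load/store. A stand-alone rendering of the classic definition and its use; it is shown outside the kernel headers purely for illustration and relies on the GCC-style __typeof__ extension:

#include <stdio.h>

/* classic definition: force every access to go through a volatile lvalue */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long qlen;   /* updated by one context, sampled by others */

static void enqueue_one(void)
{
        ACCESS_ONCE(qlen)++; /* compiler must emit a real load and store here */
}

int main(void)
{
        enqueue_one();
        enqueue_one();
        printf("qlen=%lu\n", ACCESS_ONCE(qlen));
        return 0;
}
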
@@ -1896,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1896 else 1956 else
1897 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 1957 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1898 1958
1899 /* If interrupts were disabled, don't dive into RCU core. */ 1959 /* Go handle any RCU core processing required. */
1900 if (irqs_disabled_flags(flags)) { 1960 __call_rcu_core(rsp, rdp, head, flags);
1901 local_irq_restore(flags);
1902 return;
1903 }
1904
1905 /*
1906 * Force the grace period if too many callbacks or too long waiting.
1907 * Enforce hysteresis, and don't invoke force_quiescent_state()
1908 * if some other CPU has recently done so. Also, don't bother
1909 * invoking force_quiescent_state() if the newly enqueued callback
1910 * is the only one waiting for a grace period to complete.
1911 */
1912 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1913
1914 /* Are we ignoring a completed grace period? */
1915 rcu_process_gp_end(rsp, rdp);
1916 check_for_new_grace_period(rsp, rdp);
1917
1918 /* Start a new grace period if one not already started. */
1919 if (!rcu_gp_in_progress(rsp)) {
1920 unsigned long nestflag;
1921 struct rcu_node *rnp_root = rcu_get_root(rsp);
1922
1923 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1924 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1925 } else {
1926 /* Give the grace period a kick. */
1927 rdp->blimit = LONG_MAX;
1928 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1929 *rdp->nxttail[RCU_DONE_TAIL] != head)
1930 force_quiescent_state(rsp, 0);
1931 rdp->n_force_qs_snap = rsp->n_force_qs;
1932 rdp->qlen_last_fqs_check = rdp->qlen;
1933 }
1934 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1935 force_quiescent_state(rsp, 1);
1936 local_irq_restore(flags); 1961 local_irq_restore(flags);
1937} 1962}
1938 1963
@@ -1962,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1962 * occasionally incorrectly indicate that there are multiple CPUs online 1987 * occasionally incorrectly indicate that there are multiple CPUs online
1963 * when there was in fact only one the whole time, as this just adds 1988 * when there was in fact only one the whole time, as this just adds
1964 * some overhead: RCU still operates correctly. 1989 * some overhead: RCU still operates correctly.
1965 *
1966 * Of course, sampling num_online_cpus() with preemption enabled can
1967 * give erroneous results if there are concurrent CPU-hotplug operations.
1968 * For example, given a demonic sequence of preemptions in num_online_cpus()
1969 * and CPU-hotplug operations, there could be two or more CPUs online at
1970 * all times, but num_online_cpus() might well return one (or even zero).
1971 *
1972 * However, all such demonic sequences require at least one CPU-offline
1973 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1974 * is only a problem if there is an RCU read-side critical section executing
1975 * throughout. But RCU-sched and RCU-bh read-side critical sections
1976 * disable either preemption or bh, which prevents a CPU from going offline.
1977 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1978 * that there is only one CPU when in fact there was more than one throughout
1979 * is when there were no RCU readers in the system. If there are no
1980 * RCU readers, the grace period by definition can be of zero length,
1981 * regardless of the number of online CPUs.
1982 */ 1990 */
1983static inline int rcu_blocking_is_gp(void) 1991static inline int rcu_blocking_is_gp(void)
1984{ 1992{
1993 int ret;
1994
1985 might_sleep(); /* Check for RCU read-side critical section. */ 1995 might_sleep(); /* Check for RCU read-side critical section. */
1986 return num_online_cpus() <= 1; 1996 preempt_disable();
1997 ret = num_online_cpus() <= 1;
1998 preempt_enable();
1999 return ret;
1987} 2000}
1988 2001
1989/** 2002/**
@@ -2118,9 +2131,9 @@ void synchronize_sched_expedited(void)
2118 put_online_cpus(); 2131 put_online_cpus();
2119 2132
2120 /* No joy, try again later. Or just synchronize_sched(). */ 2133 /* No joy, try again later. Or just synchronize_sched(). */
2121 if (trycount++ < 10) 2134 if (trycount++ < 10) {
2122 udelay(trycount * num_online_cpus()); 2135 udelay(trycount * num_online_cpus());
2123 else { 2136 } else {
2124 synchronize_sched(); 2137 synchronize_sched();
2125 return; 2138 return;
2126 } 2139 }
@@ -2241,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2241 */ 2254 */
2242static int rcu_pending(int cpu) 2255static int rcu_pending(int cpu)
2243{ 2256{
2244 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || 2257 struct rcu_state *rsp;
2245 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || 2258
2246 rcu_preempt_pending(cpu); 2259 for_each_rcu_flavor(rsp)
2260 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
2261 return 1;
2262 return 0;
2247} 2263}
2248 2264
2249/* 2265/*
@@ -2253,20 +2269,41 @@ static int rcu_pending(int cpu)
2253 */ 2269 */
2254static int rcu_cpu_has_callbacks(int cpu) 2270static int rcu_cpu_has_callbacks(int cpu)
2255{ 2271{
2272 struct rcu_state *rsp;
2273
2256 /* RCU callbacks either ready or pending? */ 2274 /* RCU callbacks either ready or pending? */
2257 return per_cpu(rcu_sched_data, cpu).nxtlist || 2275 for_each_rcu_flavor(rsp)
2258 per_cpu(rcu_bh_data, cpu).nxtlist || 2276 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
2259 rcu_preempt_cpu_has_callbacks(cpu); 2277 return 1;
2278 return 0;
2279}
2280
2281/*
2282 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2283 * the compiler is expected to optimize this away.
2284 */
2285static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
2286 int cpu, unsigned long done)
2287{
2288 trace_rcu_barrier(rsp->name, s, cpu,
2289 atomic_read(&rsp->barrier_cpu_count), done);
2260} 2290}
2261 2291
2262/* 2292/*
2263 * RCU callback function for _rcu_barrier(). If we are last, wake 2293 * RCU callback function for _rcu_barrier(). If we are last, wake
2264 * up the task executing _rcu_barrier(). 2294 * up the task executing _rcu_barrier().
2265 */ 2295 */
2266static void rcu_barrier_callback(struct rcu_head *notused) 2296static void rcu_barrier_callback(struct rcu_head *rhp)
2267{ 2297{
2268 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2298 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
2269 complete(&rcu_barrier_completion); 2299 struct rcu_state *rsp = rdp->rsp;
2300
2301 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
2302 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
2303 complete(&rsp->barrier_completion);
2304 } else {
2305 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
2306 }
2270} 2307}
2271 2308
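Note on the container_of() use above: the reworked rcu_barrier_callback() receives only the rcu_head and recovers the enclosing per-CPU rcu_data (and from it the rcu_state) because the head is now embedded in rcu_data as ->barrier_head. A minimal standalone sketch of that idiom follows; the struct names are invented stand-ins, not the kernel's rcu_data/rcu_head, and the macro is the usual simplified form of the kernel's container_of().

/* Standalone sketch of the container_of() idiom; fake_data/fake_head are
 * invented stand-ins for rcu_data and rcu_head. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_head { void (*func)(struct fake_head *); };

struct fake_data {
	int cpu;
	const char *flavor;
	struct fake_head barrier_head;	/* embedded, like rcu_data.barrier_head */
};

/* The callback sees only a pointer to the embedded member... */
static void barrier_cb(struct fake_head *hp)
{
	/* ...and walks back to the enclosing structure. */
	struct fake_data *dp = container_of(hp, struct fake_data, barrier_head);

	printf("callback for cpu %d (%s)\n", dp->cpu, dp->flavor);
}

int main(void)
{
	struct fake_data d = { .cpu = 3, .flavor = "rcu_sched" };

	d.barrier_head.func = barrier_cb;
	d.barrier_head.func(&d.barrier_head);
	return 0;
}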
2272/* 2309/*
@@ -2274,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
2274 */ 2311 */
2275static void rcu_barrier_func(void *type) 2312static void rcu_barrier_func(void *type)
2276{ 2313{
2277 int cpu = smp_processor_id(); 2314 struct rcu_state *rsp = type;
2278 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); 2315 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2279 void (*call_rcu_func)(struct rcu_head *head,
2280 void (*func)(struct rcu_head *head));
2281 2316
2282 atomic_inc(&rcu_barrier_cpu_count); 2317 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2283 call_rcu_func = type; 2318 atomic_inc(&rsp->barrier_cpu_count);
2284 call_rcu_func(head, rcu_barrier_callback); 2319 rsp->call(&rdp->barrier_head, rcu_barrier_callback);
2285} 2320}
2286 2321
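Note on rsp->call above: each rcu_state now records its flavor's call_rcu() variant, so rcu_barrier_func() and _rcu_barrier() no longer need a function-pointer argument threaded through them. A minimal standalone sketch of that design, with invented names (struct flavor, fake_call_rcu, fake_barrier), is shown here.

/* Standalone sketch: a per-flavor function pointer replaces passing
 * call_rcu()/call_rcu_bh()/call_rcu_sched() into the barrier code.
 * All names are invented for illustration. */
#include <stdio.h>

struct head { void (*func)(struct head *); };

struct flavor {
	const char *name;
	void (*call)(struct head *h, void (*func)(struct head *));
};

static void fake_call_rcu(struct head *h, void (*func)(struct head *))
{
	printf("  callback posted\n");
	func(h);			/* run it immediately for the demo */
}

static void barrier_cb(struct head *h)
{
	(void)h;
	printf("  barrier callback ran\n");
}

/* The barrier code needs only the flavor; it finds ->call by itself. */
static void fake_barrier(struct flavor *f)
{
	struct head h;

	printf("%s barrier:\n", f->name);
	f->call(&h, barrier_cb);
}

int main(void)
{
	struct flavor preempt = { "rcu_preempt", fake_call_rcu };

	fake_barrier(&preempt);
	return 0;
}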
2287/* 2322/*
2288 * Orchestrate the specified type of RCU barrier, waiting for all 2323 * Orchestrate the specified type of RCU barrier, waiting for all
2289 * RCU callbacks of the specified type to complete. 2324 * RCU callbacks of the specified type to complete.
2290 */ 2325 */
2291static void _rcu_barrier(struct rcu_state *rsp, 2326static void _rcu_barrier(struct rcu_state *rsp)
2292 void (*call_rcu_func)(struct rcu_head *head,
2293 void (*func)(struct rcu_head *head)))
2294{ 2327{
2295 int cpu; 2328 int cpu;
2296 unsigned long flags; 2329 unsigned long flags;
2297 struct rcu_data *rdp; 2330 struct rcu_data *rdp;
2298 struct rcu_head rh; 2331 struct rcu_data rd;
2332 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2333 unsigned long snap_done;
2299 2334
2300 init_rcu_head_on_stack(&rh); 2335 init_rcu_head_on_stack(&rd.barrier_head);
2336 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2301 2337
2302 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2338 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2303 mutex_lock(&rcu_barrier_mutex); 2339 mutex_lock(&rsp->barrier_mutex);
2340
2341 /*
2342 * Ensure that all prior references, including to ->n_barrier_done,
2343 * are ordered before the _rcu_barrier() machinery.
2344 */
2345 smp_mb(); /* See above block comment. */
2346
2347 /*
2348 * Recheck ->n_barrier_done to see if others did our work for us.
2349 * This means checking ->n_barrier_done for an even-to-odd-to-even
2350 * transition. The "if" expression below therefore rounds the old
2351 * value up to the next even number and adds two before comparing.
2352 */
2353 snap_done = ACCESS_ONCE(rsp->n_barrier_done);
2354 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2355 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
2356 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2357 smp_mb(); /* caller's subsequent code after above check. */
2358 mutex_unlock(&rsp->barrier_mutex);
2359 return;
2360 }
2304 2361
2305 smp_mb(); /* Prevent any prior operations from leaking in. */ 2362 /*
2363 * Increment ->n_barrier_done to avoid duplicate work. Use
2364 * ACCESS_ONCE() to prevent the compiler from speculating
2365 * the increment to precede the early-exit check.
2366 */
2367 ACCESS_ONCE(rsp->n_barrier_done)++;
2368 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
2369 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
2370 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
2306 2371
2307 /* 2372 /*
2308 * Initialize the count to one rather than to zero in order to 2373 * Initialize the count to one rather than to zero in order to
@@ -2321,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
2321 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening 2386 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2322 * us -- but before CPU 1's orphaned callbacks are invoked!!! 2387 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2323 */ 2388 */
2324 init_completion(&rcu_barrier_completion); 2389 init_completion(&rsp->barrier_completion);
2325 atomic_set(&rcu_barrier_cpu_count, 1); 2390 atomic_set(&rsp->barrier_cpu_count, 1);
2326 raw_spin_lock_irqsave(&rsp->onofflock, flags); 2391 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2327 rsp->rcu_barrier_in_progress = current; 2392 rsp->rcu_barrier_in_progress = current;
2328 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2393 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2338,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
2338 preempt_disable(); 2403 preempt_disable();
2339 rdp = per_cpu_ptr(rsp->rda, cpu); 2404 rdp = per_cpu_ptr(rsp->rda, cpu);
2340 if (cpu_is_offline(cpu)) { 2405 if (cpu_is_offline(cpu)) {
2406 _rcu_barrier_trace(rsp, "Offline", cpu,
2407 rsp->n_barrier_done);
2341 preempt_enable(); 2408 preempt_enable();
2342 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) 2409 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2343 schedule_timeout_interruptible(1); 2410 schedule_timeout_interruptible(1);
2344 } else if (ACCESS_ONCE(rdp->qlen)) { 2411 } else if (ACCESS_ONCE(rdp->qlen)) {
2345 smp_call_function_single(cpu, rcu_barrier_func, 2412 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2346 (void *)call_rcu_func, 1); 2413 rsp->n_barrier_done);
2414 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2347 preempt_enable(); 2415 preempt_enable();
2348 } else { 2416 } else {
2417 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2418 rsp->n_barrier_done);
2349 preempt_enable(); 2419 preempt_enable();
2350 } 2420 }
2351 } 2421 }
@@ -2362,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
2362 rcu_adopt_orphan_cbs(rsp); 2432 rcu_adopt_orphan_cbs(rsp);
2363 rsp->rcu_barrier_in_progress = NULL; 2433 rsp->rcu_barrier_in_progress = NULL;
2364 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2434 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2365 atomic_inc(&rcu_barrier_cpu_count); 2435 atomic_inc(&rsp->barrier_cpu_count);
2366 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ 2436 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2367 call_rcu_func(&rh, rcu_barrier_callback); 2437 rd.rsp = rsp;
2438 rsp->call(&rd.barrier_head, rcu_barrier_callback);
2368 2439
2369 /* 2440 /*
2370 * Now that we have an rcu_barrier_callback() callback on each 2441 * Now that we have an rcu_barrier_callback() callback on each
2371 * CPU, and thus each counted, remove the initial count. 2442 * CPU, and thus each counted, remove the initial count.
2372 */ 2443 */
2373 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2444 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
2374 complete(&rcu_barrier_completion); 2445 complete(&rsp->barrier_completion);
2446
2447 /* Increment ->n_barrier_done to prevent duplicate work. */
2448 smp_mb(); /* Keep increment after above mechanism. */
2449 ACCESS_ONCE(rsp->n_barrier_done)++;
2450 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
2451 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
2452 smp_mb(); /* Keep increment before caller's subsequent code. */
2375 2453
2376 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 2454 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2377 wait_for_completion(&rcu_barrier_completion); 2455 wait_for_completion(&rsp->barrier_completion);
2378 2456
2379 /* Other rcu_barrier() invocations can now safely proceed. */ 2457 /* Other rcu_barrier() invocations can now safely proceed. */
2380 mutex_unlock(&rcu_barrier_mutex); 2458 mutex_unlock(&rsp->barrier_mutex);
2381 2459
2382 destroy_rcu_head_on_stack(&rh); 2460 destroy_rcu_head_on_stack(&rd.barrier_head);
2383} 2461}
2384 2462
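Note on ->n_barrier_done: it is bumped once when a barrier starts (odd) and once when it finishes (even), so a caller that snapshots it before taking the mutex can detect that somebody else already ran a complete barrier on its behalf. A standalone sketch of the early-exit arithmetic follows, using the ULONG_CMP_GE() definition from the kernel's rcupdate.h; the sample counter values are invented.

/* Standalone sketch of the even/odd ->n_barrier_done early-exit check. */
#include <stdio.h>
#include <limits.h>

/* Wrap-safe "a >= b", as in the kernel's rcupdate.h. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

/* Has a complete barrier (odd start, even finish) happened since "snap"? */
static int barrier_done_since(unsigned long snap, unsigned long now)
{
	/* Round snap up to even, then require one full start+finish (+2). */
	return ULONG_CMP_GE(now, ((snap + 1) & ~0x1UL) + 2);
}

int main(void)
{
	/* Snapshot taken while idle (even): one later barrier (5 then 6) suffices. */
	printf("%d\n", barrier_done_since(4, 6));	/* prints 1 */
	/* Snapshot taken mid-barrier (odd): that barrier's finish is not enough... */
	printf("%d\n", barrier_done_since(5, 6));	/* prints 0 */
	/* ...a full subsequent barrier (7 then 8) is required. */
	printf("%d\n", barrier_done_since(5, 8));	/* prints 1 */
	return 0;
}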
2385/** 2463/**
@@ -2387,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
2387 */ 2465 */
2388void rcu_barrier_bh(void) 2466void rcu_barrier_bh(void)
2389{ 2467{
2390 _rcu_barrier(&rcu_bh_state, call_rcu_bh); 2468 _rcu_barrier(&rcu_bh_state);
2391} 2469}
2392EXPORT_SYMBOL_GPL(rcu_barrier_bh); 2470EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2393 2471
@@ -2396,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2396 */ 2474 */
2397void rcu_barrier_sched(void) 2475void rcu_barrier_sched(void)
2398{ 2476{
2399 _rcu_barrier(&rcu_sched_state, call_rcu_sched); 2477 _rcu_barrier(&rcu_sched_state);
2400} 2478}
2401EXPORT_SYMBOL_GPL(rcu_barrier_sched); 2479EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2402 2480
@@ -2407,18 +2485,15 @@ static void __init
2407rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 2485rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2408{ 2486{
2409 unsigned long flags; 2487 unsigned long flags;
2410 int i;
2411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2488 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2412 struct rcu_node *rnp = rcu_get_root(rsp); 2489 struct rcu_node *rnp = rcu_get_root(rsp);
2413 2490
2414 /* Set up local state, ensuring consistent view of global state. */ 2491 /* Set up local state, ensuring consistent view of global state. */
2415 raw_spin_lock_irqsave(&rnp->lock, flags); 2492 raw_spin_lock_irqsave(&rnp->lock, flags);
2416 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 2493 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2417 rdp->nxtlist = NULL; 2494 init_callback_list(rdp);
2418 for (i = 0; i < RCU_NEXT_SIZE; i++)
2419 rdp->nxttail[i] = &rdp->nxtlist;
2420 rdp->qlen_lazy = 0; 2495 rdp->qlen_lazy = 0;
2421 rdp->qlen = 0; 2496 ACCESS_ONCE(rdp->qlen) = 0;
2422 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2497 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2423 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2498 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2424 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2499 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2492,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2492 2567
2493static void __cpuinit rcu_prepare_cpu(int cpu) 2568static void __cpuinit rcu_prepare_cpu(int cpu)
2494{ 2569{
2495 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 2570 struct rcu_state *rsp;
2496 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 2571
2497 rcu_preempt_init_percpu_data(cpu); 2572 for_each_rcu_flavor(rsp)
2573 rcu_init_percpu_data(cpu, rsp,
2574 strcmp(rsp->name, "rcu_preempt") == 0);
2498} 2575}
2499 2576
2500/* 2577/*
@@ -2506,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2506 long cpu = (long)hcpu; 2583 long cpu = (long)hcpu;
2507 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2584 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2508 struct rcu_node *rnp = rdp->mynode; 2585 struct rcu_node *rnp = rdp->mynode;
2586 struct rcu_state *rsp;
2509 2587
2510 trace_rcu_utilization("Start CPU hotplug"); 2588 trace_rcu_utilization("Start CPU hotplug");
2511 switch (action) { 2589 switch (action) {
@@ -2530,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2530 * touch any data without introducing corruption. We send the 2608 * touch any data without introducing corruption. We send the
2531 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2609 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2532 */ 2610 */
2533 rcu_cleanup_dying_cpu(&rcu_bh_state); 2611 for_each_rcu_flavor(rsp)
2534 rcu_cleanup_dying_cpu(&rcu_sched_state); 2612 rcu_cleanup_dying_cpu(rsp);
2535 rcu_preempt_cleanup_dying_cpu();
2536 rcu_cleanup_after_idle(cpu); 2613 rcu_cleanup_after_idle(cpu);
2537 break; 2614 break;
2538 case CPU_DEAD: 2615 case CPU_DEAD:
2539 case CPU_DEAD_FROZEN: 2616 case CPU_DEAD_FROZEN:
2540 case CPU_UP_CANCELED: 2617 case CPU_UP_CANCELED:
2541 case CPU_UP_CANCELED_FROZEN: 2618 case CPU_UP_CANCELED_FROZEN:
2542 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); 2619 for_each_rcu_flavor(rsp)
2543 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); 2620 rcu_cleanup_dead_cpu(cpu, rsp);
2544 rcu_preempt_cleanup_dead_cpu(cpu);
2545 break; 2621 break;
2546 default: 2622 default:
2547 break; 2623 break;
@@ -2574,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2574{ 2650{
2575 int i; 2651 int i;
2576 2652
2577 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2653 for (i = rcu_num_lvls - 1; i > 0; i--)
2578 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2654 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2579 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; 2655 rsp->levelspread[0] = rcu_fanout_leaf;
2580} 2656}
2581#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2657#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2582static void __init rcu_init_levelspread(struct rcu_state *rsp) 2658static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2586,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2586 int i; 2662 int i;
2587 2663
2588 cprv = NR_CPUS; 2664 cprv = NR_CPUS;
2589 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2665 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2590 ccur = rsp->levelcnt[i]; 2666 ccur = rsp->levelcnt[i];
2591 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2667 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
2592 cprv = ccur; 2668 cprv = ccur;
@@ -2613,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2613 2689
2614 /* Initialize the level-tracking arrays. */ 2690 /* Initialize the level-tracking arrays. */
2615 2691
2616 for (i = 1; i < NUM_RCU_LVLS; i++) 2692 for (i = 0; i < rcu_num_lvls; i++)
2693 rsp->levelcnt[i] = num_rcu_lvl[i];
2694 for (i = 1; i < rcu_num_lvls; i++)
2617 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 2695 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
2618 rcu_init_levelspread(rsp); 2696 rcu_init_levelspread(rsp);
2619 2697
2620 /* Initialize the elements themselves, starting from the leaves. */ 2698 /* Initialize the elements themselves, starting from the leaves. */
2621 2699
2622 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2700 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2623 cpustride *= rsp->levelspread[i]; 2701 cpustride *= rsp->levelspread[i];
2624 rnp = rsp->level[i]; 2702 rnp = rsp->level[i];
2625 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 2703 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2649,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2649 } 2727 }
2650 2728
2651 rsp->rda = rda; 2729 rsp->rda = rda;
2652 rnp = rsp->level[NUM_RCU_LVLS - 1]; 2730 rnp = rsp->level[rcu_num_lvls - 1];
2653 for_each_possible_cpu(i) { 2731 for_each_possible_cpu(i) {
2654 while (i > rnp->grphi) 2732 while (i > rnp->grphi)
2655 rnp++; 2733 rnp++;
2656 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 2734 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
2657 rcu_boot_init_percpu_data(i, rsp); 2735 rcu_boot_init_percpu_data(i, rsp);
2658 } 2736 }
2737 list_add(&rsp->flavors, &rcu_struct_flavors);
2738}
2739
2740/*
2741 * Compute the rcu_node tree geometry from kernel parameters. This cannot
2742 * replace the definitions in rcutree.h because those are needed to size
2743 * the ->node array in the rcu_state structure.
2744 */
2745static void __init rcu_init_geometry(void)
2746{
2747 int i;
2748 int j;
2749 int n = nr_cpu_ids;
2750 int rcu_capacity[MAX_RCU_LVLS + 1];
2751
2752 /* If the compile-time values are accurate, just leave. */
2753 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
2754 return;
2755
2756 /*
 2757 * Compute the number of nodes that can be handled by an rcu_node tree
2758 * with the given number of levels. Setting rcu_capacity[0] makes
2759 * some of the arithmetic easier.
2760 */
2761 rcu_capacity[0] = 1;
2762 rcu_capacity[1] = rcu_fanout_leaf;
2763 for (i = 2; i <= MAX_RCU_LVLS; i++)
2764 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
2765
2766 /*
2767 * The boot-time rcu_fanout_leaf parameter is only permitted
2768 * to increase the leaf-level fanout, not decrease it. Of course,
2769 * the leaf-level fanout cannot exceed the number of bits in
2770 * the rcu_node masks. Finally, the tree must be able to accommodate
2771 * the configured number of CPUs. Complain and fall back to the
2772 * compile-time values if these limits are exceeded.
2773 */
2774 if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
2775 rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
2776 n > rcu_capacity[MAX_RCU_LVLS]) {
2777 WARN_ON(1);
2778 return;
2779 }
2780
2781 /* Calculate the number of rcu_nodes at each level of the tree. */
2782 for (i = 1; i <= MAX_RCU_LVLS; i++)
2783 if (n <= rcu_capacity[i]) {
2784 for (j = 0; j <= i; j++)
2785 num_rcu_lvl[j] =
2786 DIV_ROUND_UP(n, rcu_capacity[i - j]);
2787 rcu_num_lvls = i;
2788 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
2789 num_rcu_lvl[j] = 0;
2790 break;
2791 }
2792
2793 /* Calculate the total number of rcu_node structures. */
2794 rcu_num_nodes = 0;
2795 for (i = 0; i <= MAX_RCU_LVLS; i++)
2796 rcu_num_nodes += num_rcu_lvl[i];
2797 rcu_num_nodes -= n;
2659} 2798}
2660 2799
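Note on rcu_init_geometry(): when the rcu_fanout_leaf boot parameter raises the leaf fanout above CONFIG_RCU_FANOUT_LEAF, the function recomputes how many rcu_node structures each level needs. The standalone sketch below mirrors that arithmetic; the MAX_RCU_LVLS value of 4 and the sample fanouts and CPU counts are assumptions made for illustration only.

/* Standalone sketch of the rcu_node geometry arithmetic. */
#include <stdio.h>

#define MAX_RCU_LVLS		4	/* assumed, as in rcutree.h */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static void geometry(int n, int fanout_leaf, int fanout)
{
	int capacity[MAX_RCU_LVLS + 1];
	int lvl[MAX_RCU_LVLS + 1] = { 0 };
	int levels = 0, nodes = 0, i, j;

	/* CPUs an i-level tree can cover: leaf fanout, times fanout per level. */
	capacity[0] = 1;
	capacity[1] = fanout_leaf;
	for (i = 2; i <= MAX_RCU_LVLS; i++)
		capacity[i] = capacity[i - 1] * fanout;

	/* Pick the shallowest tree covering n CPUs, then size each level. */
	for (i = 1; i <= MAX_RCU_LVLS; i++)
		if (n <= capacity[i]) {
			for (j = 0; j <= i; j++)
				lvl[j] = DIV_ROUND_UP(n, capacity[i - j]);
			levels = i;
			break;
		}

	for (i = 0; i <= MAX_RCU_LVLS; i++)
		nodes += lvl[i];
	nodes -= n;	/* the last "level" counts CPUs, not rcu_node structures */

	printf("n=%4d leaf=%2d fanout=%2d -> levels=%d rcu_nodes=%d\n",
	       n, fanout_leaf, fanout, levels, nodes);
}

int main(void)
{
	geometry(16, 16, 64);	/* one leaf node doubling as the root */
	geometry(4096, 16, 64);	/* three levels, 261 rcu_node structures */
	geometry(4096, 64, 64);	/* boosted leaf fanout: two levels, 65 nodes */
	return 0;
}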
2661void __init rcu_init(void) 2800void __init rcu_init(void)
@@ -2663,6 +2802,7 @@ void __init rcu_init(void)
2663 int cpu; 2802 int cpu;
2664 2803
2665 rcu_bootup_announce(); 2804 rcu_bootup_announce();
2805 rcu_init_geometry();
2666 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2806 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2667 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2807 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2668 __rcu_init_preempt(); 2808 __rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 19b61ac1079f..4d29169f2124 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -42,28 +42,28 @@
42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
43 43
44#if NR_CPUS <= RCU_FANOUT_1 44#if NR_CPUS <= RCU_FANOUT_1
45# define NUM_RCU_LVLS 1 45# define RCU_NUM_LVLS 1
46# define NUM_RCU_LVL_0 1 46# define NUM_RCU_LVL_0 1
47# define NUM_RCU_LVL_1 (NR_CPUS) 47# define NUM_RCU_LVL_1 (NR_CPUS)
48# define NUM_RCU_LVL_2 0 48# define NUM_RCU_LVL_2 0
49# define NUM_RCU_LVL_3 0 49# define NUM_RCU_LVL_3 0
50# define NUM_RCU_LVL_4 0 50# define NUM_RCU_LVL_4 0
51#elif NR_CPUS <= RCU_FANOUT_2 51#elif NR_CPUS <= RCU_FANOUT_2
52# define NUM_RCU_LVLS 2 52# define RCU_NUM_LVLS 2
53# define NUM_RCU_LVL_0 1 53# define NUM_RCU_LVL_0 1
54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
55# define NUM_RCU_LVL_2 (NR_CPUS) 55# define NUM_RCU_LVL_2 (NR_CPUS)
56# define NUM_RCU_LVL_3 0 56# define NUM_RCU_LVL_3 0
57# define NUM_RCU_LVL_4 0 57# define NUM_RCU_LVL_4 0
58#elif NR_CPUS <= RCU_FANOUT_3 58#elif NR_CPUS <= RCU_FANOUT_3
59# define NUM_RCU_LVLS 3 59# define RCU_NUM_LVLS 3
60# define NUM_RCU_LVL_0 1 60# define NUM_RCU_LVL_0 1
61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
63# define NUM_RCU_LVL_3 (NR_CPUS) 63# define NUM_RCU_LVL_3 (NR_CPUS)
64# define NUM_RCU_LVL_4 0 64# define NUM_RCU_LVL_4 0
65#elif NR_CPUS <= RCU_FANOUT_4 65#elif NR_CPUS <= RCU_FANOUT_4
66# define NUM_RCU_LVLS 4 66# define RCU_NUM_LVLS 4
67# define NUM_RCU_LVL_0 1 67# define NUM_RCU_LVL_0 1
68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,6 +76,9 @@
76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
78 78
79extern int rcu_num_lvls;
80extern int rcu_num_nodes;
81
79/* 82/*
80 * Dynticks per-CPU state. 83 * Dynticks per-CPU state.
81 */ 84 */
@@ -97,6 +100,7 @@ struct rcu_dynticks {
97 /* # times non-lazy CBs posted to CPU. */ 100 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap; 101 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
101}; 105};
102 106
@@ -206,7 +210,7 @@ struct rcu_node {
206 */ 210 */
207#define rcu_for_each_node_breadth_first(rsp, rnp) \ 211#define rcu_for_each_node_breadth_first(rsp, rnp) \
208 for ((rnp) = &(rsp)->node[0]; \ 212 for ((rnp) = &(rsp)->node[0]; \
209 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 213 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
210 214
211/* 215/*
212 * Do a breadth-first scan of the non-leaf rcu_node structures for the 216 * Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -215,7 +219,7 @@ struct rcu_node {
215 */ 219 */
216#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ 220#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
217 for ((rnp) = &(rsp)->node[0]; \ 221 for ((rnp) = &(rsp)->node[0]; \
218 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) 222 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
219 223
220/* 224/*
221 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state 225 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -224,8 +228,8 @@ struct rcu_node {
224 * It is still a leaf node, even if it is also the root node. 228 * It is still a leaf node, even if it is also the root node.
225 */ 229 */
226#define rcu_for_each_leaf_node(rsp, rnp) \ 230#define rcu_for_each_leaf_node(rsp, rnp) \
227 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 231 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
228 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 232 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
229 233
230/* Index values for nxttail array in struct rcu_data. */ 234/* Index values for nxttail array in struct rcu_data. */
231#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 235#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -311,6 +315,9 @@ struct rcu_data {
311 unsigned long n_rp_need_fqs; 315 unsigned long n_rp_need_fqs;
312 unsigned long n_rp_need_nothing; 316 unsigned long n_rp_need_nothing;
313 317
318 /* 6) _rcu_barrier() callback. */
319 struct rcu_head barrier_head;
320
314 int cpu; 321 int cpu;
315 struct rcu_state *rsp; 322 struct rcu_state *rsp;
316}; 323};
@@ -357,10 +364,12 @@ do { \
357 */ 364 */
358struct rcu_state { 365struct rcu_state {
359 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 366 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
360 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 367 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
361 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 368 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
362 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 369 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
363 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 370 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
371 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
372 void (*func)(struct rcu_head *head));
364 373
365 /* The following fields are guarded by the root rcu_node's lock. */ 374 /* The following fields are guarded by the root rcu_node's lock. */
366 375
@@ -392,6 +401,11 @@ struct rcu_state {
392 struct task_struct *rcu_barrier_in_progress; 401 struct task_struct *rcu_barrier_in_progress;
393 /* Task doing rcu_barrier(), */ 402 /* Task doing rcu_barrier(), */
394 /* or NULL if no barrier. */ 403 /* or NULL if no barrier. */
404 struct mutex barrier_mutex; /* Guards barrier fields. */
405 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
406 struct completion barrier_completion; /* Wake at barrier end. */
407 unsigned long n_barrier_done; /* ++ at start and end of */
408 /* _rcu_barrier(). */
395 raw_spinlock_t fqslock; /* Only one task forcing */ 409 raw_spinlock_t fqslock; /* Only one task forcing */
396 /* quiescent states. */ 410 /* quiescent states. */
397 unsigned long jiffies_force_qs; /* Time at which to invoke */ 411 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -409,8 +423,13 @@ struct rcu_state {
409 unsigned long gp_max; /* Maximum GP duration in */ 423 unsigned long gp_max; /* Maximum GP duration in */
410 /* jiffies. */ 424 /* jiffies. */
411 char *name; /* Name of structure. */ 425 char *name; /* Name of structure. */
426 struct list_head flavors; /* List of RCU flavors. */
412}; 427};
413 428
429extern struct list_head rcu_struct_flavors;
430#define for_each_rcu_flavor(rsp) \
431 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
432
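Note on for_each_rcu_flavor(): every rcu_state registers itself on rcu_struct_flavors at the end of rcu_init_one(), so code such as rcu_pending(), rcu_cpu_has_callbacks() and the CPU-hotplug notifier can loop over whichever flavors were built in instead of hard-coding sched, bh and (optionally) preempt. A standalone sketch of the pattern follows, using a hand-rolled singly linked list rather than the kernel's struct list_head.

/* Standalone sketch of "iterate over registered flavors"; the list and
 * the pending flag are invented stand-ins. */
#include <stdio.h>

struct flavor {
	const char *name;
	int pending;			/* stand-in for per-flavor state */
	struct flavor *next;
};

static struct flavor *flavors;

static void register_flavor(struct flavor *f)
{
	f->next = flavors;
	flavors = f;
}

#define for_each_flavor(f)	for ((f) = flavors; (f); (f) = (f)->next)

/* One loop replaces the old hard-coded sched/bh/preempt checks. */
static int any_pending(void)
{
	struct flavor *f;

	for_each_flavor(f)
		if (f->pending)
			return 1;
	return 0;
}

int main(void)
{
	struct flavor sched = { "rcu_sched", 0, NULL };
	struct flavor bh = { "rcu_bh", 1, NULL };

	register_flavor(&sched);
	register_flavor(&bh);
	printf("pending: %d\n", any_pending());	/* prints 1 */
	return 0;
}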
414/* Return values for rcu_preempt_offline_tasks(). */ 433/* Return values for rcu_preempt_offline_tasks(). */
415 434
416#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ 435#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -453,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu);
453#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 472#endif /* #ifdef CONFIG_HOTPLUG_CPU */
454static void rcu_print_detail_task_stall(struct rcu_state *rsp); 473static void rcu_print_detail_task_stall(struct rcu_state *rsp);
455static int rcu_print_task_stall(struct rcu_node *rnp); 474static int rcu_print_task_stall(struct rcu_node *rnp);
456static void rcu_preempt_stall_reset(void);
457static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 475static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
458#ifdef CONFIG_HOTPLUG_CPU 476#ifdef CONFIG_HOTPLUG_CPU
459static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 477static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
460 struct rcu_node *rnp, 478 struct rcu_node *rnp,
461 struct rcu_data *rdp); 479 struct rcu_data *rdp);
462#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 480#endif /* #ifdef CONFIG_HOTPLUG_CPU */
463static void rcu_preempt_cleanup_dead_cpu(int cpu);
464static void rcu_preempt_check_callbacks(int cpu); 481static void rcu_preempt_check_callbacks(int cpu);
465static void rcu_preempt_process_callbacks(void);
466void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 482void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
467#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 483#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
468static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 484static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
469 bool wake); 485 bool wake);
470#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 486#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
471static int rcu_preempt_pending(int cpu);
472static int rcu_preempt_cpu_has_callbacks(int cpu);
473static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
474static void rcu_preempt_cleanup_dying_cpu(void);
475static void __init __rcu_init_preempt(void); 487static void __init __rcu_init_preempt(void);
476static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 488static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
477static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 489static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3e4899459f3d..7f3244c0df01 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
69#endif 69#endif
70#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
72#endif 72#endif
73 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
74 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
75 if (nr_cpu_ids != NR_CPUS)
76 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
73} 77}
74 78
75#ifdef CONFIG_TREE_PREEMPT_RCU 79#ifdef CONFIG_TREE_PREEMPT_RCU
76 80
77struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); 81struct rcu_state rcu_preempt_state =
82 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
78DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
79static struct rcu_state *rcu_state = &rcu_preempt_state; 84static struct rcu_state *rcu_state = &rcu_preempt_state;
80 85
81static void rcu_read_unlock_special(struct task_struct *t);
82static int rcu_preempted_readers_exp(struct rcu_node *rnp); 86static int rcu_preempted_readers_exp(struct rcu_node *rnp);
83 87
84/* 88/*
@@ -233,18 +237,6 @@ static void rcu_preempt_note_context_switch(int cpu)
233} 237}
234 238
235/* 239/*
236 * Tree-preemptible RCU implementation for rcu_read_lock().
237 * Just increment ->rcu_read_lock_nesting, shared state will be updated
238 * if we block.
239 */
240void __rcu_read_lock(void)
241{
242 current->rcu_read_lock_nesting++;
243 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
244}
245EXPORT_SYMBOL_GPL(__rcu_read_lock);
246
247/*
248 * Check for preempted RCU readers blocking the current grace period 240 * Check for preempted RCU readers blocking the current grace period
249 * for the specified rcu_node structure. If the caller needs a reliable 241 * for the specified rcu_node structure. If the caller needs a reliable
250 * answer, it must hold the rcu_node's ->lock. 242 * answer, it must hold the rcu_node's ->lock.
@@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
310 * notify RCU core processing or task having blocked during the RCU 302 * notify RCU core processing or task having blocked during the RCU
311 * read-side critical section. 303 * read-side critical section.
312 */ 304 */
313static noinline void rcu_read_unlock_special(struct task_struct *t) 305void rcu_read_unlock_special(struct task_struct *t)
314{ 306{
315 int empty; 307 int empty;
316 int empty_exp; 308 int empty_exp;
@@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
398 rnp->grphi, 390 rnp->grphi,
399 !!rnp->gp_tasks); 391 !!rnp->gp_tasks);
400 rcu_report_unblock_qs_rnp(rnp, flags); 392 rcu_report_unblock_qs_rnp(rnp, flags);
401 } else 393 } else {
402 raw_spin_unlock_irqrestore(&rnp->lock, flags); 394 raw_spin_unlock_irqrestore(&rnp->lock, flags);
395 }
403 396
404#ifdef CONFIG_RCU_BOOST 397#ifdef CONFIG_RCU_BOOST
405 /* Unboost if we were boosted. */ 398 /* Unboost if we were boosted. */
@@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
418 } 411 }
419} 412}
420 413
421/*
422 * Tree-preemptible RCU implementation for rcu_read_unlock().
423 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
424 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
425 * invoke rcu_read_unlock_special() to clean up after a context switch
426 * in an RCU read-side critical section and other special cases.
427 */
428void __rcu_read_unlock(void)
429{
430 struct task_struct *t = current;
431
432 if (t->rcu_read_lock_nesting != 1)
433 --t->rcu_read_lock_nesting;
434 else {
435 barrier(); /* critical section before exit code. */
436 t->rcu_read_lock_nesting = INT_MIN;
437 barrier(); /* assign before ->rcu_read_unlock_special load */
438 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
439 rcu_read_unlock_special(t);
440 barrier(); /* ->rcu_read_unlock_special load before assign */
441 t->rcu_read_lock_nesting = 0;
442 }
443#ifdef CONFIG_PROVE_LOCKING
444 {
445 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
446
447 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
448 }
449#endif /* #ifdef CONFIG_PROVE_LOCKING */
450}
451EXPORT_SYMBOL_GPL(__rcu_read_unlock);
452
453#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 414#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
454 415
455/* 416/*
@@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
540} 501}
541 502
542/* 503/*
543 * Suppress preemptible RCU's CPU stall warnings by pushing the
544 * time of the next stall-warning message comfortably far into the
545 * future.
546 */
547static void rcu_preempt_stall_reset(void)
548{
549 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
550}
551
552/*
553 * Check that the list of blocked tasks for the newly completed grace 504 * Check that the list of blocked tasks for the newly completed grace
554 * period is in fact empty. It is a serious bug to complete a grace 505 * period is in fact empty. It is a serious bug to complete a grace
555 * period that still has RCU readers blocked! This function must be 506 * period that still has RCU readers blocked! This function must be
@@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 601#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651 602
652/* 603/*
653 * Do CPU-offline processing for preemptible RCU.
654 */
655static void rcu_preempt_cleanup_dead_cpu(int cpu)
656{
657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
658}
659
660/*
661 * Check for a quiescent state from the current CPU. When a task blocks, 604 * Check for a quiescent state from the current CPU. When a task blocks,
662 * the task is recorded in the corresponding CPU's rcu_node structure, 605 * the task is recorded in the corresponding CPU's rcu_node structure,
663 * which is checked elsewhere. 606 * which is checked elsewhere.
@@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu)
677 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 620 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
678} 621}
679 622
680/*
681 * Process callbacks for preemptible RCU.
682 */
683static void rcu_preempt_process_callbacks(void)
684{
685 __rcu_process_callbacks(&rcu_preempt_state,
686 &__get_cpu_var(rcu_preempt_data));
687}
688
689#ifdef CONFIG_RCU_BOOST 623#ifdef CONFIG_RCU_BOOST
690 624
691static void rcu_preempt_do_callbacks(void) 625static void rcu_preempt_do_callbacks(void)
@@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
824 int must_wait = 0; 758 int must_wait = 0;
825 759
826 raw_spin_lock_irqsave(&rnp->lock, flags); 760 raw_spin_lock_irqsave(&rnp->lock, flags);
827 if (list_empty(&rnp->blkd_tasks)) 761 if (list_empty(&rnp->blkd_tasks)) {
828 raw_spin_unlock_irqrestore(&rnp->lock, flags); 762 raw_spin_unlock_irqrestore(&rnp->lock, flags);
829 else { 763 } else {
830 rnp->exp_tasks = rnp->blkd_tasks.next; 764 rnp->exp_tasks = rnp->blkd_tasks.next;
831 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 765 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
832 must_wait = 1; 766 must_wait = 1;
@@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void)
870 * expedited grace period for us, just leave. 804 * expedited grace period for us, just leave.
871 */ 805 */
872 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 806 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
873 if (trycount++ < 10) 807 if (trycount++ < 10) {
874 udelay(trycount * num_online_cpus()); 808 udelay(trycount * num_online_cpus());
875 else { 809 } else {
876 synchronize_rcu(); 810 synchronize_rcu();
877 return; 811 return;
878 } 812 }
@@ -917,51 +851,16 @@ mb_ret:
917} 851}
918EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 852EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
919 853
920/*
921 * Check to see if there is any immediate preemptible-RCU-related work
922 * to be done.
923 */
924static int rcu_preempt_pending(int cpu)
925{
926 return __rcu_pending(&rcu_preempt_state,
927 &per_cpu(rcu_preempt_data, cpu));
928}
929
930/*
931 * Does preemptible RCU have callbacks on this CPU?
932 */
933static int rcu_preempt_cpu_has_callbacks(int cpu)
934{
935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
936}
937
938/** 854/**
939 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 855 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
940 */ 856 */
941void rcu_barrier(void) 857void rcu_barrier(void)
942{ 858{
943 _rcu_barrier(&rcu_preempt_state, call_rcu); 859 _rcu_barrier(&rcu_preempt_state);
944} 860}
945EXPORT_SYMBOL_GPL(rcu_barrier); 861EXPORT_SYMBOL_GPL(rcu_barrier);
946 862
947/* 863/*
948 * Initialize preemptible RCU's per-CPU data.
949 */
950static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
951{
952 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
953}
954
955/*
956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
958 */
959static void rcu_preempt_cleanup_dying_cpu(void)
960{
961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
962}
963
964/*
965 * Initialize preemptible RCU's state structures. 864 * Initialize preemptible RCU's state structures.
966 */ 865 */
967static void __init __rcu_init_preempt(void) 866static void __init __rcu_init_preempt(void)
@@ -1046,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
1046} 945}
1047 946
1048/* 947/*
1049 * Because preemptible RCU does not exist, there is no need to suppress
1050 * its CPU stall warnings.
1051 */
1052static void rcu_preempt_stall_reset(void)
1053{
1054}
1055
1056/*
1057 * Because there is no preemptible RCU, there can be no readers blocked, 948 * Because there is no preemptible RCU, there can be no readers blocked,
1058 * so there is no need to check for blocked tasks. So check only for 949 * so there is no need to check for blocked tasks. So check only for
1059 * bogus qsmask values. 950 * bogus qsmask values.
@@ -1081,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1081#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 972#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1082 973
1083/* 974/*
1084 * Because preemptible RCU does not exist, it never needs CPU-offline
1085 * processing.
1086 */
1087static void rcu_preempt_cleanup_dead_cpu(int cpu)
1088{
1089}
1090
1091/*
1092 * Because preemptible RCU does not exist, it never has any callbacks 975 * Because preemptible RCU does not exist, it never has any callbacks
1093 * to check. 976 * to check.
1094 */ 977 */
@@ -1097,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1097} 980}
1098 981
1099/* 982/*
1100 * Because preemptible RCU does not exist, it never has any callbacks
1101 * to process.
1102 */
1103static void rcu_preempt_process_callbacks(void)
1104{
1105}
1106
1107/*
1108 * Queue an RCU callback for lazy invocation after a grace period. 983 * Queue an RCU callback for lazy invocation after a grace period.
1109 * This will likely be later named something like "call_rcu_lazy()", 984 * This will likely be later named something like "call_rcu_lazy()",
1110 * but this change will require some way of tagging the lazy RCU 985 * but this change will require some way of tagging the lazy RCU
@@ -1145,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1145#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1020#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1146 1021
1147/* 1022/*
1148 * Because preemptible RCU does not exist, it never has any work to do.
1149 */
1150static int rcu_preempt_pending(int cpu)
1151{
1152 return 0;
1153}
1154
1155/*
1156 * Because preemptible RCU does not exist, it never has callbacks
1157 */
1158static int rcu_preempt_cpu_has_callbacks(int cpu)
1159{
1160 return 0;
1161}
1162
1163/*
1164 * Because preemptible RCU does not exist, rcu_barrier() is just 1023 * Because preemptible RCU does not exist, rcu_barrier() is just
1165 * another name for rcu_barrier_sched(). 1024 * another name for rcu_barrier_sched().
1166 */ 1025 */
@@ -1171,21 +1030,6 @@ void rcu_barrier(void)
1171EXPORT_SYMBOL_GPL(rcu_barrier); 1030EXPORT_SYMBOL_GPL(rcu_barrier);
1172 1031
1173/* 1032/*
1174 * Because preemptible RCU does not exist, there is no per-CPU
1175 * data to initialize.
1176 */
1177static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1178{
1179}
1180
1181/*
1182 * Because there is no preemptible RCU, there is no cleanup to do.
1183 */
1184static void rcu_preempt_cleanup_dying_cpu(void)
1185{
1186}
1187
1188/*
1189 * Because preemptible RCU does not exist, it need not be initialized. 1033 * Because preemptible RCU does not exist, it need not be initialized.
1190 */ 1034 */
1191static void __init __rcu_init_preempt(void) 1035static void __init __rcu_init_preempt(void)
@@ -1968,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void)
1968 */ 1812 */
1969#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1813#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1970#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1814#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1971#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1815#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1972#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1816#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1973 1817
1818extern int tick_nohz_enabled;
1819
1974/* 1820/*
1975 * Does the specified flavor of RCU have non-lazy callbacks pending on 1821 * Does the specified flavor of RCU have non-lazy callbacks pending on
1976 * the specified CPU? Both RCU flavor and CPU are specified by the 1822 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2047,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2047 return 1; 1893 return 1;
2048 } 1894 }
2049 /* Set up for the possibility that RCU will post a timer. */ 1895 /* Set up for the possibility that RCU will post a timer. */
2050 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 1896 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2051 *delta_jiffies = RCU_IDLE_GP_DELAY; 1897 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
2052 else 1898 RCU_IDLE_GP_DELAY) - jiffies;
2053 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; 1899 } else {
1900 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1901 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1902 }
2054 return 0; 1903 return 0;
2055} 1904}
2056 1905
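Note on the rounding above: rcu_needs_cpu() now reports a wakeup aligned to a multiple of RCU_IDLE_GP_DELAY (which becomes 4, a power of two, as round_up() expects), and the lazy case is pushed to a round_jiffies() boundary. The effect is that CPUs going idle at slightly different jiffies converge on the same wakeup time so their timers can be batched. A standalone sketch of the power-of-two alignment is below; the jiffies values are invented.

/* Standalone sketch of aligning per-CPU wakeups with a power-of-two
 * round-up; the jiffies values are invented for illustration. */
#include <stdio.h>

#define RCU_IDLE_GP_DELAY	4UL	/* power of two, so masking works */

/* Equivalent to the kernel's round_up() for power-of-two 'align'. */
static unsigned long round_up_pow2(unsigned long x, unsigned long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long j0 = 1001, j1 = 1003;	/* two CPUs entering idle */

	/* Both land on jiffy 1008, so a single tick can serve them. */
	printf("cpu0 wakes at %lu\n",
	       round_up_pow2(j0 + RCU_IDLE_GP_DELAY, RCU_IDLE_GP_DELAY));
	printf("cpu1 wakes at %lu\n",
	       round_up_pow2(j1 + RCU_IDLE_GP_DELAY, RCU_IDLE_GP_DELAY));
	return 0;
}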
@@ -2109,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu)
2109 1958
2110 del_timer(&rdtp->idle_gp_timer); 1959 del_timer(&rdtp->idle_gp_timer);
2111 trace_rcu_prep_idle("Cleanup after idle"); 1960 trace_rcu_prep_idle("Cleanup after idle");
1961 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
2112} 1962}
2113 1963
2114/* 1964/*
@@ -2134,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu)
2134{ 1984{
2135 struct timer_list *tp; 1985 struct timer_list *tp;
2136 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1986 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1987 int tne;
1988
1989 /* Handle nohz enablement switches conservatively. */
1990 tne = ACCESS_ONCE(tick_nohz_enabled);
1991 if (tne != rdtp->tick_nohz_enabled_snap) {
1992 if (rcu_cpu_has_callbacks(cpu))
1993 invoke_rcu_core(); /* force nohz to see update. */
1994 rdtp->tick_nohz_enabled_snap = tne;
1995 return;
1996 }
1997 if (!tne)
1998 return;
2137 1999
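Note on tick_nohz_enabled_snap: rcu_prepare_for_idle() now samples tick_nohz_enabled once per pass and, if the value differs from the previous snapshot, merely refreshes the snapshot (kicking the RCU core if callbacks are queued) and declines to do any dyntick-idle work until the next pass. A standalone sketch of that snapshot-and-back-off pattern follows; the names and print statements are invented.

/* Standalone sketch of reacting conservatively to a runtime-tunable flag. */
#include <stdio.h>

static int nohz_enabled = 1;		/* stand-in for the real knob */

struct cpu_state {
	int nohz_snap;			/* last value this CPU acted on */
};

static void prepare_for_idle(struct cpu_state *cs)
{
	int tne = nohz_enabled;		/* sample exactly once per pass */

	if (tne != cs->nohz_snap) {
		/* The setting changed under us: record it and back off. */
		printf("nohz flipped to %d, skipping this pass\n", tne);
		cs->nohz_snap = tne;
		return;
	}
	if (!tne) {
		printf("nohz off, nothing to do\n");
		return;
	}
	printf("nohz on and stable, do the dyntick-idle work\n");
}

int main(void)
{
	struct cpu_state cs = { .nohz_snap = 1 };

	prepare_for_idle(&cs);		/* stable */
	nohz_enabled = 0;
	prepare_for_idle(&cs);		/* flip detected, backs off */
	prepare_for_idle(&cs);		/* now acts on the new setting */
	return 0;
}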
2138 /* 2000 /*
2139 * If this is an idle re-entry, for example, due to use of 2001 * If this is an idle re-entry, for example, due to use of
@@ -2187,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu)
2187 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 2049 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2188 trace_rcu_prep_idle("Dyntick with callbacks"); 2050 trace_rcu_prep_idle("Dyntick with callbacks");
2189 rdtp->idle_gp_timer_expires = 2051 rdtp->idle_gp_timer_expires =
2190 jiffies + RCU_IDLE_GP_DELAY; 2052 round_up(jiffies + RCU_IDLE_GP_DELAY,
2053 RCU_IDLE_GP_DELAY);
2191 } else { 2054 } else {
2192 rdtp->idle_gp_timer_expires = 2055 rdtp->idle_gp_timer_expires =
2193 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2056 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
2194 trace_rcu_prep_idle("Dyntick with lazy callbacks"); 2057 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2195 } 2058 }
2196 tp = &rdtp->idle_gp_timer; 2059 tp = &rdtp->idle_gp_timer;
@@ -2231,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu)
2231 if (rcu_cpu_has_callbacks(cpu)) { 2094 if (rcu_cpu_has_callbacks(cpu)) {
2232 trace_rcu_prep_idle("More callbacks"); 2095 trace_rcu_prep_idle("More callbacks");
2233 invoke_rcu_core(); 2096 invoke_rcu_core();
2234 } else 2097 } else {
2235 trace_rcu_prep_idle("Callbacks drained"); 2098 trace_rcu_prep_idle("Callbacks drained");
2099 }
2236} 2100}
2237 2101
2238/* 2102/*
@@ -2269,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2269 2133
2270static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2134static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2271{ 2135{
2136 *cp = '\0';
2272} 2137}
2273 2138
2274#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ 2139#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..abffb486e94e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,31 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused)
50{
51 struct rcu_state *rsp;
52
53 for_each_rcu_flavor(rsp)
54 seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
55 rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
56 atomic_read(&rsp->barrier_cpu_count),
57 rsp->n_barrier_done);
58 return 0;
59}
60
61static int rcubarrier_open(struct inode *inode, struct file *file)
62{
63 return single_open(file, show_rcubarrier, NULL);
64}
65
66static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE,
68 .open = rcubarrier_open,
69 .read = seq_read,
70 .llseek = seq_lseek,
71 .release = single_release,
72};
73
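Note on the rcubarrier file: each line comes from the "%s: %c bcc: %d nbd: %lu" format above, i.e. flavor name, 'B' while a barrier is in progress (otherwise '.'), the current barrier CPU count, and ->n_barrier_done. Reading the file from the RCU debugfs directory (debugfs is typically mounted at /sys/kernel/debug) would produce output roughly like the following; the numbers are invented.

rcu_sched: . bcc: 0 nbd: 18
rcu_bh: . bcc: 0 nbd: 6
rcu_preempt: B bcc: 3 nbd: 41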
49#ifdef CONFIG_RCU_BOOST 74#ifdef CONFIG_RCU_BOOST
50 75
51static char convert_kthread_status(unsigned int kthread_status) 76static char convert_kthread_status(unsigned int kthread_status)
@@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 120 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
96} 121}
97 122
98#define PRINT_RCU_DATA(name, func, m) \
99 do { \
100 int _p_r_d_i; \
101 \
102 for_each_possible_cpu(_p_r_d_i) \
103 func(m, &per_cpu(name, _p_r_d_i)); \
104 } while (0)
105
106static int show_rcudata(struct seq_file *m, void *unused) 123static int show_rcudata(struct seq_file *m, void *unused)
107{ 124{
108#ifdef CONFIG_TREE_PREEMPT_RCU 125 int cpu;
109 seq_puts(m, "rcu_preempt:\n"); 126 struct rcu_state *rsp;
110 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); 127
111#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 128 for_each_rcu_flavor(rsp) {
112 seq_puts(m, "rcu_sched:\n"); 129 seq_printf(m, "%s:\n", rsp->name);
113 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); 130 for_each_possible_cpu(cpu)
114 seq_puts(m, "rcu_bh:\n"); 131 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
115 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 132 }
116 return 0; 133 return 0;
117} 134}
118 135
@@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
166 183
167static int show_rcudata_csv(struct seq_file *m, void *unused) 184static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 185{
186 int cpu;
187 struct rcu_state *rsp;
188
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 189 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 190 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 191 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
@@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
173 seq_puts(m, "\"kt\",\"ktl\""); 193 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 194#endif /* #ifdef CONFIG_RCU_BOOST */
175 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 195 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
176#ifdef CONFIG_TREE_PREEMPT_RCU 196 for_each_rcu_flavor(rsp) {
177 seq_puts(m, "\"rcu_preempt:\"\n"); 197 seq_printf(m, "\"%s:\"\n", rsp->name);
178 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 198 for_each_possible_cpu(cpu)
179#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 199 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu));
180 seq_puts(m, "\"rcu_sched:\"\n"); 200 }
181 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
182 seq_puts(m, "\"rcu_bh:\"\n");
183 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
184 return 0; 201 return 0;
185} 202}
186 203
@@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = {
201 218
202static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) 219static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
203{ 220{
204 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " 221 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
205 "j=%04x bt=%04x\n",
206 rnp->grplo, rnp->grphi, 222 rnp->grplo, rnp->grphi,
207 "T."[list_empty(&rnp->blkd_tasks)], 223 "T."[list_empty(&rnp->blkd_tasks)],
208 "N."[!rnp->gp_tasks], 224 "N."[!rnp->gp_tasks],
@@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
210 "B."[!rnp->boost_tasks], 226 "B."[!rnp->boost_tasks],
211 convert_kthread_status(rnp->boost_kthread_status), 227 convert_kthread_status(rnp->boost_kthread_status),
212 rnp->n_tasks_boosted, rnp->n_exp_boosts, 228 rnp->n_tasks_boosted, rnp->n_exp_boosts,
213 rnp->n_normal_boosts, 229 rnp->n_normal_boosts);
230 seq_printf(m, "j=%04x bt=%04x\n",
214 (int)(jiffies & 0xffff), 231 (int)(jiffies & 0xffff),
215 (int)(rnp->boost_time & 0xffff)); 232 (int)(rnp->boost_time & 0xffff));
216 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", 233 seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
217 " balk",
218 rnp->n_balk_blkd_tasks, 234 rnp->n_balk_blkd_tasks,
219 rnp->n_balk_exp_gp_tasks, 235 rnp->n_balk_exp_gp_tasks,
220 rnp->n_balk_boost_tasks, 236 rnp->n_balk_boost_tasks,
@@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
270 struct rcu_node *rnp; 286 struct rcu_node *rnp;
271 287
272 gpnum = rsp->gpnum; 288 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 289 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ",
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 290 rsp->name, rsp->completed, gpnum, rsp->fqs_state,
275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 291 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 292 (int)(jiffies & 0xffff));
293 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 294 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 295 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 296 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 297 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
282 if (rnp->level != level) { 298 if (rnp->level != level) {
283 seq_puts(m, "\n"); 299 seq_puts(m, "\n");
284 level = rnp->level; 300 level = rnp->level;
@@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
295 311
296static int show_rcuhier(struct seq_file *m, void *unused) 312static int show_rcuhier(struct seq_file *m, void *unused)
297{ 313{
298#ifdef CONFIG_TREE_PREEMPT_RCU 314 struct rcu_state *rsp;
299 seq_puts(m, "rcu_preempt:\n"); 315
300 print_one_rcu_state(m, &rcu_preempt_state); 316 for_each_rcu_flavor(rsp)
301#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 317 print_one_rcu_state(m, rsp);
302 seq_puts(m, "rcu_sched:\n");
303 print_one_rcu_state(m, &rcu_sched_state);
304 seq_puts(m, "rcu_bh:\n");
305 print_one_rcu_state(m, &rcu_bh_state);
306 return 0; 318 return 0;
307} 319}
308 320
@@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
343 355
344static int show_rcugp(struct seq_file *m, void *unused) 356static int show_rcugp(struct seq_file *m, void *unused)
345{ 357{
346#ifdef CONFIG_TREE_PREEMPT_RCU 358 struct rcu_state *rsp;
347 show_one_rcugp(m, &rcu_preempt_state); 359
348#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 360 for_each_rcu_flavor(rsp)
349 show_one_rcugp(m, &rcu_sched_state); 361 show_one_rcugp(m, rsp);
350 show_one_rcugp(m, &rcu_bh_state);
351 return 0; 362 return 0;
352} 363}
353 364
@@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = {
366 377
367static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 378static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
368{ 379{
369 seq_printf(m, "%3d%cnp=%ld " 380 seq_printf(m, "%3d%cnp=%ld ",
370 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
371 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
372 rdp->cpu, 381 rdp->cpu,
373 cpu_is_offline(rdp->cpu) ? '!' : ' ', 382 cpu_is_offline(rdp->cpu) ? '!' : ' ',
374 rdp->n_rcu_pending, 383 rdp->n_rcu_pending);
384 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
375 rdp->n_rp_qs_pending, 385 rdp->n_rp_qs_pending,
376 rdp->n_rp_report_qs, 386 rdp->n_rp_report_qs,
377 rdp->n_rp_cb_ready, 387 rdp->n_rp_cb_ready,
378 rdp->n_rp_cpu_needs_gp, 388 rdp->n_rp_cpu_needs_gp);
389 seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
379 rdp->n_rp_gp_completed, 390 rdp->n_rp_gp_completed,
380 rdp->n_rp_gp_started, 391 rdp->n_rp_gp_started,
381 rdp->n_rp_need_fqs, 392 rdp->n_rp_need_fqs,
382 rdp->n_rp_need_nothing); 393 rdp->n_rp_need_nothing);
383} 394}
384 395
385static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) 396static int show_rcu_pending(struct seq_file *m, void *unused)
386{ 397{
387 int cpu; 398 int cpu;
388 struct rcu_data *rdp; 399 struct rcu_data *rdp;
389 400 struct rcu_state *rsp;
390 for_each_possible_cpu(cpu) { 401
391 rdp = per_cpu_ptr(rsp->rda, cpu); 402 for_each_rcu_flavor(rsp) {
392 if (rdp->beenonline) 403 seq_printf(m, "%s:\n", rsp->name);
393 print_one_rcu_pending(m, rdp); 404 for_each_possible_cpu(cpu) {
405 rdp = per_cpu_ptr(rsp->rda, cpu);
406 if (rdp->beenonline)
407 print_one_rcu_pending(m, rdp);
408 }
394 } 409 }
395}
396
397static int show_rcu_pending(struct seq_file *m, void *unused)
398{
399#ifdef CONFIG_TREE_PREEMPT_RCU
400 seq_puts(m, "rcu_preempt:\n");
401 print_rcu_pendings(m, &rcu_preempt_state);
402#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
403 seq_puts(m, "rcu_sched:\n");
404 print_rcu_pendings(m, &rcu_sched_state);
405 seq_puts(m, "rcu_bh:\n");
406 print_rcu_pendings(m, &rcu_bh_state);
407 return 0; 410 return 0;
408} 411}
409 412
@@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void)
453 if (!rcudir) 456 if (!rcudir)
454 goto free_out; 457 goto free_out;
455 458
459 retval = debugfs_create_file("rcubarrier", 0444, rcudir,
460 NULL, &rcubarrier_fops);
461 if (!retval)
462 goto free_out;
463
456 retval = debugfs_create_file("rcudata", 0444, rcudir, 464 retval = debugfs_create_file("rcudata", 0444, rcudir,
457 NULL, &rcudata_fops); 465 NULL, &rcudata_fops);
458 if (!retval) 466 if (!retval)
diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8ee76d5..34d45886ee84 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/errno.h> 13#include <linux/errno.h>
12#include <linux/ioport.h> 14#include <linux/ioport.h>
@@ -722,14 +724,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
722 724
723 write_lock(&resource_lock); 725 write_lock(&resource_lock);
724 726
727 if (!parent)
728 goto skip;
729
725 if ((start < parent->start) || (end > parent->end)) 730 if ((start < parent->start) || (end > parent->end))
726 goto out; 731 goto out;
727 732
728 for (tmp = res->child; tmp; tmp = tmp->sibling) {
729 if ((tmp->start < start) || (tmp->end > end))
730 goto out;
731 }
732
733 if (res->sibling && (res->sibling->start <= end)) 733 if (res->sibling && (res->sibling->start <= end))
734 goto out; 734 goto out;
735 735
@@ -741,6 +741,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
741 goto out; 741 goto out;
742 } 742 }
743 743
744skip:
745 for (tmp = res->child; tmp; tmp = tmp->sibling)
746 if ((tmp->start < start) || (tmp->end > end))
747 goto out;
748
744 res->start = start; 749 res->start = start;
745 res->end = end; 750 res->end = end;
746 result = 0; 751 result = 0;
@@ -788,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
788 resource_size_t start, resource_size_t end, 793 resource_size_t start, resource_size_t end,
789 const char *name) 794 const char *name)
790{ 795{
796 int abort = 0;
797
791 write_lock(&resource_lock); 798 write_lock(&resource_lock);
792 __reserve_region_with_split(root, start, end, name); 799 if (root->start > start || root->end < end) {
800 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
801 (unsigned long long)start, (unsigned long long)end,
802 root);
803 if (start > root->end || end < root->start)
804 abort = 1;
805 else {
806 if (end > root->end)
807 end = root->end;
808 if (start < root->start)
809 start = root->start;
810 pr_err("fixing request to [0x%llx-0x%llx]\n",
811 (unsigned long long)start,
812 (unsigned long long)end);
813 }
814 dump_stack();
815 }
816 if (!abort)
817 __reserve_region_with_split(root, start, end, name);
793 write_unlock(&resource_lock); 818 write_unlock(&resource_lock);
794} 819}
795 820
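
The reserve_region_with_split() hunk above adds a sanity check before recursing: a request that lies partly outside the root resource is trimmed to the overlapping part (with a warning), and a request with no overlap at all is dropped. A small userspace sketch of that clamp-or-abort decision, with illustrative names:

/*
 * Userspace sketch of the clamp-or-abort check: requests outside the root
 * window are either trimmed to the overlapping part or rejected outright.
 */
#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long long start, end; };

/* Returns true if the (possibly trimmed) request should proceed. */
static bool clamp_to_root(const struct range *root, struct range *req)
{
        if (req->start > root->end || req->end < root->start)
                return false;           /* no overlap: abort */
        if (req->start < root->start)
                req->start = root->start;
        if (req->end > root->end)
                req->end = root->end;   /* partial overlap: trim and warn */
        return true;
}

int main(void)
{
        struct range root = { 0x1000, 0x1fff };
        struct range req  = { 0x0800, 0x17ff };

        if (clamp_to_root(&root, &req))
                printf("reserving [0x%llx-0x%llx]\n", req.start, req.end);
        else
                printf("request rejected\n");
        return 0;
}
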
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..fbf1fd098dc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1097 * 1097 *
1098 * sched_move_task() holds both and thus holding either pins the cgroup, 1098 * sched_move_task() holds both and thus holding either pins the cgroup,
1099 * see set_task_rq(). 1099 * see task_group().
1100 * 1100 *
1101 * Furthermore, all task_rq users should acquire both locks, see 1101 * Furthermore, all task_rq users should acquire both locks, see
1102 * task_rq_lock(). 1102 * task_rq_lock().
@@ -1910,12 +1910,12 @@ static inline void
1910prepare_task_switch(struct rq *rq, struct task_struct *prev, 1910prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next) 1911 struct task_struct *next)
1912{ 1912{
1913 trace_sched_switch(prev, next);
1913 sched_info_switch(prev, next); 1914 sched_info_switch(prev, next);
1914 perf_event_task_sched_out(prev, next); 1915 perf_event_task_sched_out(prev, next);
1915 fire_sched_out_preempt_notifiers(prev, next); 1916 fire_sched_out_preempt_notifiers(prev, next);
1916 prepare_lock_switch(rq, next); 1917 prepare_lock_switch(rq, next);
1917 prepare_arch_switch(next); 1918 prepare_arch_switch(next);
1918 trace_sched_switch(prev, next);
1919} 1919}
1920 1920
1921/** 1921/**
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3143#endif 3143#endif
3144 3144
3145static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3146{
3147 u64 temp = (__force u64) rtime;
3148
3149 temp *= (__force u64) utime;
3150
3151 if (sizeof(cputime_t) == 4)
3152 temp = div_u64(temp, (__force u32) total);
3153 else
3154 temp = div64_u64(temp, (__force u64) total);
3155
3156 return (__force cputime_t) temp;
3157}
3158
3145void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3159void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3146{ 3160{
3147 cputime_t rtime, utime = p->utime, total = utime + p->stime; 3161 cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3151 */ 3165 */
3152 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3166 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3153 3167
3154 if (total) { 3168 if (total)
3155 u64 temp = (__force u64) rtime; 3169 utime = scale_utime(utime, rtime, total);
3156 3170 else
3157 temp *= (__force u64) utime;
3158 do_div(temp, (__force u32) total);
3159 utime = (__force cputime_t) temp;
3160 } else
3161 utime = rtime; 3171 utime = rtime;
3162 3172
3163 /* 3173 /*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3184 total = cputime.utime + cputime.stime; 3194 total = cputime.utime + cputime.stime;
3185 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3195 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3186 3196
3187 if (total) { 3197 if (total)
3188 u64 temp = (__force u64) rtime; 3198 utime = scale_utime(cputime.utime, rtime, total);
3189 3199 else
3190 temp *= (__force u64) cputime.utime;
3191 do_div(temp, (__force u32) total);
3192 utime = (__force cputime_t) temp;
3193 } else
3194 utime = rtime; 3200 utime = rtime;
3195 3201
3196 sig->prev_utime = max(sig->prev_utime, utime); 3202 sig->prev_utime = max(sig->prev_utime, utime);
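
Both callers above now funnel the cputime scaling through one helper, scale_utime(), which computes utime * rtime / total with a 64-bit multiply and a width-appropriate divide; the old open-coded version truncated total to 32 bits via do_div(). A userspace illustration of the arithmetic (plain uint64_t stands in for cputime_t):

/*
 * Userspace illustration: scale the measured runtime by the utime/total
 * ratio, entirely in 64-bit arithmetic.
 */
#include <inttypes.h>
#include <stdio.h>

static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
        return total ? (rtime * utime) / total : rtime;
}

int main(void)
{
        /* 60% of the real runtime is attributed to user time. */
        printf("%" PRIu64 "\n", scale_utime(600, 1000, 1000));   /* 600 */
        return 0;
}
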
@@ -4340,9 +4346,7 @@ recheck:
4340 */ 4346 */
4341 if (unlikely(policy == p->policy && (!rt_policy(policy) || 4347 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4342 param->sched_priority == p->rt_priority))) { 4348 param->sched_priority == p->rt_priority))) {
4343 4349 task_rq_unlock(rq, p, &flags);
4344 __task_rq_unlock(rq);
4345 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4346 return 0; 4350 return 0;
4347 } 4351 }
4348 4352
@@ -6024,6 +6028,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 6028 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6025 * allows us to avoid some pointer chasing select_idle_sibling(). 6029 * allows us to avoid some pointer chasing select_idle_sibling().
6026 * 6030 *
6031 * Iterate domains and sched_groups downward, assigning CPUs to be
6032 * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
6033 * due to random perturbation self canceling, ie sw buddies pull
6034 * their counterpart to their CPU's hw counterpart.
6035 *
6027 * Also keep a unique ID per domain (we use the first cpu number in 6036 * Also keep a unique ID per domain (we use the first cpu number in
6028 * the cpumask of the domain), this allows us to quickly tell if 6037 * the cpumask of the domain), this allows us to quickly tell if
6029 * two cpus are in the same cache domain, see cpus_share_cache(). 6038 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6046,40 @@ static void update_top_cache_domain(int cpu)
6037 int id = cpu; 6046 int id = cpu;
6038 6047
6039 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 6048 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6040 if (sd) 6049 if (sd) {
6050 struct sched_domain *tmp = sd;
6051 struct sched_group *sg, *prev;
6052 bool right;
6053
6054 /*
6055 * Traverse to first CPU in group, and count hops
6056 * to cpu from there, switching direction on each
6057 * hop, never ever pointing the last CPU rightward.
6058 */
6059 do {
6060 id = cpumask_first(sched_domain_span(tmp));
6061 prev = sg = tmp->groups;
6062 right = 1;
6063
6064 while (cpumask_first(sched_group_cpus(sg)) != id)
6065 sg = sg->next;
6066
6067 while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
6068 prev = sg;
6069 sg = sg->next;
6070 right = !right;
6071 }
6072
6073 /* A CPU went down, never point back to domain start. */
6074 if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
6075 right = false;
6076
6077 sg = right ? sg->next : prev;
6078 tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
6079 } while ((tmp = tmp->child));
6080
6041 id = cpumask_first(sched_domain_span(sd)); 6081 id = cpumask_first(sched_domain_span(sd));
6082 }
6042 6083
6043 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6084 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6044 per_cpu(sd_llc_id, cpu) = id; 6085 per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7138,66 @@ match2:
7097 mutex_unlock(&sched_domains_mutex); 7138 mutex_unlock(&sched_domains_mutex);
7098} 7139}
7099 7140
7141static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
7142
7100/* 7143/*
7101 * Update cpusets according to cpu_active mask. If cpusets are 7144 * Update cpusets according to cpu_active mask. If cpusets are
7102 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7145 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7103 * around partition_sched_domains(). 7146 * around partition_sched_domains().
7147 *
7148 * If we come here as part of a suspend/resume, don't touch cpusets because we
 7149 * want to restore them to their original state upon resume anyway.
7104 */ 7150 */
7105static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7151static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7106 void *hcpu) 7152 void *hcpu)
7107{ 7153{
7108 switch (action & ~CPU_TASKS_FROZEN) { 7154 switch (action) {
7155 case CPU_ONLINE_FROZEN:
7156 case CPU_DOWN_FAILED_FROZEN:
7157
7158 /*
 7159 * num_cpus_frozen tracks how many CPUs are involved in the suspend/
 7160 * resume sequence. As long as this is not the last online
7161 * operation in the resume sequence, just build a single sched
7162 * domain, ignoring cpusets.
7163 */
7164 num_cpus_frozen--;
7165 if (likely(num_cpus_frozen)) {
7166 partition_sched_domains(1, NULL, NULL);
7167 break;
7168 }
7169
7170 /*
7171 * This is the last CPU online operation. So fall through and
7172 * restore the original sched domains by considering the
7173 * cpuset configurations.
7174 */
7175
7109 case CPU_ONLINE: 7176 case CPU_ONLINE:
7110 case CPU_DOWN_FAILED: 7177 case CPU_DOWN_FAILED:
7111 cpuset_update_active_cpus(); 7178 cpuset_update_active_cpus(true);
7112 return NOTIFY_OK; 7179 break;
7113 default: 7180 default:
7114 return NOTIFY_DONE; 7181 return NOTIFY_DONE;
7115 } 7182 }
7183 return NOTIFY_OK;
7116} 7184}
7117 7185
7118static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7186static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7119 void *hcpu) 7187 void *hcpu)
7120{ 7188{
7121 switch (action & ~CPU_TASKS_FROZEN) { 7189 switch (action) {
7122 case CPU_DOWN_PREPARE: 7190 case CPU_DOWN_PREPARE:
7123 cpuset_update_active_cpus(); 7191 cpuset_update_active_cpus(false);
7124 return NOTIFY_OK; 7192 break;
7193 case CPU_DOWN_PREPARE_FROZEN:
7194 num_cpus_frozen++;
7195 partition_sched_domains(1, NULL, NULL);
7196 break;
7125 default: 7197 default:
7126 return NOTIFY_DONE; 7198 return NOTIFY_DONE;
7127 } 7199 }
7200 return NOTIFY_OK;
7128} 7201}
7129 7202
7130void __init sched_init_smp(void) 7203void __init sched_init_smp(void)
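
The notifier changes above hinge on a single counter, num_cpus_frozen: every CPU taken down for suspend increments it and collapses the scheduler to one flat domain, and on resume only the CPU that drops the counter back to zero rebuilds the cpuset-aware domains. A userspace sketch of that bookkeeping (function names are illustrative):

/*
 * Userspace sketch of the suspend/resume bookkeeping: intermediate CPUs
 * keep the single flat domain, the last one up triggers the full rebuild.
 */
#include <stdio.h>

static int num_cpus_frozen;

static void cpu_down_frozen(void)            /* suspend path */
{
        num_cpus_frozen++;
        printf("build single flat domain\n");
}

static void cpu_up_frozen(void)              /* resume path */
{
        if (--num_cpus_frozen) {
                printf("build single flat domain\n");
                return;
        }
        printf("last CPU up: rebuild domains from cpusets\n");
}

int main(void)
{
        cpu_down_frozen();
        cpu_down_frozen();
        cpu_up_frozen();
        cpu_up_frozen();        /* triggers the full rebuild */
        return 0;
}
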
@@ -7179,6 +7252,7 @@ int in_sched_functions(unsigned long addr)
7179 7252
7180#ifdef CONFIG_CGROUP_SCHED 7253#ifdef CONFIG_CGROUP_SCHED
7181struct task_group root_task_group; 7254struct task_group root_task_group;
7255LIST_HEAD(task_groups);
7182#endif 7256#endif
7183 7257
7184DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 7258DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
@@ -7589,6 +7663,7 @@ void sched_destroy_group(struct task_group *tg)
7589 */ 7663 */
7590void sched_move_task(struct task_struct *tsk) 7664void sched_move_task(struct task_struct *tsk)
7591{ 7665{
7666 struct task_group *tg;
7592 int on_rq, running; 7667 int on_rq, running;
7593 unsigned long flags; 7668 unsigned long flags;
7594 struct rq *rq; 7669 struct rq *rq;
@@ -7603,6 +7678,12 @@ void sched_move_task(struct task_struct *tsk)
7603 if (unlikely(running)) 7678 if (unlikely(running))
7604 tsk->sched_class->put_prev_task(rq, tsk); 7679 tsk->sched_class->put_prev_task(rq, tsk);
7605 7680
7681 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7682 lockdep_is_held(&tsk->sighand->siglock)),
7683 struct task_group, css);
7684 tg = autogroup_task_group(tsk, tg);
7685 tsk->sched_task_group = tg;
7686
7606#ifdef CONFIG_FAIR_GROUP_SCHED 7687#ifdef CONFIG_FAIR_GROUP_SCHED
7607 if (tsk->sched_class->task_move_group) 7688 if (tsk->sched_class->task_move_group)
7608 tsk->sched_class->task_move_group(tsk, on_rq); 7689 tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf660..23aa789c53ee 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
65int cpupri_find(struct cpupri *cp, struct task_struct *p, 65int cpupri_find(struct cpupri *cp, struct task_struct *p,
66 struct cpumask *lowest_mask) 66 struct cpumask *lowest_mask)
67{ 67{
68 int idx = 0; 68 int idx = 0;
69 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
70 70
71 if (task_pri >= MAX_RT_PRIO) 71 if (task_pri >= MAX_RT_PRIO)
72 return 0; 72 return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
137 */ 137 */
138void cpupri_set(struct cpupri *cp, int cpu, int newpri) 138void cpupri_set(struct cpupri *cp, int cpu, int newpri)
139{ 139{
140 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
141 int oldpri = *currpri; 141 int oldpri = *currpri;
142 int do_mb = 0; 142 int do_mb = 0;
143 143
144 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
145 145
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..c219bf8d704c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2637 int cpu = smp_processor_id(); 2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p); 2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642 2640
2643 /* 2641 /*
2644 * If the task is going to be woken-up on this cpu and if it is 2642 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
2655 return prev_cpu; 2653 return prev_cpu;
2656 2654
2657 /* 2655 /*
 2658 * Otherwise, iterate the domains and find an eligible idle cpu. 2656 * Otherwise, check assigned siblings to find an eligible idle cpu.
2659 */ 2657 */
2660 sd = rcu_dereference(per_cpu(sd_llc, target)); 2658 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672 2659
2673 target = cpumask_first_and(sched_group_cpus(sg), 2660 for_each_lower_domain(sd) {
2674 tsk_cpus_allowed(p)); 2661 if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
2675 goto done; 2662 continue;
2676next: 2663 if (idle_cpu(sd->idle_buddy))
2677 sg = sg->next; 2664 return sd->idle_buddy;
2678 } while (sg != sd->groups);
2679 } 2665 }
2680done: 2666
2681 return target; 2667 return target;
2682} 2668}
2683 2669
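
With the idle_buddy field populated by the update_top_cache_domain() hunk earlier, select_idle_sibling() above shrinks to a walk down the domain hierarchy testing one precomputed candidate per level instead of scanning every group. A simplified userspace sketch of that lookup (it omits the cpus_allowed check and uses illustrative names):

/*
 * Userspace sketch: each domain level carries one precomputed "idle buddy"
 * CPU, so wakeup only tests a handful of candidates.
 */
#include <stdbool.h>
#include <stdio.h>

struct domain {
        int idle_buddy;                 /* precomputed candidate CPU */
        struct domain *child;           /* next lower level, or NULL */
};

static bool cpu_is_idle(int cpu)        /* stand-in for idle_cpu() */
{
        return cpu == 3;
}

static int pick_idle_sibling(struct domain *sd, int target)
{
        for (; sd; sd = sd->child)
                if (cpu_is_idle(sd->idle_buddy))
                        return sd->idle_buddy;
        return target;                  /* fall back to the original target */
}

int main(void)
{
        struct domain core = { .idle_buddy = 3, .child = NULL };
        struct domain llc  = { .idle_buddy = 5, .child = &core };

        printf("picked CPU %d\n", pick_idle_sibling(&llc, 0));  /* 3 */
        return 0;
}
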
@@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068 3054
3069#define LBF_ALL_PINNED 0x01 3055#define LBF_ALL_PINNED 0x01
3070#define LBF_NEED_BREAK 0x02 3056#define LBF_NEED_BREAK 0x02
3057#define LBF_SOME_PINNED 0x04
3071 3058
3072struct lb_env { 3059struct lb_env {
3073 struct sched_domain *sd; 3060 struct sched_domain *sd;
3074 3061
3075 int src_cpu;
3076 struct rq *src_rq; 3062 struct rq *src_rq;
3063 int src_cpu;
3077 3064
3078 int dst_cpu; 3065 int dst_cpu;
3079 struct rq *dst_rq; 3066 struct rq *dst_rq;
3080 3067
3068 struct cpumask *dst_grpmask;
3069 int new_dst_cpu;
3081 enum cpu_idle_type idle; 3070 enum cpu_idle_type idle;
3082 long imbalance; 3071 long imbalance;
3072 /* The set of CPUs under consideration for load-balancing */
3073 struct cpumask *cpus;
3074
3083 unsigned int flags; 3075 unsigned int flags;
3084 3076
3085 unsigned int loop; 3077 unsigned int loop;
@@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3145 * 3) are cache-hot on their current CPU. 3137 * 3) are cache-hot on their current CPU.
3146 */ 3138 */
3147 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3139 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3140 int new_dst_cpu;
3141
3148 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3142 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3143
3144 /*
3145 * Remember if this task can be migrated to any other cpu in
3146 * our sched_group. We may want to revisit it if we couldn't
3147 * meet load balance goals by pulling other tasks on src_cpu.
3148 *
3149 * Also avoid computing new_dst_cpu if we have already computed
3150 * one in current iteration.
3151 */
3152 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3153 return 0;
3154
3155 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3156 tsk_cpus_allowed(p));
3157 if (new_dst_cpu < nr_cpu_ids) {
3158 env->flags |= LBF_SOME_PINNED;
3159 env->new_dst_cpu = new_dst_cpu;
3160 }
3149 return 0; 3161 return 0;
3150 } 3162 }
3163
 3164 /* Record that we found at least one task that could run on dst_cpu */
3151 env->flags &= ~LBF_ALL_PINNED; 3165 env->flags &= ~LBF_ALL_PINNED;
3152 3166
3153 if (task_running(env->src_rq, p)) { 3167 if (task_running(env->src_rq, p)) {
@@ -3373,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
3373 3387
3374static void update_h_load(long cpu) 3388static void update_h_load(long cpu)
3375{ 3389{
3390 struct rq *rq = cpu_rq(cpu);
3391 unsigned long now = jiffies;
3392
3393 if (rq->h_load_throttle == now)
3394 return;
3395
3396 rq->h_load_throttle = now;
3397
3376 rcu_read_lock(); 3398 rcu_read_lock();
3377 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 3399 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3378 rcu_read_unlock(); 3400 rcu_read_unlock();
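
The update_h_load() hunk above adds a cheap throttle: the hierarchical load is recomputed at most once per jiffy per runqueue, by stamping rq->h_load_throttle and bailing out if the stamp already matches. A userspace sketch of the same once-per-tick guard, using time() as a stand-in for jiffies:

/*
 * Userspace sketch of a once-per-tick throttle: remember the timestamp of
 * the last recomputation and return early within the same tick.
 */
#include <stdio.h>
#include <time.h>

static unsigned long now(void)                  /* coarse "tick" counter */
{
        return (unsigned long)time(NULL);
}

static unsigned long last_update;

static void update_hierarchical_load(void)
{
        unsigned long tick = now();

        if (last_update == tick)
                return;                         /* already refreshed this tick */
        last_update = tick;
        printf("recomputing hierarchical load\n");
}

int main(void)
{
        update_hierarchical_load();     /* does the work */
        update_hierarchical_load();     /* throttled: same tick, returns early */
        return 0;
}
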
@@ -3642,8 +3664,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3642 */ 3664 */
3643static inline void update_sg_lb_stats(struct lb_env *env, 3665static inline void update_sg_lb_stats(struct lb_env *env,
3644 struct sched_group *group, int load_idx, 3666 struct sched_group *group, int load_idx,
3645 int local_group, const struct cpumask *cpus, 3667 int local_group, int *balance, struct sg_lb_stats *sgs)
3646 int *balance, struct sg_lb_stats *sgs)
3647{ 3668{
3648 unsigned long nr_running, max_nr_running, min_nr_running; 3669 unsigned long nr_running, max_nr_running, min_nr_running;
3649 unsigned long load, max_cpu_load, min_cpu_load; 3670 unsigned long load, max_cpu_load, min_cpu_load;
@@ -3660,7 +3681,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3660 max_nr_running = 0; 3681 max_nr_running = 0;
3661 min_nr_running = ~0UL; 3682 min_nr_running = ~0UL;
3662 3683
3663 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3684 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
3664 struct rq *rq = cpu_rq(i); 3685 struct rq *rq = cpu_rq(i);
3665 3686
3666 nr_running = rq->nr_running; 3687 nr_running = rq->nr_running;
@@ -3789,8 +3810,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3789 * @sds: variable to hold the statistics for this sched_domain. 3810 * @sds: variable to hold the statistics for this sched_domain.
3790 */ 3811 */
3791static inline void update_sd_lb_stats(struct lb_env *env, 3812static inline void update_sd_lb_stats(struct lb_env *env,
3792 const struct cpumask *cpus, 3813 int *balance, struct sd_lb_stats *sds)
3793 int *balance, struct sd_lb_stats *sds)
3794{ 3814{
3795 struct sched_domain *child = env->sd->child; 3815 struct sched_domain *child = env->sd->child;
3796 struct sched_group *sg = env->sd->groups; 3816 struct sched_group *sg = env->sd->groups;
@@ -3807,8 +3827,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3807 3827
3808 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 3828 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3809 memset(&sgs, 0, sizeof(sgs)); 3829 memset(&sgs, 0, sizeof(sgs));
3810 update_sg_lb_stats(env, sg, load_idx, local_group, 3830 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
3811 cpus, balance, &sgs);
3812 3831
3813 if (local_group && !(*balance)) 3832 if (local_group && !(*balance))
3814 return; 3833 return;
@@ -4044,7 +4063,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4044 * to restore balance. 4063 * to restore balance.
4045 * 4064 *
4046 * @env: The load balancing environment. 4065 * @env: The load balancing environment.
4047 * @cpus: The set of CPUs under consideration for load-balancing.
4048 * @balance: Pointer to a variable indicating if this_cpu 4066 * @balance: Pointer to a variable indicating if this_cpu
4049 * is the appropriate cpu to perform load balancing at this_level. 4067 * is the appropriate cpu to perform load balancing at this_level.
4050 * 4068 *
@@ -4054,7 +4072,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4054 * put to idle by rebalancing its tasks onto our group. 4072 * put to idle by rebalancing its tasks onto our group.
4055 */ 4073 */
4056static struct sched_group * 4074static struct sched_group *
4057find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) 4075find_busiest_group(struct lb_env *env, int *balance)
4058{ 4076{
4059 struct sd_lb_stats sds; 4077 struct sd_lb_stats sds;
4060 4078
@@ -4064,7 +4082,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
 4064 * Compute the various statistics relevant for load balancing at 4082
4065 * this level. 4083 * this level.
4066 */ 4084 */
4067 update_sd_lb_stats(env, cpus, balance, &sds); 4085 update_sd_lb_stats(env, balance, &sds);
4068 4086
4069 /* 4087 /*
4070 * this_cpu is not the appropriate cpu to perform load balancing at 4088 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4144,8 +4162,7 @@ ret:
4144 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4162 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4145 */ 4163 */
4146static struct rq *find_busiest_queue(struct lb_env *env, 4164static struct rq *find_busiest_queue(struct lb_env *env,
4147 struct sched_group *group, 4165 struct sched_group *group)
4148 const struct cpumask *cpus)
4149{ 4166{
4150 struct rq *busiest = NULL, *rq; 4167 struct rq *busiest = NULL, *rq;
4151 unsigned long max_load = 0; 4168 unsigned long max_load = 0;
@@ -4160,7 +4177,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4160 if (!capacity) 4177 if (!capacity)
4161 capacity = fix_small_capacity(env->sd, group); 4178 capacity = fix_small_capacity(env->sd, group);
4162 4179
4163 if (!cpumask_test_cpu(i, cpus)) 4180 if (!cpumask_test_cpu(i, env->cpus))
4164 continue; 4181 continue;
4165 4182
4166 rq = cpu_rq(i); 4183 rq = cpu_rq(i);
@@ -4227,7 +4244,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4227 struct sched_domain *sd, enum cpu_idle_type idle, 4244 struct sched_domain *sd, enum cpu_idle_type idle,
4228 int *balance) 4245 int *balance)
4229{ 4246{
4230 int ld_moved, active_balance = 0; 4247 int ld_moved, cur_ld_moved, active_balance = 0;
4248 int lb_iterations, max_lb_iterations;
4231 struct sched_group *group; 4249 struct sched_group *group;
4232 struct rq *busiest; 4250 struct rq *busiest;
4233 unsigned long flags; 4251 unsigned long flags;
@@ -4237,16 +4255,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4237 .sd = sd, 4255 .sd = sd,
4238 .dst_cpu = this_cpu, 4256 .dst_cpu = this_cpu,
4239 .dst_rq = this_rq, 4257 .dst_rq = this_rq,
4258 .dst_grpmask = sched_group_cpus(sd->groups),
4240 .idle = idle, 4259 .idle = idle,
4241 .loop_break = sched_nr_migrate_break, 4260 .loop_break = sched_nr_migrate_break,
4261 .cpus = cpus,
4242 }; 4262 };
4243 4263
4244 cpumask_copy(cpus, cpu_active_mask); 4264 cpumask_copy(cpus, cpu_active_mask);
4265 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4245 4266
4246 schedstat_inc(sd, lb_count[idle]); 4267 schedstat_inc(sd, lb_count[idle]);
4247 4268
4248redo: 4269redo:
4249 group = find_busiest_group(&env, cpus, balance); 4270 group = find_busiest_group(&env, balance);
4250 4271
4251 if (*balance == 0) 4272 if (*balance == 0)
4252 goto out_balanced; 4273 goto out_balanced;
@@ -4256,7 +4277,7 @@ redo:
4256 goto out_balanced; 4277 goto out_balanced;
4257 } 4278 }
4258 4279
4259 busiest = find_busiest_queue(&env, group, cpus); 4280 busiest = find_busiest_queue(&env, group);
4260 if (!busiest) { 4281 if (!busiest) {
4261 schedstat_inc(sd, lb_nobusyq[idle]); 4282 schedstat_inc(sd, lb_nobusyq[idle]);
4262 goto out_balanced; 4283 goto out_balanced;
@@ -4267,6 +4288,7 @@ redo:
4267 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4288 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4268 4289
4269 ld_moved = 0; 4290 ld_moved = 0;
4291 lb_iterations = 1;
4270 if (busiest->nr_running > 1) { 4292 if (busiest->nr_running > 1) {
4271 /* 4293 /*
4272 * Attempt to move tasks. If find_busiest_group has found 4294 * Attempt to move tasks. If find_busiest_group has found
@@ -4279,12 +4301,17 @@ redo:
4279 env.src_rq = busiest; 4301 env.src_rq = busiest;
4280 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 4302 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4281 4303
4304 update_h_load(env.src_cpu);
4282more_balance: 4305more_balance:
4283 local_irq_save(flags); 4306 local_irq_save(flags);
4284 double_rq_lock(this_rq, busiest); 4307 double_rq_lock(this_rq, busiest);
4285 if (!env.loop) 4308
4286 update_h_load(env.src_cpu); 4309 /*
4287 ld_moved += move_tasks(&env); 4310 * cur_ld_moved - load moved in current iteration
4311 * ld_moved - cumulative load moved across iterations
4312 */
4313 cur_ld_moved = move_tasks(&env);
4314 ld_moved += cur_ld_moved;
4288 double_rq_unlock(this_rq, busiest); 4315 double_rq_unlock(this_rq, busiest);
4289 local_irq_restore(flags); 4316 local_irq_restore(flags);
4290 4317
@@ -4296,14 +4323,52 @@ more_balance:
4296 /* 4323 /*
4297 * some other cpu did the load balance for us. 4324 * some other cpu did the load balance for us.
4298 */ 4325 */
4299 if (ld_moved && this_cpu != smp_processor_id()) 4326 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4300 resched_cpu(this_cpu); 4327 resched_cpu(env.dst_cpu);
4328
4329 /*
4330 * Revisit (affine) tasks on src_cpu that couldn't be moved to
4331 * us and move them to an alternate dst_cpu in our sched_group
4332 * where they can run. The upper limit on how many times we
4333 * iterate on same src_cpu is dependent on number of cpus in our
4334 * sched_group.
4335 *
4336 * This changes load balance semantics a bit on who can move
4337 * load to a given_cpu. In addition to the given_cpu itself
 4338 * (or an ilb_cpu acting on its behalf where given_cpu is
4339 * nohz-idle), we now have balance_cpu in a position to move
4340 * load to given_cpu. In rare situations, this may cause
4341 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
4342 * _independently_ and at _same_ time to move some load to
 4343 * given_cpu) causing excess load to be moved to given_cpu.
4344 * This however should not happen so much in practice and
4345 * moreover subsequent load balance cycles should correct the
4346 * excess load moved.
4347 */
4348 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4349 lb_iterations++ < max_lb_iterations) {
4350
4351 this_rq = cpu_rq(env.new_dst_cpu);
4352 env.dst_rq = this_rq;
4353 env.dst_cpu = env.new_dst_cpu;
4354 env.flags &= ~LBF_SOME_PINNED;
4355 env.loop = 0;
4356 env.loop_break = sched_nr_migrate_break;
4357 /*
4358 * Go back to "more_balance" rather than "redo" since we
4359 * need to continue with same src_cpu.
4360 */
4361 goto more_balance;
4362 }
4301 4363
4302 /* All tasks on this runqueue were pinned by CPU affinity */ 4364 /* All tasks on this runqueue were pinned by CPU affinity */
4303 if (unlikely(env.flags & LBF_ALL_PINNED)) { 4365 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4304 cpumask_clear_cpu(cpu_of(busiest), cpus); 4366 cpumask_clear_cpu(cpu_of(busiest), cpus);
4305 if (!cpumask_empty(cpus)) 4367 if (!cpumask_empty(cpus)) {
4368 env.loop = 0;
4369 env.loop_break = sched_nr_migrate_break;
4306 goto redo; 4370 goto redo;
4371 }
4307 goto out_balanced; 4372 goto out_balanced;
4308 } 4373 }
4309 } 4374 }
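
The load_balance() hunks above implement the LBF_SOME_PINNED retry described in the long comment: if pinned tasks could not move to the chosen destination but are allowed on another CPU of the destination group, the environment is redirected to that CPU and the pull is retried, bounded by the group size. A condensed userspace sketch of that control flow (pull_tasks() and all names are illustrative stand-ins):

/*
 * Userspace sketch of the bounded retry: note an alternate destination,
 * clear the flag, and retry the pull toward it.
 */
#include <stdbool.h>
#include <stdio.h>

struct env {
        int dst_cpu;
        int new_dst_cpu;
        bool some_pinned;       /* a task could run on new_dst_cpu instead */
        int imbalance;
};

/* Pretend the first attempt moves nothing and suggests CPU 2 instead. */
static int pull_tasks(struct env *env)
{
        if (env->dst_cpu == 1) {
                env->some_pinned = true;
                env->new_dst_cpu = 2;
                return 0;
        }
        env->imbalance = 0;
        return 1;
}

int main(void)
{
        struct env env = { .dst_cpu = 1, .imbalance = 4 };
        int moved = 0, iterations = 1, max_iterations = 2;      /* group size */

more_balance:
        moved += pull_tasks(&env);
        if (env.some_pinned && env.imbalance > 0 &&
            iterations++ < max_iterations) {
                env.dst_cpu = env.new_dst_cpu;  /* redirect to a sibling CPU */
                env.some_pinned = false;
                goto more_balance;
        }
        printf("moved %d task(s), final dst_cpu %d\n", moved, env.dst_cpu);
        return 0;
}
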
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..944cb68420e9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
788 const struct cpumask *span; 788 const struct cpumask *span;
789 789
790 span = sched_rt_period_mask(); 790 span = sched_rt_period_mask();
791#ifdef CONFIG_RT_GROUP_SCHED
792 /*
793 * FIXME: isolated CPUs should really leave the root task group,
794 * whether they are isolcpus or were isolated via cpusets, lest
795 * the timer run on a CPU which does not service all runqueues,
796 * potentially leaving other CPUs indefinitely throttled. If
797 * isolation is really required, the user will turn the throttle
798 * off to kill the perturbations it causes anyway. Meanwhile,
799 * this maintains functionality for boot and/or troubleshooting.
800 */
801 if (rt_b == &root_task_group.rt_bandwidth)
802 span = cpu_online_mask;
803#endif
791 for_each_cpu(i, span) { 804 for_each_cpu(i, span) {
792 int enqueue = 0; 805 int enqueue = 0;
793 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 806 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..f6714d009e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
80struct cfs_rq; 80struct cfs_rq;
81struct rt_rq; 81struct rt_rq;
82 82
83static LIST_HEAD(task_groups); 83extern struct list_head task_groups;
84 84
85struct cfs_bandwidth { 85struct cfs_bandwidth {
86#ifdef CONFIG_CFS_BANDWIDTH 86#ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
374#ifdef CONFIG_FAIR_GROUP_SCHED 374#ifdef CONFIG_FAIR_GROUP_SCHED
375 /* list of leaf cfs_rq on this cpu: */ 375 /* list of leaf cfs_rq on this cpu: */
376 struct list_head leaf_cfs_rq_list; 376 struct list_head leaf_cfs_rq_list;
377#endif 377#ifdef CONFIG_SMP
378 unsigned long h_load_throttle;
379#endif /* CONFIG_SMP */
380#endif /* CONFIG_FAIR_GROUP_SCHED */
381
378#ifdef CONFIG_RT_GROUP_SCHED 382#ifdef CONFIG_RT_GROUP_SCHED
379 struct list_head leaf_rt_rq_list; 383 struct list_head leaf_rt_rq_list;
380#endif 384#endif
@@ -538,22 +542,19 @@ extern int group_balance_cpu(struct sched_group *sg);
538/* 542/*
539 * Return the group to which this tasks belongs. 543 * Return the group to which this tasks belongs.
540 * 544 *
541 * We use task_subsys_state_check() and extend the RCU verification with 545 * We cannot use task_subsys_state() and friends because the cgroup
542 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 546 * subsystem changes that value before the cgroup_subsys::attach() method
543 * task it moves into the cgroup. Therefore by holding either of those locks, 547 * is called, therefore we cannot pin it and might observe the wrong value.
544 * we pin the task to the current cgroup. 548 *
549 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
550 * core changes this before calling sched_move_task().
551 *
552 * Instead we use a 'copy' which is updated from sched_move_task() while
553 * holding both task_struct::pi_lock and rq::lock.
545 */ 554 */
546static inline struct task_group *task_group(struct task_struct *p) 555static inline struct task_group *task_group(struct task_struct *p)
547{ 556{
548 struct task_group *tg; 557 return p->sched_task_group;
549 struct cgroup_subsys_state *css;
550
551 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
552 lockdep_is_held(&p->pi_lock) ||
553 lockdep_is_held(&task_rq(p)->lock));
554 tg = container_of(css, struct task_group, css);
555
556 return autogroup_task_group(p, tg);
557} 558}
558 559
559/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 560/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
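
The sched.h and sched_move_task() hunks above replace the cgroup lookup in task_group() with a scheduler-owned copy, p->sched_task_group, written only under pi_lock and rq->lock; readers then never observe the transient value the cgroup core installs before ->attach() runs. A userspace sketch of that cached-copy approach (locking elided; names illustrative):

/*
 * Userspace sketch: the move path updates a private copy under the proper
 * locks, and the fast-path reader just returns it.
 */
#include <stdio.h>

struct task_group { const char *name; };

struct task {
        struct task_group *sched_task_group;    /* copy owned by the scheduler */
};

/* Updated only from the move path, with the relevant locks held. */
static void move_task(struct task *p, struct task_group *tg)
{
        p->sched_task_group = tg;
}

static struct task_group *task_group(struct task *p)
{
        return p->sched_task_group;             /* no cgroup lookup needed */
}

int main(void)
{
        struct task_group root = { "root" }, batch = { "batch" };
        struct task p = { &root };

        move_task(&p, &batch);
        printf("%s\n", task_group(&p)->name);   /* batch */
        return 0;
}
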
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
27{ 27{
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task;
31 return stop; 32 return stop;
33 }
32 34
33 return NULL; 35 return NULL;
34} 36}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
52 54
53static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) 55static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
54{ 56{
57 struct task_struct *curr = rq->curr;
58 u64 delta_exec;
59
60 delta_exec = rq->clock_task - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0;
63
64 schedstat_set(curr->se.statistics.exec_max,
65 max(curr->se.statistics.exec_max, delta_exec));
66
67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec);
69
70 curr->se.exec_start = rq->clock_task;
71 cpuacct_charge(curr, delta_exec);
55} 72}
56 73
57static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 74static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
60 77
61static void set_curr_task_stop(struct rq *rq) 78static void set_curr_task_stop(struct rq *rq)
62{ 79{
80 struct task_struct *stop = rq->stop;
81
82 stop->se.exec_start = rq->clock_task;
63} 83}
64 84
65static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)
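
The stop_task hunks above give the stop class the same runtime accounting as other classes: exec_start is stamped when the task is picked or set current, and put_prev accumulates the elapsed clock_task delta, clamping a negative difference to zero. A userspace sketch of that bookkeeping with plain nanosecond counters:

/*
 * Userspace sketch: stamp on pick, accumulate the delta on put_prev,
 * ignoring an apparently negative delta.
 */
#include <inttypes.h>
#include <stdio.h>

struct entity {
        uint64_t exec_start;            /* ns timestamp when last picked */
        uint64_t sum_exec_runtime;      /* total accumulated ns */
};

static void pick(struct entity *se, uint64_t clock_ns)
{
        se->exec_start = clock_ns;
}

static void put_prev(struct entity *se, uint64_t clock_ns)
{
        int64_t delta = (int64_t)(clock_ns - se->exec_start);

        if (delta < 0)                  /* clock went backwards: ignore */
                delta = 0;
        se->sum_exec_runtime += (uint64_t)delta;
        se->exec_start = clock_ns;
}

int main(void)
{
        struct entity se = { 0, 0 };

        pick(&se, 1000);
        put_prev(&se, 1500);
        printf("%" PRIu64 " ns accounted\n", se.sum_exec_runtime);  /* 500 */
        return 0;
}
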
diff --git a/kernel/signal.c b/kernel/signal.c
index 677102789cf2..be4f856d52f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1971void ptrace_notify(int exit_code) 1971void ptrace_notify(int exit_code)
1972{ 1972{
1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); 1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1974 if (unlikely(current->task_works)) {
1975 if (test_and_clear_ti_thread_flag(current_thread_info(),
1976 TIF_NOTIFY_RESUME)) {
1977 smp_mb__after_clear_bit();
1978 task_work_run();
1979 }
1980 }
1974 1981
1975 spin_lock_irq(&current->sighand->siglock); 1982 spin_lock_irq(&current->sighand->siglock);
1976 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); 1983 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2191 struct signal_struct *signal = current->signal; 2198 struct signal_struct *signal = current->signal;
2192 int signr; 2199 int signr;
2193 2200
2201 if (unlikely(current->task_works)) {
2202 if (test_and_clear_ti_thread_flag(current_thread_info(),
2203 TIF_NOTIFY_RESUME)) {
2204 smp_mb__after_clear_bit();
2205 task_work_run();
2206 }
2207 }
2208
2194 if (unlikely(uprobe_deny_signal())) 2209 if (unlikely(uprobe_deny_signal()))
2195 return 0; 2210 return 0;
2196 2211
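
Both signal.c hunks above flush pending task works before notifying the tracer or delivering a signal: if any callbacks are queued, TIF_NOTIFY_RESUME is atomically cleared (with a barrier) and task_work_run() is called. A rough userspace analogue of that test-and-clear-then-run step, using a C11 atomic flag in place of the thread-info bit:

/*
 * Userspace sketch: atomically consume the "notify" flag, then drain the
 * deferred work before continuing with signal delivery.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool notify_pending;
static int queued_works = 1;

static void run_task_works(void)
{
        while (queued_works) {
                printf("running deferred work\n");
                queued_works--;
        }
}

static void before_signal_delivery(void)
{
        /* test-and-clear with a full barrier, like test_and_clear_bit() */
        if (queued_works && atomic_exchange(&notify_pending, false))
                run_task_works();
        printf("delivering signal\n");
}

int main(void)
{
        atomic_store(&notify_pending, true);
        before_signal_delivery();
        return 0;
}
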
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae5b24875e..29dd40a9f2f4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
581 return 0; 581 return 0;
582} 582}
583EXPORT_SYMBOL(smp_call_function); 583EXPORT_SYMBOL(smp_call_function);
584
585void ipi_call_lock(void)
586{
587 raw_spin_lock(&call_function.lock);
588}
589
590void ipi_call_unlock(void)
591{
592 raw_spin_unlock(&call_function.lock);
593}
594
595void ipi_call_lock_irq(void)
596{
597 raw_spin_lock_irq(&call_function.lock);
598}
599
600void ipi_call_unlock_irq(void)
601{
602 raw_spin_unlock_irq(&call_function.lock);
603}
604#endif /* USE_GENERIC_SMP_HELPERS */ 584#endif /* USE_GENERIC_SMP_HELPERS */
605 585
606/* Setup configured maximum number of CPUs to activate */ 586/* Setup configured maximum number of CPUs to activate */
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb8472..6ef9433e1c70 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -3,8 +3,6 @@
3 3
4struct task_struct; 4struct task_struct;
5 5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 6#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu); 7struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void); 8void idle_thread_set_boot_cpu(void);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
214
215 /*
 216 * Mask out PF_MEMALLOC as the current task context is borrowed for the
 217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
 218 * again if the socket is related to swap
219 */
220 current->flags &= ~PF_MEMALLOC;
213 221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268} 277}
269 278
270#ifndef __ARCH_HAS_DO_SOFTIRQ 279#ifndef __ARCH_HAS_DO_SOFTIRQ
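
The __do_softirq() hunk above borrows the interrupted task's context, so it snapshots current->flags, clears PF_MEMALLOC for the duration of the handlers, and afterwards restores only that bit via tsk_restore_flags(). A userspace sketch of the save/mask/restore pattern on a plain flags word:

/*
 * Userspace sketch: clear one flag bit for the duration of a borrowed
 * context, then restore only that bit to its previous value.
 */
#include <stdio.h>

#define PF_MEMALLOC 0x0800

static unsigned int current_flags = PF_MEMALLOC | 0x1;

static void restore_flags(unsigned int old, unsigned int mask)
{
        current_flags &= ~mask;
        current_flags |= old & mask;    /* put back only the masked bit(s) */
}

static void do_softirq_body(void)
{
        unsigned int old = current_flags;

        current_flags &= ~PF_MEMALLOC;  /* don't inherit the caller's reserve */
        printf("handlers run with flags %#x\n", current_flags);
        restore_flags(old, PF_MEMALLOC);
}

int main(void)
{
        do_softirq_body();
        printf("flags restored to %#x\n", current_flags);
        return 0;
}
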
diff --git a/kernel/sys.c b/kernel/sys.c
index 2d39a84cd857..241507f23eca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2015 break; 2015 break;
2016 } 2016 }
2017 me->pdeath_signal = arg2; 2017 me->pdeath_signal = arg2;
2018 error = 0;
2019 break; 2018 break;
2020 case PR_GET_PDEATHSIG: 2019 case PR_GET_PDEATHSIG:
2021 error = put_user(me->pdeath_signal, (int __user *)arg2); 2020 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2029 break; 2028 break;
2030 } 2029 }
2031 set_dumpable(me->mm, arg2); 2030 set_dumpable(me->mm, arg2);
2032 error = 0;
2033 break; 2031 break;
2034 2032
2035 case PR_SET_UNALIGN: 2033 case PR_SET_UNALIGN:
@@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2056 case PR_SET_TIMING: 2054 case PR_SET_TIMING:
2057 if (arg2 != PR_TIMING_STATISTICAL) 2055 if (arg2 != PR_TIMING_STATISTICAL)
2058 error = -EINVAL; 2056 error = -EINVAL;
2059 else
2060 error = 0;
2061 break; 2057 break;
2062
2063 case PR_SET_NAME: 2058 case PR_SET_NAME:
2064 comm[sizeof(me->comm)-1] = 0; 2059 comm[sizeof(me->comm)-1] = 0;
2065 if (strncpy_from_user(comm, (char __user *)arg2, 2060 if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 return -EFAULT; 2062 return -EFAULT;
2068 set_task_comm(me, comm); 2063 set_task_comm(me, comm);
2069 proc_comm_connector(me); 2064 proc_comm_connector(me);
2070 return 0; 2065 break;
2071 case PR_GET_NAME: 2066 case PR_GET_NAME:
2072 get_task_comm(comm, me); 2067 get_task_comm(comm, me);
2073 if (copy_to_user((char __user *)arg2, comm, 2068 if (copy_to_user((char __user *)arg2, comm,
2074 sizeof(comm))) 2069 sizeof(comm)))
2075 return -EFAULT; 2070 return -EFAULT;
2076 return 0; 2071 break;
2077 case PR_GET_ENDIAN: 2072 case PR_GET_ENDIAN:
2078 error = GET_ENDIAN(me, arg2); 2073 error = GET_ENDIAN(me, arg2);
2079 break; 2074 break;
2080 case PR_SET_ENDIAN: 2075 case PR_SET_ENDIAN:
2081 error = SET_ENDIAN(me, arg2); 2076 error = SET_ENDIAN(me, arg2);
2082 break; 2077 break;
2083
2084 case PR_GET_SECCOMP: 2078 case PR_GET_SECCOMP:
2085 error = prctl_get_seccomp(); 2079 error = prctl_get_seccomp();
2086 break; 2080 break;
@@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2108 current->default_timer_slack_ns; 2102 current->default_timer_slack_ns;
2109 else 2103 else
2110 current->timer_slack_ns = arg2; 2104 current->timer_slack_ns = arg2;
2111 error = 0;
2112 break; 2105 break;
2113 case PR_MCE_KILL: 2106 case PR_MCE_KILL:
2114 if (arg4 | arg5) 2107 if (arg4 | arg5)
@@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2134 default: 2127 default:
2135 return -EINVAL; 2128 return -EINVAL;
2136 } 2129 }
2137 error = 0;
2138 break; 2130 break;
2139 case PR_MCE_KILL_GET: 2131 case PR_MCE_KILL_GET:
2140 if (arg2 | arg3 | arg4 | arg5) 2132 if (arg2 | arg3 | arg4 | arg5)
@@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2153 break; 2145 break;
2154 case PR_SET_CHILD_SUBREAPER: 2146 case PR_SET_CHILD_SUBREAPER:
2155 me->signal->is_child_subreaper = !!arg2; 2147 me->signal->is_child_subreaper = !!arg2;
2156 error = 0;
2157 break; 2148 break;
2158 case PR_GET_CHILD_SUBREAPER: 2149 case PR_GET_CHILD_SUBREAPER:
2159 error = put_user(me->signal->is_child_subreaper, 2150 error = put_user(me->signal->is_child_subreaper,
@@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
2195 argv_free(info->argv); 2186 argv_free(info->argv);
2196} 2187}
2197 2188
2198/** 2189static int __orderly_poweroff(void)
2199 * orderly_poweroff - Trigger an orderly system poweroff
2200 * @force: force poweroff if command execution fails
2201 *
2202 * This may be called from any context to trigger a system shutdown.
2203 * If the orderly shutdown fails, it will force an immediate shutdown.
2204 */
2205int orderly_poweroff(bool force)
2206{ 2190{
2207 int argc; 2191 int argc;
2208 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2192 char **argv;
2209 static char *envp[] = { 2193 static char *envp[] = {
2210 "HOME=/", 2194 "HOME=/",
2211 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 2195 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2212 NULL 2196 NULL
2213 }; 2197 };
2214 int ret = -ENOMEM; 2198 int ret;
2215 2199
2200 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2216 if (argv == NULL) { 2201 if (argv == NULL) {
2217 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2202 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2218 __func__, poweroff_cmd); 2203 __func__, poweroff_cmd);
2219 goto out; 2204 return -ENOMEM;
2220 } 2205 }
2221 2206
2222 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2223 NULL, argv_cleanup, NULL); 2208 NULL, argv_cleanup, NULL);
2224out:
2225 if (likely(!ret))
2226 return 0;
2227
2228 if (ret == -ENOMEM) 2209 if (ret == -ENOMEM)
2229 argv_free(argv); 2210 argv_free(argv);
2230 2211
2231 if (force) { 2212 return ret;
2213}
2214
2215/**
2216 * orderly_poweroff - Trigger an orderly system poweroff
2217 * @force: force poweroff if command execution fails
2218 *
2219 * This may be called from any context to trigger a system shutdown.
2220 * If the orderly shutdown fails, it will force an immediate shutdown.
2221 */
2222int orderly_poweroff(bool force)
2223{
2224 int ret = __orderly_poweroff();
2225
2226 if (ret && force) {
2232 printk(KERN_WARNING "Failed to start orderly shutdown: " 2227 printk(KERN_WARNING "Failed to start orderly shutdown: "
2233 "forcing the issue\n"); 2228 "forcing the issue\n");
2234 2229
2235 /* I guess this should try to kick off some daemon to 2230 /*
2236 sync and poweroff asap. Or not even bother syncing 2231 * I guess this should try to kick off some daemon to sync and
2237 if we're doing an emergency shutdown? */ 2232 * poweroff asap. Or not even bother syncing if we're doing an
2233 * emergency shutdown?
2234 */
2238 emergency_sync(); 2235 emergency_sync();
2239 kernel_power_off(); 2236 kernel_power_off();
2240 } 2237 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..87174ef59161 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 32#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/kernel.h> 36#include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, loff_t *ppos); 175 void __user *buffer, size_t *lenp, loff_t *ppos);
175#endif 176#endif
176 177
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos);
180static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182
177#ifdef CONFIG_MAGIC_SYSRQ 183#ifdef CONFIG_MAGIC_SYSRQ
178/* Note: sysrq code uses it's own private copy */ 184/* Note: sysrq code uses it's own private copy */
179static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 185static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
410 .data = core_pattern, 416 .data = core_pattern,
411 .maxlen = CORENAME_MAX_SIZE, 417 .maxlen = CORENAME_MAX_SIZE,
412 .mode = 0644, 418 .mode = 0644,
413 .proc_handler = proc_dostring, 419 .proc_handler = proc_dostring_coredump,
414 }, 420 },
415 { 421 {
416 .procname = "core_pipe_limit", 422 .procname = "core_pipe_limit",
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = {
1095 .extra1 = &zero, 1101 .extra1 = &zero,
1096 }, 1102 },
1097 { 1103 {
1098 .procname = "nr_pdflush_threads", 1104 .procname = "nr_pdflush_threads",
1099 .data = &nr_pdflush_threads, 1105 .mode = 0444 /* read-only */,
1100 .maxlen = sizeof nr_pdflush_threads, 1106 .proc_handler = pdflush_proc_obsolete,
1101 .mode = 0444 /* read-only*/,
1102 .proc_handler = proc_dointvec,
1103 }, 1107 },
1104 { 1108 {
1105 .procname = "swappiness", 1109 .procname = "swappiness",
@@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = {
1494#endif 1498#endif
1495#endif 1499#endif
1496 { 1500 {
1501 .procname = "protected_symlinks",
1502 .data = &sysctl_protected_symlinks,
1503 .maxlen = sizeof(int),
1504 .mode = 0600,
1505 .proc_handler = proc_dointvec_minmax,
1506 .extra1 = &zero,
1507 .extra2 = &one,
1508 },
1509 {
1510 .procname = "protected_hardlinks",
1511 .data = &sysctl_protected_hardlinks,
1512 .maxlen = sizeof(int),
1513 .mode = 0600,
1514 .proc_handler = proc_dointvec_minmax,
1515 .extra1 = &zero,
1516 .extra2 = &one,
1517 },
1518 {
1497 .procname = "suid_dumpable", 1519 .procname = "suid_dumpable",
1498 .data = &suid_dumpable, 1520 .data = &suid_dumpable,
1499 .maxlen = sizeof(int), 1521 .maxlen = sizeof(int),
1500 .mode = 0644, 1522 .mode = 0644,
1501 .proc_handler = proc_dointvec_minmax, 1523 .proc_handler = proc_dointvec_minmax_coredump,
1502 .extra1 = &zero, 1524 .extra1 = &zero,
1503 .extra2 = &two, 1525 .extra2 = &two,
1504 }, 1526 },
@@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = {
1551 1573
1552int __init sysctl_init(void) 1574int __init sysctl_init(void)
1553{ 1575{
1554 register_sysctl_table(sysctl_base_table); 1576 struct ctl_table_header *hdr;
1577
1578 hdr = register_sysctl_table(sysctl_base_table);
1579 kmemleak_not_leak(hdr);
1555 return 0; 1580 return 0;
1556} 1581}
1557 1582
@@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2009 do_proc_dointvec_minmax_conv, &param); 2034 do_proc_dointvec_minmax_conv, &param);
2010} 2035}
2011 2036
2037static void validate_coredump_safety(void)
2038{
2039 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2040 core_pattern[0] != '/' && core_pattern[0] != '|') {
2041 printk(KERN_WARNING "Unsafe core_pattern used with "\
2042 "suid_dumpable=2. Pipe handler or fully qualified "\
2043 "core dump path required.\n");
2044 }
2045}
2046
2047static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2048 void __user *buffer, size_t *lenp, loff_t *ppos)
2049{
2050 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2051 if (!error)
2052 validate_coredump_safety();
2053 return error;
2054}
2055
2056static int proc_dostring_coredump(struct ctl_table *table, int write,
2057 void __user *buffer, size_t *lenp, loff_t *ppos)
2058{
2059 int error = proc_dostring(table, write, buffer, lenp, ppos);
2060 if (!error)
2061 validate_coredump_safety();
2062 return error;
2063}
2064
2012static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2065static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2013 void __user *buffer, 2066 void __user *buffer,
2014 size_t *lenp, loff_t *ppos, 2067 size_t *lenp, loff_t *ppos,
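
The sysctl hunks above route suid_dumpable and core_pattern through thin wrappers that call the generic proc handler and then re-check the combined configuration, warning when suid_dumpable=2 is paired with a relative, non-pipe core_pattern. A userspace sketch of that write-then-validate pattern (the warning text and setter names are illustrative):

/*
 * Userspace sketch: each setter performs the ordinary write and then runs
 * a shared validation hook over the combined state.
 */
#include <stdio.h>

static int suid_dumpable;
static char core_pattern[64] = "core";

static void validate_coredump_safety(void)
{
        if (suid_dumpable == 2 &&
            core_pattern[0] != '/' && core_pattern[0] != '|')
                fprintf(stderr, "warning: suid_dumpable=2 needs an absolute "
                                "path or pipe handler in core_pattern\n");
}

static int write_suid_dumpable(int val)         /* generic write + hook */
{
        suid_dumpable = val;
        validate_coredump_safety();
        return 0;
}

static int write_core_pattern(const char *val)
{
        snprintf(core_pattern, sizeof(core_pattern), "%s", val);
        validate_coredump_safety();
        return 0;
}

int main(void)
{
        write_core_pattern("core");     /* relative pattern */
        write_suid_dumpable(2);         /* triggers the warning */
        return 0;
}
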
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, 150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 82d1c794066d..d320d44903bd 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -3,82 +3,79 @@
3#include <linux/tracehook.h> 3#include <linux/tracehook.h>
4 4
5int 5int
6task_work_add(struct task_struct *task, struct task_work *twork, bool notify) 6task_work_add(struct task_struct *task, struct callback_head *twork, bool notify)
7{ 7{
8 struct callback_head *last, *first;
8 unsigned long flags; 9 unsigned long flags;
9 int err = -ESRCH;
10 10
11#ifndef TIF_NOTIFY_RESUME
12 if (notify)
13 return -ENOTSUPP;
14#endif
15 /* 11 /*
16 * We must not insert the new work if the task has already passed 12 * Not inserting the new work if the task has already passed
 17 * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() 13 * exit_task_work() is the responsibility of callers.
18 * and check PF_EXITING under pi_lock.
19 */ 14 */
20 raw_spin_lock_irqsave(&task->pi_lock, flags); 15 raw_spin_lock_irqsave(&task->pi_lock, flags);
21 if (likely(!(task->flags & PF_EXITING))) { 16 last = task->task_works;
22 hlist_add_head(&twork->hlist, &task->task_works); 17 first = last ? last->next : twork;
23 err = 0; 18 twork->next = first;
24 } 19 if (last)
20 last->next = twork;
21 task->task_works = twork;
25 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 22 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
26 23
27 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ 24 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
28 if (likely(!err) && notify) 25 if (notify)
29 set_notify_resume(task); 26 set_notify_resume(task);
30 return err; 27 return 0;
31} 28}
32 29
33struct task_work * 30struct callback_head *
34task_work_cancel(struct task_struct *task, task_work_func_t func) 31task_work_cancel(struct task_struct *task, task_work_func_t func)
35{ 32{
36 unsigned long flags; 33 unsigned long flags;
37 struct task_work *twork; 34 struct callback_head *last, *res = NULL;
38 struct hlist_node *pos;
39 35
40 raw_spin_lock_irqsave(&task->pi_lock, flags); 36 raw_spin_lock_irqsave(&task->pi_lock, flags);
41 hlist_for_each_entry(twork, pos, &task->task_works, hlist) { 37 last = task->task_works;
42 if (twork->func == func) { 38 if (last) {
43 hlist_del(&twork->hlist); 39 struct callback_head *q = last, *p = q->next;
44 goto found; 40 while (1) {
41 if (p->func == func) {
42 q->next = p->next;
43 if (p == last)
44 task->task_works = q == p ? NULL : q;
45 res = p;
46 break;
47 }
48 if (p == last)
49 break;
50 q = p;
51 p = q->next;
45 } 52 }
46 } 53 }
47 twork = NULL;
48 found:
49 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 54 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
50 55 return res;
51 return twork;
52} 56}
53 57
54void task_work_run(void) 58void task_work_run(void)
55{ 59{
56 struct task_struct *task = current; 60 struct task_struct *task = current;
57 struct hlist_head task_works; 61 struct callback_head *p, *q;
58 struct hlist_node *pos;
59 62
60 raw_spin_lock_irq(&task->pi_lock); 63 while (1) {
61 hlist_move_list(&task->task_works, &task_works); 64 raw_spin_lock_irq(&task->pi_lock);
62 raw_spin_unlock_irq(&task->pi_lock); 65 p = task->task_works;
66 task->task_works = NULL;
67 raw_spin_unlock_irq(&task->pi_lock);
63 68
64 if (unlikely(hlist_empty(&task_works))) 69 if (unlikely(!p))
65 return; 70 return;
66 /*
67 * We use hlist to save the space in task_struct, but we want fifo.
68 * Find the last entry, the list should be short, then process them
69 * in reverse order.
70 */
71 for (pos = task_works.first; pos->next; pos = pos->next)
72 ;
73 71
74 for (;;) { 72 q = p->next; /* head */
75 struct hlist_node **pprev = pos->pprev; 73 p->next = NULL; /* cut it */
76 struct task_work *twork = container_of(pos, struct task_work, 74 while (q) {
77 hlist); 75 p = q->next;
78 twork->func(twork); 76 q->func(q);
79 77 q = p;
80 if (pprev == &task_works.first) 78 cond_resched();
81 break; 79 }
82 pos = container_of(pprev, struct hlist_node, next);
83 } 80 }
84} 81}
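
The rewritten task_work.c above stores the pending works as a circular singly-linked list addressed by its tail: task->task_works points at the most recently added entry and that entry's ->next points at the oldest, so both appending and draining the whole queue in FIFO order are O(1) pointer operations under pi_lock. A userspace sketch of that list layout (locking elided; names illustrative):

/*
 * Userspace sketch of a circular singly-linked list addressed by its tail:
 * O(1) append, O(1) grab-the-whole-queue, FIFO run order.
 */
#include <stddef.h>
#include <stdio.h>

struct work {
        struct work *next;
        void (*func)(struct work *);
};

static struct work *queue_tail;         /* points at the last queued work */

static void add_work(struct work *w)
{
        struct work *last = queue_tail;

        w->next = last ? last->next : w;        /* new tail points at head */
        if (last)
                last->next = w;
        queue_tail = w;
}

static void run_all(void)
{
        struct work *tail = queue_tail, *p;

        if (!tail)
                return;
        queue_tail = NULL;
        p = tail->next;         /* head of the FIFO */
        tail->next = NULL;      /* cut the circle so the walk terminates */
        while (p) {
                struct work *next = p->next;

                p->func(p);
                p = next;
        }
}

static void say(struct work *w) { printf("ran %p\n", (void *)w); }

int main(void)
{
        struct work a = { .func = say }, b = { .func = say };

        add_work(&a);
        add_work(&b);
        run_all();              /* runs a then b (FIFO) */
        return 0;
}
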
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4f..d0a32796550f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
436 436
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
439 if (na == NULL) {
440 rc = -EMSGSIZE;
441 goto err;
442 }
443
439 stats = nla_data(na); 444 stats = nla_data(na);
440 memset(stats, 0, sizeof(*stats)); 445 memset(stats, 0, sizeof(*stats));
441 446
diff --git a/kernel/time.c b/kernel/time.c
index ba744cf80696..d226c6a3fd28 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -30,7 +30,7 @@
30#include <linux/export.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/timekeeper_internal.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd452b75..8601f0db1261 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA
16config GENERIC_TIME_VSYSCALL 16config GENERIC_TIME_VSYSCALL
17 bool 17 bool
18 18
19# Timekeeping vsyscall support
20config GENERIC_TIME_VSYSCALL_OLD
21 bool
22
19# ktime_t scalar 64bit nsec representation 23# ktime_t scalar 64bit nsec representation
20config KTIME_SCALAR 24config KTIME_SCALAR
21 bool 25 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index aa27d391bfc8..f11d83b12949 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
37static struct alarm_base { 37static struct alarm_base {
38 spinlock_t lock; 38 spinlock_t lock;
39 struct timerqueue_head timerqueue; 39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void); 40 ktime_t (*gettime)(void);
42 clockid_t base_clockid; 41 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE]; 42} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
46static ktime_t freezer_delta; 45static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock); 46static DEFINE_SPINLOCK(freezer_delta_lock);
48 47
48static struct wakeup_source *ws;
49
49#ifdef CONFIG_RTC_CLASS 50#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */ 51/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer; 52static struct rtc_timer rtctimer;
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }
130 * @base: pointer to the base where the timer is being run 131 * @base: pointer to the base where the timer is being run
131 * @alarm: pointer to alarm being enqueued. 132 * @alarm: pointer to alarm being enqueued.
132 * 133 *
133 * Adds alarm to an alarm_base timerqueue and if necessary sets 134 * Adds alarm to an alarm_base timerqueue
134 * an hrtimer to run.
135 * 135 *
136 * Must hold base->lock when calling. 136 * Must hold base->lock when calling.
137 */ 137 */
138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
139{ 139{
140 if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
141 timerqueue_del(&base->timerqueue, &alarm->node);
142
140 timerqueue_add(&base->timerqueue, &alarm->node); 143 timerqueue_add(&base->timerqueue, &alarm->node);
141 alarm->state |= ALARMTIMER_STATE_ENQUEUED; 144 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
142
143 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
144 hrtimer_try_to_cancel(&base->timer);
145 hrtimer_start(&base->timer, alarm->node.expires,
146 HRTIMER_MODE_ABS);
147 }
148} 145}
149 146
150/** 147/**
151 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue 148 * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
152 * @base: pointer to the base where the timer is running 149 * @base: pointer to the base where the timer is running
153 * @alarm: pointer to alarm being removed 150 * @alarm: pointer to alarm being removed
154 * 151 *
155 * Removes an alarm from an alarm_base timerqueue and if necessary sets 152 * Removes an alarm from an alarm_base timerqueue
156 * a new timer to run.
157 * 153 *
158 * Must hold base->lock when calling. 154 * Must hold base->lock when calling.
159 */ 155 */
160static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) 156static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
161{ 157{
162 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
163
164 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) 158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
165 return; 159 return;
166 160
167 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
168 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; 162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
169
170 if (next == &alarm->node) {
171 hrtimer_try_to_cancel(&base->timer);
172 next = timerqueue_getnext(&base->timerqueue);
173 if (!next)
174 return;
175 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
176 }
177} 163}
178 164
179 165
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
188 */ 174 */
189static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) 175static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
190{ 176{
191 struct alarm_base *base = container_of(timer, struct alarm_base, timer); 177 struct alarm *alarm = container_of(timer, struct alarm, timer);
192 struct timerqueue_node *next; 178 struct alarm_base *base = &alarm_bases[alarm->type];
193 unsigned long flags; 179 unsigned long flags;
194 ktime_t now;
195 int ret = HRTIMER_NORESTART; 180 int ret = HRTIMER_NORESTART;
196 int restart = ALARMTIMER_NORESTART; 181 int restart = ALARMTIMER_NORESTART;
197 182
198 spin_lock_irqsave(&base->lock, flags); 183 spin_lock_irqsave(&base->lock, flags);
199 now = base->gettime(); 184 alarmtimer_dequeue(base, alarm);
200 while ((next = timerqueue_getnext(&base->timerqueue))) { 185 spin_unlock_irqrestore(&base->lock, flags);
201 struct alarm *alarm;
202 ktime_t expired = next->expires;
203
204 if (expired.tv64 > now.tv64)
205 break;
206
207 alarm = container_of(next, struct alarm, node);
208
209 timerqueue_del(&base->timerqueue, &alarm->node);
210 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
211
212 alarm->state |= ALARMTIMER_STATE_CALLBACK;
213 spin_unlock_irqrestore(&base->lock, flags);
214 if (alarm->function)
215 restart = alarm->function(alarm, now);
216 spin_lock_irqsave(&base->lock, flags);
217 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
218 186
219 if (restart != ALARMTIMER_NORESTART) { 187 if (alarm->function)
220 timerqueue_add(&base->timerqueue, &alarm->node); 188 restart = alarm->function(alarm, base->gettime());
221 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
222 }
223 }
224 189
225 if (next) { 190 spin_lock_irqsave(&base->lock, flags);
226 hrtimer_set_expires(&base->timer, next->expires); 191 if (restart != ALARMTIMER_NORESTART) {
192 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
193 alarmtimer_enqueue(base, alarm);
227 ret = HRTIMER_RESTART; 194 ret = HRTIMER_RESTART;
228 } 195 }
229 spin_unlock_irqrestore(&base->lock, flags); 196 spin_unlock_irqrestore(&base->lock, flags);
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)
250 unsigned long flags; 217 unsigned long flags;
251 struct rtc_device *rtc; 218 struct rtc_device *rtc;
252 int i; 219 int i;
220 int ret;
253 221
254 spin_lock_irqsave(&freezer_delta_lock, flags); 222 spin_lock_irqsave(&freezer_delta_lock, flags);
255 min = freezer_delta; 223 min = freezer_delta;
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)
279 if (min.tv64 == 0) 247 if (min.tv64 == 0)
280 return 0; 248 return 0;
281 249
282 /* XXX - Should we enforce a minimum sleep time? */ 250 if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
283 WARN_ON(min.tv64 < NSEC_PER_SEC); 251 __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
252 return -EBUSY;
253 }
284 254
285 /* Setup an rtc timer to fire that far in the future */ 255 /* Setup an rtc timer to fire that far in the future */
286 rtc_timer_cancel(rtc, &rtctimer); 256 rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)
288 now = rtc_tm_to_ktime(tm); 258 now = rtc_tm_to_ktime(tm);
289 now = ktime_add(now, min); 259 now = ktime_add(now, min);
290 260
291 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); 261 /* Set alarm; if it is in the past, briefly reject suspend so it gets handled */
292 262 ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
293 return 0; 263 if (ret < 0)
264 __pm_wakeup_event(ws, MSEC_PER_SEC);
265 return ret;
294} 266}
295#else 267#else
296static int alarmtimer_suspend(struct device *dev) 268static int alarmtimer_suspend(struct device *dev)
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
324 enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) 296 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
325{ 297{
326 timerqueue_init(&alarm->node); 298 timerqueue_init(&alarm->node);
299 hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
300 HRTIMER_MODE_ABS);
301 alarm->timer.function = alarmtimer_fired;
327 alarm->function = function; 302 alarm->function = function;
328 alarm->type = type; 303 alarm->type = type;
329 alarm->state = ALARMTIMER_STATE_INACTIVE; 304 alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
334 * @alarm: ptr to alarm to set 309 * @alarm: ptr to alarm to set
335 * @start: time to run the alarm 310 * @start: time to run the alarm
336 */ 311 */
337void alarm_start(struct alarm *alarm, ktime_t start) 312int alarm_start(struct alarm *alarm, ktime_t start)
338{ 313{
339 struct alarm_base *base = &alarm_bases[alarm->type]; 314 struct alarm_base *base = &alarm_bases[alarm->type];
340 unsigned long flags; 315 unsigned long flags;
316 int ret;
341 317
342 spin_lock_irqsave(&base->lock, flags); 318 spin_lock_irqsave(&base->lock, flags);
343 if (alarmtimer_active(alarm))
344 alarmtimer_remove(base, alarm);
345 alarm->node.expires = start; 319 alarm->node.expires = start;
346 alarmtimer_enqueue(base, alarm); 320 alarmtimer_enqueue(base, alarm);
321 ret = hrtimer_start(&alarm->timer, alarm->node.expires,
322 HRTIMER_MODE_ABS);
347 spin_unlock_irqrestore(&base->lock, flags); 323 spin_unlock_irqrestore(&base->lock, flags);
324 return ret;
348} 325}
349 326
350/** 327/**
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)
358{ 335{
359 struct alarm_base *base = &alarm_bases[alarm->type]; 336 struct alarm_base *base = &alarm_bases[alarm->type];
360 unsigned long flags; 337 unsigned long flags;
361 int ret = -1; 338 int ret;
362 spin_lock_irqsave(&base->lock, flags);
363
364 if (alarmtimer_callback_running(alarm))
365 goto out;
366 339
367 if (alarmtimer_is_queued(alarm)) { 340 spin_lock_irqsave(&base->lock, flags);
368 alarmtimer_remove(base, alarm); 341 ret = hrtimer_try_to_cancel(&alarm->timer);
369 ret = 1; 342 if (ret >= 0)
370 } else 343 alarmtimer_dequeue(base, alarm);
371 ret = 0;
372out:
373 spin_unlock_irqrestore(&base->lock, flags); 344 spin_unlock_irqrestore(&base->lock, flags);
374 return ret; 345 return ret;
375} 346}
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)
802 for (i = 0; i < ALARM_NUMTYPE; i++) { 773 for (i = 0; i < ALARM_NUMTYPE; i++) {
803 timerqueue_init_head(&alarm_bases[i].timerqueue); 774 timerqueue_init_head(&alarm_bases[i].timerqueue);
804 spin_lock_init(&alarm_bases[i].lock); 775 spin_lock_init(&alarm_bases[i].lock);
805 hrtimer_init(&alarm_bases[i].timer,
806 alarm_bases[i].base_clockid,
807 HRTIMER_MODE_ABS);
808 alarm_bases[i].timer.function = alarmtimer_fired;
809 } 776 }
810 777
811 error = alarmtimer_rtc_interface_setup(); 778 error = alarmtimer_rtc_interface_setup();
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)
821 error = PTR_ERR(pdev); 788 error = PTR_ERR(pdev);
822 goto out_drv; 789 goto out_drv;
823 } 790 }
791 ws = wakeup_source_register("alarmtimer");
824 return 0; 792 return 0;
825 793
826out_drv: 794out_drv:
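
Because each struct alarm now carries its own hrtimer (set up in alarm_init() above), alarmtimer_enqueue() no longer reprograms a shared per-base timer; its only job is to keep the timerqueue consistent, deleting the node first if it was already queued so a re-armed alarm cannot appear twice. A self-contained sketch of that guard, using a sorted singly linked list as a stand-in for the kernel timerqueue (the names and the sorted-insert helper are illustrative only):

#include <stdio.h>

#define STATE_ENQUEUED	0x01

struct node {
	struct node *next;
	long long expires;
	int state;
};

static struct node *queue;			/* stand-in for timerqueue_head */

static void queue_del(struct node *n)
{
	struct node **pp = &queue;

	while (*pp && *pp != n)
		pp = &(*pp)->next;
	if (*pp)
		*pp = n->next;
}

static void queue_add(struct node *n)		/* keep earliest expiry first */
{
	struct node **pp = &queue;

	while (*pp && (*pp)->expires <= n->expires)
		pp = &(*pp)->next;
	n->next = *pp;
	*pp = n;
}

/* mirrors the new alarmtimer_enqueue(): safe against double enqueue */
static void enqueue(struct node *n, long long expires)
{
	if (n->state & STATE_ENQUEUED)
		queue_del(n);
	n->expires = expires;
	queue_add(n);
	n->state |= STATE_ENQUEUED;
}

int main(void)
{
	struct node a = { 0 }, b = { 0 };

	enqueue(&a, 100);
	enqueue(&b, 50);
	enqueue(&a, 10);	/* re-arm earlier; the stale entry is removed first */
	for (struct node *n = queue; n; n = n->next)
		printf("%lld\n", n->expires);	/* prints 10, then 50 */
	return 0;
}
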
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a470154e0408..6629bf7b5285 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
@@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
95{ 95{
96 return &clocksource_jiffies; 96 return &clocksource_jiffies;
97} 97}
98
99struct clocksource refined_jiffies;
100
101int register_refined_jiffies(long cycles_per_second)
102{
103 u64 nsec_per_tick, shift_hz;
104 long cycles_per_tick;
105
106
107
108 refined_jiffies = clocksource_jiffies;
109 refined_jiffies.name = "refined-jiffies";
110 refined_jiffies.rating++;
111
112 /* Calc cycles per tick */
113 cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
114 /* shift_hz stores hz<<8 for extra accuracy */
115 shift_hz = (u64)cycles_per_second << 8;
116 shift_hz += cycles_per_tick/2;
117 do_div(shift_hz, cycles_per_tick);
118 /* Calculate nsec_per_tick using shift_hz */
119 nsec_per_tick = (u64)NSEC_PER_SEC << 8;
120 nsec_per_tick += (u32)shift_hz/2;
121 do_div(nsec_per_tick, (u32)shift_hz);
122
123 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
124
125 clocksource_register(&refined_jiffies);
126 return 0;
127}
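
register_refined_jiffies() above builds a second jiffies clocksource whose mult reflects the real interrupt rate rather than the rounded NSEC_PER_JIFFY. The same arithmetic can be run stand-alone; the HZ value and the 1193182 Hz rate of the classic PC i8253 PIT are only illustrative inputs, and JIFFIES_SHIFT is assumed to be 8 as for the jiffies clocksource:

#include <stdio.h>
#include <stdint.h>

#define HZ		250
#define NSEC_PER_SEC	1000000000LL
#define JIFFIES_SHIFT	8		/* assumed to match clocksource_jiffies */

int main(void)
{
	long cycles_per_second = 1193182;	/* e.g. the i8253 PIT rate */
	long cycles_per_tick = (cycles_per_second + HZ / 2) / HZ;

	/* shift_hz stores hz << 8 for extra accuracy, rounded to nearest */
	uint64_t shift_hz = ((uint64_t)cycles_per_second << 8) + cycles_per_tick / 2;
	shift_hz /= cycles_per_tick;

	/* nanoseconds per tick derived from shift_hz, rounded to nearest */
	uint64_t nsec_per_tick = ((uint64_t)NSEC_PER_SEC << 8) + shift_hz / 2;
	nsec_per_tick /= shift_hz;

	uint32_t mult = (uint32_t)nsec_per_tick << JIFFIES_SHIFT;

	printf("coarse NSEC_PER_JIFFY:    %lld ns\n", (NSEC_PER_SEC + HZ / 2) / HZ);
	printf("refined nsec per tick:    %llu ns\n", (unsigned long long)nsec_per_tick);
	printf("refined clocksource mult: %u\n", mult);
	return 0;
}

With these inputs the coarse value is 4000000 ns per tick while the refined calculation lands slightly above it, which is exactly the drift the refined-jiffies clocksource is meant to absorb.
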
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b7fbadc5c973..24174b4d669b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock);
28/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
29unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
30 30
31/* ACTHZ period (nsecs): */ 31/* SHIFTED_HZ period (nsecs): */
32unsigned long tick_nsec; 32unsigned long tick_nsec;
33 33
34static u64 tick_length; 34static u64 tick_length;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 41be02250e08..024540f97f74 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void)
105/* 105/*
106 * NO HZ enabled ? 106 * NO HZ enabled ?
107 */ 107 */
108static int tick_nohz_enabled __read_mostly = 1; 108int tick_nohz_enabled __read_mostly = 1;
109 109
110/* 110/*
111 * Enable / Disable tickless mode 111 * Enable / Disable tickless mode
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f045cc50832d..16280ff3cf82 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/timekeeper_internal.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
13#include <linux/percpu.h> 14#include <linux/percpu.h>
@@ -21,61 +22,6 @@
21#include <linux/tick.h> 22#include <linux/tick.h>
22#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
23 24
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
30 /* The shift value of the current clocksource. */
31 u32 shift;
32 /* Number of clock cycles in one NTP interval. */
33 cycle_t cycle_interval;
34 /* Number of clock shifted nano seconds in one NTP interval. */
35 u64 xtime_interval;
36 /* shifted nano seconds left over when rounding cycle_interval */
37 s64 xtime_remainder;
38 /* Raw nano seconds accumulated per NTP interval. */
39 u32 raw_interval;
40
41 /* Current CLOCK_REALTIME time in seconds */
42 u64 xtime_sec;
43 /* Clock shifted nano seconds */
44 u64 xtime_nsec;
45
46 /* Difference between accumulated time and NTP time in ntp
47 * shifted nano seconds. */
48 s64 ntp_error;
49 /* Shift conversion between clock shifted nano seconds and
50 * ntp shifted nano seconds. */
51 u32 ntp_error_shift;
52
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72 /* Offset clock monotonic -> clock realtime */
73 ktime_t offs_real;
74 /* Offset clock monotonic -> clock boottime */
75 ktime_t offs_boot;
76 /* Seqlock for all timekeeper values */
77 seqlock_t lock;
78};
79 25
80static struct timekeeper timekeeper; 26static struct timekeeper timekeeper;
81 27
@@ -96,25 +42,42 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
96 } 42 }
97} 43}
98 44
99static struct timespec tk_xtime(struct timekeeper *tk)
100{
101 struct timespec ts;
102
103 ts.tv_sec = tk->xtime_sec;
104 ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
105 return ts;
106}
107
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 45static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{ 46{
110 tk->xtime_sec = ts->tv_sec; 47 tk->xtime_sec = ts->tv_sec;
111 tk->xtime_nsec = ts->tv_nsec << tk->shift; 48 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
112} 49}
113 50
114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 51static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
115{ 52{
116 tk->xtime_sec += ts->tv_sec; 53 tk->xtime_sec += ts->tv_sec;
117 tk->xtime_nsec += ts->tv_nsec << tk->shift; 54 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
55 tk_normalize_xtime(tk);
56}
57
58static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
59{
60 struct timespec tmp;
61
62 /*
63 * Verify consistency of: offset_real = -wall_to_monotonic
64 * before modifying anything
65 */
66 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
67 -tk->wall_to_monotonic.tv_nsec);
68 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
69 tk->wall_to_monotonic = wtm;
70 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
71 tk->offs_real = timespec_to_ktime(tmp);
72}
73
74static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
75{
76 /* Verify consistency before modifying */
77 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
78
79 tk->total_sleep_time = t;
80 tk->offs_boot = timespec_to_ktime(t);
118} 81}
119 82
120/** 83/**
@@ -217,29 +180,16 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
217 return nsec + arch_gettimeoffset(); 180 return nsec + arch_gettimeoffset();
218} 181}
219 182
220static void update_rt_offset(struct timekeeper *tk)
221{
222 struct timespec tmp, *wtm = &tk->wall_to_monotonic;
223
224 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
225 tk->offs_real = timespec_to_ktime(tmp);
226}
227
228/* must hold write on timekeeper.lock */ 183/* must hold write on timekeeper.lock */
229static void timekeeping_update(struct timekeeper *tk, bool clearntp) 184static void timekeeping_update(struct timekeeper *tk, bool clearntp)
230{ 185{
231 struct timespec xt;
232
233 if (clearntp) { 186 if (clearntp) {
234 tk->ntp_error = 0; 187 tk->ntp_error = 0;
235 ntp_clear(); 188 ntp_clear();
236 } 189 }
237 update_rt_offset(tk); 190 update_vsyscall(tk);
238 xt = tk_xtime(tk);
239 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
240} 191}
241 192
242
243/** 193/**
244 * timekeeping_forward_now - update clock to the current time 194 * timekeeping_forward_now - update clock to the current time
245 * 195 *
@@ -261,7 +211,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
261 tk->xtime_nsec += cycle_delta * tk->mult; 211 tk->xtime_nsec += cycle_delta * tk->mult;
262 212
263 /* If arch requires, add in gettimeoffset() */ 213 /* If arch requires, add in gettimeoffset() */
264 tk->xtime_nsec += arch_gettimeoffset() << tk->shift; 214 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
265 215
266 tk_normalize_xtime(tk); 216 tk_normalize_xtime(tk);
267 217
@@ -277,38 +227,39 @@ static void timekeeping_forward_now(struct timekeeper *tk)
277 */ 227 */
278void getnstimeofday(struct timespec *ts) 228void getnstimeofday(struct timespec *ts)
279{ 229{
230 struct timekeeper *tk = &timekeeper;
280 unsigned long seq; 231 unsigned long seq;
281 s64 nsecs = 0; 232 s64 nsecs = 0;
282 233
283 WARN_ON(timekeeping_suspended); 234 WARN_ON(timekeeping_suspended);
284 235
285 do { 236 do {
286 seq = read_seqbegin(&timekeeper.lock); 237 seq = read_seqbegin(&tk->lock);
287 238
288 ts->tv_sec = timekeeper.xtime_sec; 239 ts->tv_sec = tk->xtime_sec;
289 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 240 nsecs = timekeeping_get_ns(tk);
290 241
291 } while (read_seqretry(&timekeeper.lock, seq)); 242 } while (read_seqretry(&tk->lock, seq));
292 243
244 ts->tv_nsec = 0;
293 timespec_add_ns(ts, nsecs); 245 timespec_add_ns(ts, nsecs);
294} 246}
295EXPORT_SYMBOL(getnstimeofday); 247EXPORT_SYMBOL(getnstimeofday);
296 248
297ktime_t ktime_get(void) 249ktime_t ktime_get(void)
298{ 250{
251 struct timekeeper *tk = &timekeeper;
299 unsigned int seq; 252 unsigned int seq;
300 s64 secs, nsecs; 253 s64 secs, nsecs;
301 254
302 WARN_ON(timekeeping_suspended); 255 WARN_ON(timekeeping_suspended);
303 256
304 do { 257 do {
305 seq = read_seqbegin(&timekeeper.lock); 258 seq = read_seqbegin(&tk->lock);
306 secs = timekeeper.xtime_sec + 259 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
307 timekeeper.wall_to_monotonic.tv_sec; 260 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
308 nsecs = timekeeping_get_ns(&timekeeper) +
309 timekeeper.wall_to_monotonic.tv_nsec;
310 261
311 } while (read_seqretry(&timekeeper.lock, seq)); 262 } while (read_seqretry(&tk->lock, seq));
312 /* 263 /*
313 * Use ktime_set/ktime_add_ns to create a proper ktime on 264 * Use ktime_set/ktime_add_ns to create a proper ktime on
314 * 32-bit architectures without CONFIG_KTIME_SCALAR. 265 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -327,21 +278,24 @@ EXPORT_SYMBOL_GPL(ktime_get);
327 */ 278 */
328void ktime_get_ts(struct timespec *ts) 279void ktime_get_ts(struct timespec *ts)
329{ 280{
281 struct timekeeper *tk = &timekeeper;
330 struct timespec tomono; 282 struct timespec tomono;
283 s64 nsec;
331 unsigned int seq; 284 unsigned int seq;
332 285
333 WARN_ON(timekeeping_suspended); 286 WARN_ON(timekeeping_suspended);
334 287
335 do { 288 do {
336 seq = read_seqbegin(&timekeeper.lock); 289 seq = read_seqbegin(&tk->lock);
337 ts->tv_sec = timekeeper.xtime_sec; 290 ts->tv_sec = tk->xtime_sec;
338 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 291 nsec = timekeeping_get_ns(tk);
339 tomono = timekeeper.wall_to_monotonic; 292 tomono = tk->wall_to_monotonic;
340 293
341 } while (read_seqretry(&timekeeper.lock, seq)); 294 } while (read_seqretry(&tk->lock, seq));
342 295
343 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 296 ts->tv_sec += tomono.tv_sec;
344 ts->tv_nsec + tomono.tv_nsec); 297 ts->tv_nsec = 0;
298 timespec_add_ns(ts, nsec + tomono.tv_nsec);
345} 299}
346EXPORT_SYMBOL_GPL(ktime_get_ts); 300EXPORT_SYMBOL_GPL(ktime_get_ts);
347 301
@@ -358,22 +312,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
358 */ 312 */
359void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 313void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
360{ 314{
315 struct timekeeper *tk = &timekeeper;
361 unsigned long seq; 316 unsigned long seq;
362 s64 nsecs_raw, nsecs_real; 317 s64 nsecs_raw, nsecs_real;
363 318
364 WARN_ON_ONCE(timekeeping_suspended); 319 WARN_ON_ONCE(timekeeping_suspended);
365 320
366 do { 321 do {
367 seq = read_seqbegin(&timekeeper.lock); 322 seq = read_seqbegin(&tk->lock);
368 323
369 *ts_raw = timekeeper.raw_time; 324 *ts_raw = tk->raw_time;
370 ts_real->tv_sec = timekeeper.xtime_sec; 325 ts_real->tv_sec = tk->xtime_sec;
371 ts_real->tv_nsec = 0; 326 ts_real->tv_nsec = 0;
372 327
373 nsecs_raw = timekeeping_get_ns_raw(&timekeeper); 328 nsecs_raw = timekeeping_get_ns_raw(tk);
374 nsecs_real = timekeeping_get_ns(&timekeeper); 329 nsecs_real = timekeeping_get_ns(tk);
375 330
376 } while (read_seqretry(&timekeeper.lock, seq)); 331 } while (read_seqretry(&tk->lock, seq));
377 332
378 timespec_add_ns(ts_raw, nsecs_raw); 333 timespec_add_ns(ts_raw, nsecs_raw);
379 timespec_add_ns(ts_real, nsecs_real); 334 timespec_add_ns(ts_real, nsecs_real);
@@ -406,28 +361,28 @@ EXPORT_SYMBOL(do_gettimeofday);
406 */ 361 */
407int do_settimeofday(const struct timespec *tv) 362int do_settimeofday(const struct timespec *tv)
408{ 363{
364 struct timekeeper *tk = &timekeeper;
409 struct timespec ts_delta, xt; 365 struct timespec ts_delta, xt;
410 unsigned long flags; 366 unsigned long flags;
411 367
412 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 368 if (!timespec_valid_strict(tv))
413 return -EINVAL; 369 return -EINVAL;
414 370
415 write_seqlock_irqsave(&timekeeper.lock, flags); 371 write_seqlock_irqsave(&tk->lock, flags);
416 372
417 timekeeping_forward_now(&timekeeper); 373 timekeeping_forward_now(tk);
418 374
419 xt = tk_xtime(&timekeeper); 375 xt = tk_xtime(tk);
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 376 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 377 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
422 378
423 timekeeper.wall_to_monotonic = 379 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
424 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
425 380
426 tk_set_xtime(&timekeeper, tv); 381 tk_set_xtime(tk, tv);
427 382
428 timekeeping_update(&timekeeper, true); 383 timekeeping_update(tk, true);
429 384
430 write_sequnlock_irqrestore(&timekeeper.lock, flags); 385 write_sequnlock_irqrestore(&tk->lock, flags);
431 386
432 /* signal hrtimers about time change */ 387 /* signal hrtimers about time change */
433 clock_was_set(); 388 clock_was_set();
@@ -436,7 +391,6 @@ int do_settimeofday(const struct timespec *tv)
436} 391}
437EXPORT_SYMBOL(do_settimeofday); 392EXPORT_SYMBOL(do_settimeofday);
438 393
439
440/** 394/**
441 * timekeeping_inject_offset - Adds or subtracts from the current time. 395 * timekeeping_inject_offset - Adds or subtracts from the current time.
442 * @tv: pointer to the timespec variable containing the offset 396 * @tv: pointer to the timespec variable containing the offset
@@ -445,28 +399,37 @@ EXPORT_SYMBOL(do_settimeofday);
445 */ 399 */
446int timekeeping_inject_offset(struct timespec *ts) 400int timekeeping_inject_offset(struct timespec *ts)
447{ 401{
402 struct timekeeper *tk = &timekeeper;
448 unsigned long flags; 403 unsigned long flags;
404 struct timespec tmp;
405 int ret = 0;
449 406
450 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 407 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
451 return -EINVAL; 408 return -EINVAL;
452 409
453 write_seqlock_irqsave(&timekeeper.lock, flags); 410 write_seqlock_irqsave(&tk->lock, flags);
454 411
455 timekeeping_forward_now(&timekeeper); 412 timekeeping_forward_now(tk);
456 413
414 /* Make sure the proposed value is valid */
415 tmp = timespec_add(tk_xtime(tk), *ts);
416 if (!timespec_valid_strict(&tmp)) {
417 ret = -EINVAL;
418 goto error;
419 }
457 420
458 tk_xtime_add(&timekeeper, ts); 421 tk_xtime_add(tk, ts);
459 timekeeper.wall_to_monotonic = 422 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
460 timespec_sub(timekeeper.wall_to_monotonic, *ts);
461 423
462 timekeeping_update(&timekeeper, true); 424error: /* even if we error out, we forwarded the time, so call update */
425 timekeeping_update(tk, true);
463 426
464 write_sequnlock_irqrestore(&timekeeper.lock, flags); 427 write_sequnlock_irqrestore(&tk->lock, flags);
465 428
466 /* signal hrtimers about time change */ 429 /* signal hrtimers about time change */
467 clock_was_set(); 430 clock_was_set();
468 431
469 return 0; 432 return ret;
470} 433}
471EXPORT_SYMBOL(timekeeping_inject_offset); 434EXPORT_SYMBOL(timekeeping_inject_offset);
472 435
@@ -477,23 +440,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
477 */ 440 */
478static int change_clocksource(void *data) 441static int change_clocksource(void *data)
479{ 442{
443 struct timekeeper *tk = &timekeeper;
480 struct clocksource *new, *old; 444 struct clocksource *new, *old;
481 unsigned long flags; 445 unsigned long flags;
482 446
483 new = (struct clocksource *) data; 447 new = (struct clocksource *) data;
484 448
485 write_seqlock_irqsave(&timekeeper.lock, flags); 449 write_seqlock_irqsave(&tk->lock, flags);
486 450
487 timekeeping_forward_now(&timekeeper); 451 timekeeping_forward_now(tk);
488 if (!new->enable || new->enable(new) == 0) { 452 if (!new->enable || new->enable(new) == 0) {
489 old = timekeeper.clock; 453 old = tk->clock;
490 tk_setup_internals(&timekeeper, new); 454 tk_setup_internals(tk, new);
491 if (old->disable) 455 if (old->disable)
492 old->disable(old); 456 old->disable(old);
493 } 457 }
494 timekeeping_update(&timekeeper, true); 458 timekeeping_update(tk, true);
495 459
496 write_sequnlock_irqrestore(&timekeeper.lock, flags); 460 write_sequnlock_irqrestore(&tk->lock, flags);
497 461
498 return 0; 462 return 0;
499} 463}
@@ -507,7 +471,9 @@ static int change_clocksource(void *data)
507 */ 471 */
508void timekeeping_notify(struct clocksource *clock) 472void timekeeping_notify(struct clocksource *clock)
509{ 473{
510 if (timekeeper.clock == clock) 474 struct timekeeper *tk = &timekeeper;
475
476 if (tk->clock == clock)
511 return; 477 return;
512 stop_machine(change_clocksource, clock, NULL); 478 stop_machine(change_clocksource, clock, NULL);
513 tick_clock_notify(); 479 tick_clock_notify();
@@ -536,35 +502,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
536 */ 502 */
537void getrawmonotonic(struct timespec *ts) 503void getrawmonotonic(struct timespec *ts)
538{ 504{
505 struct timekeeper *tk = &timekeeper;
539 unsigned long seq; 506 unsigned long seq;
540 s64 nsecs; 507 s64 nsecs;
541 508
542 do { 509 do {
543 seq = read_seqbegin(&timekeeper.lock); 510 seq = read_seqbegin(&tk->lock);
544 nsecs = timekeeping_get_ns_raw(&timekeeper); 511 nsecs = timekeeping_get_ns_raw(tk);
545 *ts = timekeeper.raw_time; 512 *ts = tk->raw_time;
546 513
547 } while (read_seqretry(&timekeeper.lock, seq)); 514 } while (read_seqretry(&tk->lock, seq));
548 515
549 timespec_add_ns(ts, nsecs); 516 timespec_add_ns(ts, nsecs);
550} 517}
551EXPORT_SYMBOL(getrawmonotonic); 518EXPORT_SYMBOL(getrawmonotonic);
552 519
553
554/** 520/**
555 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 521 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
556 */ 522 */
557int timekeeping_valid_for_hres(void) 523int timekeeping_valid_for_hres(void)
558{ 524{
525 struct timekeeper *tk = &timekeeper;
559 unsigned long seq; 526 unsigned long seq;
560 int ret; 527 int ret;
561 528
562 do { 529 do {
563 seq = read_seqbegin(&timekeeper.lock); 530 seq = read_seqbegin(&tk->lock);
564 531
565 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 532 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
566 533
567 } while (read_seqretry(&timekeeper.lock, seq)); 534 } while (read_seqretry(&tk->lock, seq));
568 535
569 return ret; 536 return ret;
570} 537}
@@ -574,15 +541,16 @@ int timekeeping_valid_for_hres(void)
574 */ 541 */
575u64 timekeeping_max_deferment(void) 542u64 timekeeping_max_deferment(void)
576{ 543{
544 struct timekeeper *tk = &timekeeper;
577 unsigned long seq; 545 unsigned long seq;
578 u64 ret; 546 u64 ret;
579 547
580 do { 548 do {
581 seq = read_seqbegin(&timekeeper.lock); 549 seq = read_seqbegin(&tk->lock);
582 550
583 ret = timekeeper.clock->max_idle_ns; 551 ret = tk->clock->max_idle_ns;
584 552
585 } while (read_seqretry(&timekeeper.lock, seq)); 553 } while (read_seqretry(&tk->lock, seq));
586 554
587 return ret; 555 return ret;
588} 556}
@@ -622,46 +590,56 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
622 */ 590 */
623void __init timekeeping_init(void) 591void __init timekeeping_init(void)
624{ 592{
593 struct timekeeper *tk = &timekeeper;
625 struct clocksource *clock; 594 struct clocksource *clock;
626 unsigned long flags; 595 unsigned long flags;
627 struct timespec now, boot; 596 struct timespec now, boot, tmp;
628 597
629 read_persistent_clock(&now); 598 read_persistent_clock(&now);
599 if (!timespec_valid_strict(&now)) {
600 pr_warn("WARNING: Persistent clock returned invalid value!\n"
601 " Check your CMOS/BIOS settings.\n");
602 now.tv_sec = 0;
603 now.tv_nsec = 0;
604 }
605
630 read_boot_clock(&boot); 606 read_boot_clock(&boot);
607 if (!timespec_valid_strict(&boot)) {
608 pr_warn("WARNING: Boot clock returned invalid value!\n"
609 " Check your CMOS/BIOS settings.\n");
610 boot.tv_sec = 0;
611 boot.tv_nsec = 0;
612 }
631 613
632 seqlock_init(&timekeeper.lock); 614 seqlock_init(&tk->lock);
633 615
634 ntp_init(); 616 ntp_init();
635 617
636 write_seqlock_irqsave(&timekeeper.lock, flags); 618 write_seqlock_irqsave(&tk->lock, flags);
637 clock = clocksource_default_clock(); 619 clock = clocksource_default_clock();
638 if (clock->enable) 620 if (clock->enable)
639 clock->enable(clock); 621 clock->enable(clock);
640 tk_setup_internals(&timekeeper, clock); 622 tk_setup_internals(tk, clock);
641 623
642 tk_set_xtime(&timekeeper, &now); 624 tk_set_xtime(tk, &now);
643 timekeeper.raw_time.tv_sec = 0; 625 tk->raw_time.tv_sec = 0;
644 timekeeper.raw_time.tv_nsec = 0; 626 tk->raw_time.tv_nsec = 0;
645 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 627 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
646 boot = tk_xtime(&timekeeper); 628 boot = tk_xtime(tk);
647 629
648 set_normalized_timespec(&timekeeper.wall_to_monotonic, 630 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
649 -boot.tv_sec, -boot.tv_nsec); 631 tk_set_wall_to_mono(tk, tmp);
650 update_rt_offset(&timekeeper); 632
651 timekeeper.total_sleep_time.tv_sec = 0; 633 tmp.tv_sec = 0;
652 timekeeper.total_sleep_time.tv_nsec = 0; 634 tmp.tv_nsec = 0;
653 write_sequnlock_irqrestore(&timekeeper.lock, flags); 635 tk_set_sleep_time(tk, tmp);
636
637 write_sequnlock_irqrestore(&tk->lock, flags);
654} 638}
655 639
656/* time in seconds when suspend began */ 640/* time in seconds when suspend began */
657static struct timespec timekeeping_suspend_time; 641static struct timespec timekeeping_suspend_time;
658 642
659static void update_sleep_time(struct timespec t)
660{
661 timekeeper.total_sleep_time = t;
662 timekeeper.offs_boot = timespec_to_ktime(t);
663}
664
665/** 643/**
666 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 644 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
667 * @delta: pointer to a timespec delta value 645 * @delta: pointer to a timespec delta value
@@ -672,18 +650,16 @@ static void update_sleep_time(struct timespec t)
672static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 650static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
673 struct timespec *delta) 651 struct timespec *delta)
674{ 652{
675 if (!timespec_valid(delta)) { 653 if (!timespec_valid_strict(delta)) {
676 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 654 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
677 "sleep delta value!\n"); 655 "sleep delta value!\n");
678 return; 656 return;
679 } 657 }
680
681 tk_xtime_add(tk, delta); 658 tk_xtime_add(tk, delta);
682 tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); 659 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
683 update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); 660 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
684} 661}
685 662
686
687/** 663/**
688 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values 664 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
689 * @delta: pointer to a timespec delta value 665 * @delta: pointer to a timespec delta value
@@ -696,6 +672,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
696 */ 672 */
697void timekeeping_inject_sleeptime(struct timespec *delta) 673void timekeeping_inject_sleeptime(struct timespec *delta)
698{ 674{
675 struct timekeeper *tk = &timekeeper;
699 unsigned long flags; 676 unsigned long flags;
700 struct timespec ts; 677 struct timespec ts;
701 678
@@ -704,21 +681,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
704 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 681 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
705 return; 682 return;
706 683
707 write_seqlock_irqsave(&timekeeper.lock, flags); 684 write_seqlock_irqsave(&tk->lock, flags);
708 685
709 timekeeping_forward_now(&timekeeper); 686 timekeeping_forward_now(tk);
710 687
711 __timekeeping_inject_sleeptime(&timekeeper, delta); 688 __timekeeping_inject_sleeptime(tk, delta);
712 689
713 timekeeping_update(&timekeeper, true); 690 timekeeping_update(tk, true);
714 691
715 write_sequnlock_irqrestore(&timekeeper.lock, flags); 692 write_sequnlock_irqrestore(&tk->lock, flags);
716 693
717 /* signal hrtimers about time change */ 694 /* signal hrtimers about time change */
718 clock_was_set(); 695 clock_was_set();
719} 696}
720 697
721
722/** 698/**
723 * timekeeping_resume - Resumes the generic timekeeping subsystem. 699 * timekeeping_resume - Resumes the generic timekeeping subsystem.
724 * 700 *
@@ -728,6 +704,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
728 */ 704 */
729static void timekeeping_resume(void) 705static void timekeeping_resume(void)
730{ 706{
707 struct timekeeper *tk = &timekeeper;
731 unsigned long flags; 708 unsigned long flags;
732 struct timespec ts; 709 struct timespec ts;
733 710
@@ -735,18 +712,18 @@ static void timekeeping_resume(void)
735 712
736 clocksource_resume(); 713 clocksource_resume();
737 714
738 write_seqlock_irqsave(&timekeeper.lock, flags); 715 write_seqlock_irqsave(&tk->lock, flags);
739 716
740 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 717 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
741 ts = timespec_sub(ts, timekeeping_suspend_time); 718 ts = timespec_sub(ts, timekeeping_suspend_time);
742 __timekeeping_inject_sleeptime(&timekeeper, &ts); 719 __timekeeping_inject_sleeptime(tk, &ts);
743 } 720 }
744 /* re-base the last cycle value */ 721 /* re-base the last cycle value */
745 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 722 tk->clock->cycle_last = tk->clock->read(tk->clock);
746 timekeeper.ntp_error = 0; 723 tk->ntp_error = 0;
747 timekeeping_suspended = 0; 724 timekeeping_suspended = 0;
748 timekeeping_update(&timekeeper, false); 725 timekeeping_update(tk, false);
749 write_sequnlock_irqrestore(&timekeeper.lock, flags); 726 write_sequnlock_irqrestore(&tk->lock, flags);
750 727
751 touch_softlockup_watchdog(); 728 touch_softlockup_watchdog();
752 729
@@ -758,14 +735,15 @@ static void timekeeping_resume(void)
758 735
759static int timekeeping_suspend(void) 736static int timekeeping_suspend(void)
760{ 737{
738 struct timekeeper *tk = &timekeeper;
761 unsigned long flags; 739 unsigned long flags;
762 struct timespec delta, delta_delta; 740 struct timespec delta, delta_delta;
763 static struct timespec old_delta; 741 static struct timespec old_delta;
764 742
765 read_persistent_clock(&timekeeping_suspend_time); 743 read_persistent_clock(&timekeeping_suspend_time);
766 744
767 write_seqlock_irqsave(&timekeeper.lock, flags); 745 write_seqlock_irqsave(&tk->lock, flags);
768 timekeeping_forward_now(&timekeeper); 746 timekeeping_forward_now(tk);
769 timekeeping_suspended = 1; 747 timekeeping_suspended = 1;
770 748
771 /* 749 /*
@@ -774,7 +752,7 @@ static int timekeeping_suspend(void)
774 * try to compensate so the difference in system time 752 * try to compensate so the difference in system time
775 * and persistent_clock time stays close to constant. 753 * and persistent_clock time stays close to constant.
776 */ 754 */
777 delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); 755 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
778 delta_delta = timespec_sub(delta, old_delta); 756 delta_delta = timespec_sub(delta, old_delta);
779 if (abs(delta_delta.tv_sec) >= 2) { 757 if (abs(delta_delta.tv_sec) >= 2) {
780 /* 758 /*
@@ -787,7 +765,7 @@ static int timekeeping_suspend(void)
787 timekeeping_suspend_time = 765 timekeeping_suspend_time =
788 timespec_add(timekeeping_suspend_time, delta_delta); 766 timespec_add(timekeeping_suspend_time, delta_delta);
789 } 767 }
790 write_sequnlock_irqrestore(&timekeeper.lock, flags); 768 write_sequnlock_irqrestore(&tk->lock, flags);
791 769
792 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 770 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
793 clocksource_suspend(); 771 clocksource_suspend();
@@ -898,27 +876,29 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
898 * the error. This causes the likely below to be unlikely. 876 * the error. This causes the likely below to be unlikely.
899 * 877 *
900 * The proper fix is to avoid rounding up by using 878 * The proper fix is to avoid rounding up by using
901 * the high precision timekeeper.xtime_nsec instead of 879 * the high precision tk->xtime_nsec instead of
902 * xtime.tv_nsec everywhere. Fixing this will take some 880 * xtime.tv_nsec everywhere. Fixing this will take some
903 * time. 881 * time.
904 */ 882 */
905 if (likely(error <= interval)) 883 if (likely(error <= interval))
906 adj = 1; 884 adj = 1;
907 else 885 else
908 adj = timekeeping_bigadjust(tk, error, &interval, 886 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
909 &offset); 887 } else {
910 } else if (error < -interval) { 888 if (error < -interval) {
911 /* See comment above, this is just switched for the negative */ 889 /* See comment above, this is just switched for the negative */
912 error >>= 2; 890 error >>= 2;
913 if (likely(error >= -interval)) { 891 if (likely(error >= -interval)) {
914 adj = -1; 892 adj = -1;
915 interval = -interval; 893 interval = -interval;
916 offset = -offset; 894 offset = -offset;
917 } else 895 } else {
918 adj = timekeeping_bigadjust(tk, error, &interval, 896 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
919 &offset); 897 }
920 } else 898 } else {
921 return; 899 goto out_adjust;
900 }
901 }
922 902
923 if (unlikely(tk->clock->maxadj && 903 if (unlikely(tk->clock->maxadj &&
924 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 904 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
@@ -981,6 +961,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
981 tk->xtime_nsec -= offset; 961 tk->xtime_nsec -= offset;
982 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 962 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
983 963
964out_adjust:
984 /* 965 /*
985 * It may be possible that when we entered this function, xtime_nsec 966 * It may be possible that when we entered this function, xtime_nsec
986 * was very small. Further, if we're slightly speeding the clocksource 967 * was very small. Further, if we're slightly speeding the clocksource
@@ -1003,7 +984,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1003 984
1004} 985}
1005 986
1006
1007/** 987/**
1008 * accumulate_nsecs_to_secs - Accumulates nsecs into secs 988 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1009 * 989 *
@@ -1024,15 +1004,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1024 1004
1025 /* Figure out if its a leap sec and apply if needed */ 1005 /* Figure out if its a leap sec and apply if needed */
1026 leap = second_overflow(tk->xtime_sec); 1006 leap = second_overflow(tk->xtime_sec);
1027 tk->xtime_sec += leap; 1007 if (unlikely(leap)) {
1028 tk->wall_to_monotonic.tv_sec -= leap; 1008 struct timespec ts;
1029 if (leap) 1009
1030 clock_was_set_delayed(); 1010 tk->xtime_sec += leap;
1031 1011
1012 ts.tv_sec = leap;
1013 ts.tv_nsec = 0;
1014 tk_set_wall_to_mono(tk,
1015 timespec_sub(tk->wall_to_monotonic, ts));
1016
1017 clock_was_set_delayed();
1018 }
1032 } 1019 }
1033} 1020}
1034 1021
1035
1036/** 1022/**
1037 * logarithmic_accumulation - shifted accumulation of cycles 1023 * logarithmic_accumulation - shifted accumulation of cycles
1038 * 1024 *
@@ -1076,6 +1062,32 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1076 return offset; 1062 return offset;
1077} 1063}
1078 1064
1065#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1066static inline void old_vsyscall_fixup(struct timekeeper *tk)
1067{
1068 s64 remainder;
1069
1070 /*
1071 * Store only full nanoseconds into xtime_nsec after rounding
1072 * it up and add the remainder to the error difference.
1073 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1074 * by truncating the remainder in vsyscalls. However, it causes
1075 * additional work to be done in timekeeping_adjust(). Once
1076 * the vsyscall implementations are converted to use xtime_nsec
1077 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1078 * users are removed, this can be killed.
1079 */
1080 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1081 tk->xtime_nsec -= remainder;
1082 tk->xtime_nsec += 1ULL << tk->shift;
1083 tk->ntp_error += remainder << tk->ntp_error_shift;
1084
1085}
1086#else
1087#define old_vsyscall_fixup(tk)
1088#endif
1089
1090
1079 1091
1080/** 1092/**
1081 * update_wall_time - Uses the current clocksource to increment the wall time 1093 * update_wall_time - Uses the current clocksource to increment the wall time
@@ -1084,25 +1096,29 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1084static void update_wall_time(void) 1096static void update_wall_time(void)
1085{ 1097{
1086 struct clocksource *clock; 1098 struct clocksource *clock;
1099 struct timekeeper *tk = &timekeeper;
1087 cycle_t offset; 1100 cycle_t offset;
1088 int shift = 0, maxshift; 1101 int shift = 0, maxshift;
1089 unsigned long flags; 1102 unsigned long flags;
1090 s64 remainder;
1091 1103
1092 write_seqlock_irqsave(&timekeeper.lock, flags); 1104 write_seqlock_irqsave(&tk->lock, flags);
1093 1105
1094 /* Make sure we're fully resumed: */ 1106 /* Make sure we're fully resumed: */
1095 if (unlikely(timekeeping_suspended)) 1107 if (unlikely(timekeeping_suspended))
1096 goto out; 1108 goto out;
1097 1109
1098 clock = timekeeper.clock; 1110 clock = tk->clock;
1099 1111
1100#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1112#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1101 offset = timekeeper.cycle_interval; 1113 offset = tk->cycle_interval;
1102#else 1114#else
1103 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1115 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1104#endif 1116#endif
1105 1117
1118 /* Check if there's really nothing to do */
1119 if (offset < tk->cycle_interval)
1120 goto out;
1121
1106 /* 1122 /*
1107 * With NO_HZ we may have to accumulate many cycle_intervals 1123 * With NO_HZ we may have to accumulate many cycle_intervals
1108 * (think "ticks") worth of time at once. To do this efficiently, 1124 * (think "ticks") worth of time at once. To do this efficiently,
@@ -1111,45 +1127,36 @@ static void update_wall_time(void)
1111 * chunk in one go, and then try to consume the next smaller 1127 * chunk in one go, and then try to consume the next smaller
1112 * doubled multiple. 1128 * doubled multiple.
1113 */ 1129 */
1114 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1130 shift = ilog2(offset) - ilog2(tk->cycle_interval);
1115 shift = max(0, shift); 1131 shift = max(0, shift);
1116 /* Bound shift to one less than what overflows tick_length */ 1132 /* Bound shift to one less than what overflows tick_length */
1117 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1133 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1118 shift = min(shift, maxshift); 1134 shift = min(shift, maxshift);
1119 while (offset >= timekeeper.cycle_interval) { 1135 while (offset >= tk->cycle_interval) {
1120 offset = logarithmic_accumulation(&timekeeper, offset, shift); 1136 offset = logarithmic_accumulation(tk, offset, shift);
1121 if(offset < timekeeper.cycle_interval<<shift) 1137 if (offset < tk->cycle_interval<<shift)
1122 shift--; 1138 shift--;
1123 } 1139 }
1124 1140
1125 /* correct the clock when NTP error is too big */ 1141 /* correct the clock when NTP error is too big */
1126 timekeeping_adjust(&timekeeper, offset); 1142 timekeeping_adjust(tk, offset);
1127
1128 1143
1129 /* 1144 /*
1130 * Store only full nanoseconds into xtime_nsec after rounding 1145 * XXX This can be killed once everyone converts
1131 * it up and add the remainder to the error difference. 1146 * to the new update_vsyscall.
1132 * XXX - This is necessary to avoid small 1ns inconsistnecies caused 1147 */
1133 * by truncating the remainder in vsyscalls. However, it causes 1148 old_vsyscall_fixup(tk);
1134 * additional work to be done in timekeeping_adjust(). Once
1135 * the vsyscall implementations are converted to use xtime_nsec
1136 * (shifted nanoseconds), this can be killed.
1137 */
1138 remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
1139 timekeeper.xtime_nsec -= remainder;
1140 timekeeper.xtime_nsec += 1 << timekeeper.shift;
1141 timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
1142 1149
1143 /* 1150 /*
1144 * Finally, make sure that after the rounding 1151 * Finally, make sure that after the rounding
1145 * xtime_nsec isn't larger than NSEC_PER_SEC 1152 * xtime_nsec isn't larger than NSEC_PER_SEC
1146 */ 1153 */
1147 accumulate_nsecs_to_secs(&timekeeper); 1154 accumulate_nsecs_to_secs(tk);
1148 1155
1149 timekeeping_update(&timekeeper, false); 1156 timekeeping_update(tk, false);
1150 1157
1151out: 1158out:
1152 write_sequnlock_irqrestore(&timekeeper.lock, flags); 1159 write_sequnlock_irqrestore(&tk->lock, flags);
1153 1160
1154} 1161}
1155 1162
@@ -1166,18 +1173,18 @@ out:
1166 */ 1173 */
1167void getboottime(struct timespec *ts) 1174void getboottime(struct timespec *ts)
1168{ 1175{
1176 struct timekeeper *tk = &timekeeper;
1169 struct timespec boottime = { 1177 struct timespec boottime = {
1170 .tv_sec = timekeeper.wall_to_monotonic.tv_sec + 1178 .tv_sec = tk->wall_to_monotonic.tv_sec +
1171 timekeeper.total_sleep_time.tv_sec, 1179 tk->total_sleep_time.tv_sec,
1172 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + 1180 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1173 timekeeper.total_sleep_time.tv_nsec 1181 tk->total_sleep_time.tv_nsec
1174 }; 1182 };
1175 1183
1176 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1184 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1177} 1185}
1178EXPORT_SYMBOL_GPL(getboottime); 1186EXPORT_SYMBOL_GPL(getboottime);
1179 1187
1180
1181/** 1188/**
1182 * get_monotonic_boottime - Returns monotonic time since boot 1189 * get_monotonic_boottime - Returns monotonic time since boot
1183 * @ts: pointer to the timespec to be set 1190 * @ts: pointer to the timespec to be set
@@ -1189,22 +1196,25 @@ EXPORT_SYMBOL_GPL(getboottime);
1189 */ 1196 */
1190void get_monotonic_boottime(struct timespec *ts) 1197void get_monotonic_boottime(struct timespec *ts)
1191{ 1198{
1199 struct timekeeper *tk = &timekeeper;
1192 struct timespec tomono, sleep; 1200 struct timespec tomono, sleep;
1201 s64 nsec;
1193 unsigned int seq; 1202 unsigned int seq;
1194 1203
1195 WARN_ON(timekeeping_suspended); 1204 WARN_ON(timekeeping_suspended);
1196 1205
1197 do { 1206 do {
1198 seq = read_seqbegin(&timekeeper.lock); 1207 seq = read_seqbegin(&tk->lock);
1199 ts->tv_sec = timekeeper.xtime_sec; 1208 ts->tv_sec = tk->xtime_sec;
1200 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 1209 nsec = timekeeping_get_ns(tk);
1201 tomono = timekeeper.wall_to_monotonic; 1210 tomono = tk->wall_to_monotonic;
1202 sleep = timekeeper.total_sleep_time; 1211 sleep = tk->total_sleep_time;
1203 1212
1204 } while (read_seqretry(&timekeeper.lock, seq)); 1213 } while (read_seqretry(&tk->lock, seq));
1205 1214
1206 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1215 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1207 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); 1216 ts->tv_nsec = 0;
1217 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1208} 1218}
1209EXPORT_SYMBOL_GPL(get_monotonic_boottime); 1219EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1210 1220
@@ -1231,31 +1241,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1231 */ 1241 */
1232void monotonic_to_bootbased(struct timespec *ts) 1242void monotonic_to_bootbased(struct timespec *ts)
1233{ 1243{
1234 *ts = timespec_add(*ts, timekeeper.total_sleep_time); 1244 struct timekeeper *tk = &timekeeper;
1245
1246 *ts = timespec_add(*ts, tk->total_sleep_time);
1235} 1247}
1236EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1248EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1237 1249
1238unsigned long get_seconds(void) 1250unsigned long get_seconds(void)
1239{ 1251{
1240 return timekeeper.xtime_sec; 1252 struct timekeeper *tk = &timekeeper;
1253
1254 return tk->xtime_sec;
1241} 1255}
1242EXPORT_SYMBOL(get_seconds); 1256EXPORT_SYMBOL(get_seconds);
1243 1257
1244struct timespec __current_kernel_time(void) 1258struct timespec __current_kernel_time(void)
1245{ 1259{
1246 return tk_xtime(&timekeeper); 1260 struct timekeeper *tk = &timekeeper;
1261
1262 return tk_xtime(tk);
1247} 1263}
1248 1264
1249struct timespec current_kernel_time(void) 1265struct timespec current_kernel_time(void)
1250{ 1266{
1267 struct timekeeper *tk = &timekeeper;
1251 struct timespec now; 1268 struct timespec now;
1252 unsigned long seq; 1269 unsigned long seq;
1253 1270
1254 do { 1271 do {
1255 seq = read_seqbegin(&timekeeper.lock); 1272 seq = read_seqbegin(&tk->lock);
1256 1273
1257 now = tk_xtime(&timekeeper); 1274 now = tk_xtime(tk);
1258 } while (read_seqretry(&timekeeper.lock, seq)); 1275 } while (read_seqretry(&tk->lock, seq));
1259 1276
1260 return now; 1277 return now;
1261} 1278}
@@ -1263,15 +1280,16 @@ EXPORT_SYMBOL(current_kernel_time);
1263 1280
1264struct timespec get_monotonic_coarse(void) 1281struct timespec get_monotonic_coarse(void)
1265{ 1282{
1283 struct timekeeper *tk = &timekeeper;
1266 struct timespec now, mono; 1284 struct timespec now, mono;
1267 unsigned long seq; 1285 unsigned long seq;
1268 1286
1269 do { 1287 do {
1270 seq = read_seqbegin(&timekeeper.lock); 1288 seq = read_seqbegin(&tk->lock);
1271 1289
1272 now = tk_xtime(&timekeeper); 1290 now = tk_xtime(tk);
1273 mono = timekeeper.wall_to_monotonic; 1291 mono = tk->wall_to_monotonic;
1274 } while (read_seqretry(&timekeeper.lock, seq)); 1292 } while (read_seqretry(&tk->lock, seq));
1275 1293
1276 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1294 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1277 now.tv_nsec + mono.tv_nsec); 1295 now.tv_nsec + mono.tv_nsec);
@@ -1300,14 +1318,15 @@ void do_timer(unsigned long ticks)
1300void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1318void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1301 struct timespec *wtom, struct timespec *sleep) 1319 struct timespec *wtom, struct timespec *sleep)
1302{ 1320{
1321 struct timekeeper *tk = &timekeeper;
1303 unsigned long seq; 1322 unsigned long seq;
1304 1323
1305 do { 1324 do {
1306 seq = read_seqbegin(&timekeeper.lock); 1325 seq = read_seqbegin(&tk->lock);
1307 *xtim = tk_xtime(&timekeeper); 1326 *xtim = tk_xtime(tk);
1308 *wtom = timekeeper.wall_to_monotonic; 1327 *wtom = tk->wall_to_monotonic;
1309 *sleep = timekeeper.total_sleep_time; 1328 *sleep = tk->total_sleep_time;
1310 } while (read_seqretry(&timekeeper.lock, seq)); 1329 } while (read_seqretry(&tk->lock, seq));
1311} 1330}
1312 1331
1313#ifdef CONFIG_HIGH_RES_TIMERS 1332#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1321,19 +1340,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1321 */ 1340 */
1322ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1341ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1323{ 1342{
1343 struct timekeeper *tk = &timekeeper;
1324 ktime_t now; 1344 ktime_t now;
1325 unsigned int seq; 1345 unsigned int seq;
1326 u64 secs, nsecs; 1346 u64 secs, nsecs;
1327 1347
1328 do { 1348 do {
1329 seq = read_seqbegin(&timekeeper.lock); 1349 seq = read_seqbegin(&tk->lock);
1330 1350
1331 secs = timekeeper.xtime_sec; 1351 secs = tk->xtime_sec;
1332 nsecs = timekeeping_get_ns(&timekeeper); 1352 nsecs = timekeeping_get_ns(tk);
1333 1353
1334 *offs_real = timekeeper.offs_real; 1354 *offs_real = tk->offs_real;
1335 *offs_boot = timekeeper.offs_boot; 1355 *offs_boot = tk->offs_boot;
1336 } while (read_seqretry(&timekeeper.lock, seq)); 1356 } while (read_seqretry(&tk->lock, seq));
1337 1357
1338 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1358 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1339 now = ktime_sub(now, *offs_real); 1359 now = ktime_sub(now, *offs_real);
@@ -1346,19 +1366,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1346 */ 1366 */
1347ktime_t ktime_get_monotonic_offset(void) 1367ktime_t ktime_get_monotonic_offset(void)
1348{ 1368{
1369 struct timekeeper *tk = &timekeeper;
1349 unsigned long seq; 1370 unsigned long seq;
1350 struct timespec wtom; 1371 struct timespec wtom;
1351 1372
1352 do { 1373 do {
1353 seq = read_seqbegin(&timekeeper.lock); 1374 seq = read_seqbegin(&tk->lock);
1354 wtom = timekeeper.wall_to_monotonic; 1375 wtom = tk->wall_to_monotonic;
1355 } while (read_seqretry(&timekeeper.lock, seq)); 1376 } while (read_seqretry(&tk->lock, seq));
1356 1377
1357 return timespec_to_ktime(wtom); 1378 return timespec_to_ktime(wtom);
1358} 1379}
1359EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1380EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1360 1381
1361
1362/** 1382/**
1363 * xtime_update() - advances the timekeeping infrastructure 1383 * xtime_update() - advances the timekeeping infrastructure
1364 * @ticks: number of ticks, that have elapsed since the last call. 1384 * @ticks: number of ticks, that have elapsed since the last call.
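
The timekeeping hunks above all follow the same read-side pattern: take a local struct timekeeper pointer and copy the wanted fields inside a read_seqbegin()/read_seqretry() loop so that a concurrent update forces a retry. A minimal userspace sketch of that seqlock read pattern, assuming made-up helper names (tk_seq, tk_read_begin, tk_read_retry, tk_store) rather than the kernel API:

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

struct timespec_pair {
    struct timespec xtime;
    struct timespec wall_to_monotonic;
};

static _Atomic unsigned int tk_seq;    /* even: stable, odd: write in progress */
static struct timespec_pair tk_store;  /* data guarded by tk_seq */

static unsigned int tk_read_begin(void)
{
    unsigned int seq;

    /* wait until no writer is active (sequence count is even) */
    while ((seq = atomic_load_explicit(&tk_seq, memory_order_acquire)) & 1)
        ;
    return seq;
}

static int tk_read_retry(unsigned int seq)
{
    atomic_thread_fence(memory_order_acquire);
    return atomic_load_explicit(&tk_seq, memory_order_relaxed) != seq;
}

/* Reader: copy a consistent snapshot, retrying if a writer interleaved. */
static struct timespec_pair tk_snapshot(void)
{
    struct timespec_pair snap;
    unsigned int seq;

    do {
        seq = tk_read_begin();
        snap = tk_store;
    } while (tk_read_retry(seq));

    return snap;
}

int main(void)
{
    struct timespec_pair p = tk_snapshot();

    printf("xtime: %ld.%09ld\n", (long)p.xtime.tv_sec, p.xtime.tv_nsec);
    return 0;
}

A writer in this sketch would bump tk_seq to an odd value before touching tk_store and back to even afterwards; readers spin past odd values and retry whenever the count changed underneath them, which is the behaviour the read_seqbegin()/read_seqretry() pairs above rely on.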
diff --git a/kernel/timer.c b/kernel/timer.c
index 706fe4c53e82..d5de1b2292aa 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1393,13 +1393,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1393 1393
1394#endif 1394#endif
1395 1395
1396#ifndef __alpha__
1397
1398/*
1399 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
1400 * should be moved into arch/i386 instead?
1401 */
1402
1403/** 1396/**
1404 * sys_getpid - return the thread group id of the current process 1397 * sys_getpid - return the thread group id of the current process
1405 * 1398 *
@@ -1455,8 +1448,6 @@ SYSCALL_DEFINE0(getegid)
1455 return from_kgid_munged(current_user_ns(), current_egid()); 1448 return from_kgid_munged(current_user_ns(), current_egid());
1456} 1449}
1457 1450
1458#endif
1459
1460static void process_timeout(unsigned long __data) 1451static void process_timeout(unsigned long __data)
1461{ 1452{
1462 wake_up_process((struct task_struct *)__data); 1453 wake_up_process((struct task_struct *)__data);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a008663d86c8..b4f20fba09fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
312 312
313static int __register_ftrace_function(struct ftrace_ops *ops) 313static int __register_ftrace_function(struct ftrace_ops *ops)
314{ 314{
315 if (ftrace_disabled) 315 if (unlikely(ftrace_disabled))
316 return -ENODEV; 316 return -ENODEV;
317 317
318 if (FTRACE_WARN_ON(ops == &global_ops)) 318 if (FTRACE_WARN_ON(ops == &global_ops))
@@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
4299 4299
4300 mutex_lock(&ftrace_lock); 4300 mutex_lock(&ftrace_lock);
4301 4301
4302 if (unlikely(ftrace_disabled))
4303 goto out_unlock;
4304
4305 ret = __register_ftrace_function(ops); 4302 ret = __register_ftrace_function(ops);
4306 if (!ret) 4303 if (!ret)
4307 ret = ftrace_startup(ops, 0); 4304 ret = ftrace_startup(ops, 0);
4308 4305
4309
4310 out_unlock:
4311 mutex_unlock(&ftrace_lock); 4306 mutex_unlock(&ftrace_lock);
4307
4312 return ret; 4308 return ret;
4313} 4309}
4314EXPORT_SYMBOL_GPL(register_ftrace_function); 4310EXPORT_SYMBOL_GPL(register_ftrace_function);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f765465bffe4..49491fa7daa2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page) 3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
3240 goto out; 3240 goto out;
3241 3241
3242 /* Don't bother swapping if the ring buffer is empty */
3243 if (rb_num_of_entries(cpu_buffer) == 0)
3244 goto out;
3245
3242 /* 3246 /*
3243 * Reset the reader page to size zero. 3247 * Reset the reader page to size zero.
3244 */ 3248 */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a7fa0702be1c..5c38c81496ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -830,6 +830,8 @@ int register_tracer(struct tracer *type)
830 current_trace = saved_tracer; 830 current_trace = saved_tracer;
831 if (ret) { 831 if (ret) {
832 printk(KERN_CONT "FAILED!\n"); 832 printk(KERN_CONT "FAILED!\n");
833 /* Add the warning after printing 'FAILED' */
834 WARN_ON(1);
833 goto out; 835 goto out;
834 } 836 }
835 /* Only reset on passing, to avoid touching corrupted buffers */ 837 /* Only reset on passing, to avoid touching corrupted buffers */
@@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1708 1710
1709static void trace_iterator_increment(struct trace_iterator *iter) 1711static void trace_iterator_increment(struct trace_iterator *iter)
1710{ 1712{
1713 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
1714
1711 iter->idx++; 1715 iter->idx++;
1712 if (iter->buffer_iter[iter->cpu]) 1716 if (buf_iter)
1713 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); 1717 ring_buffer_read(buf_iter, NULL);
1714} 1718}
1715 1719
1716static struct trace_entry * 1720static struct trace_entry *
@@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1718 unsigned long *lost_events) 1722 unsigned long *lost_events)
1719{ 1723{
1720 struct ring_buffer_event *event; 1724 struct ring_buffer_event *event;
1721 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1725 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
1722 1726
1723 if (buf_iter) 1727 if (buf_iter)
1724 event = ring_buffer_iter_peek(buf_iter, ts); 1728 event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1856 1860
1857 tr->data[cpu]->skipped_entries = 0; 1861 tr->data[cpu]->skipped_entries = 0;
1858 1862
1859 if (!iter->buffer_iter[cpu]) 1863 buf_iter = trace_buffer_iter(iter, cpu);
1864 if (!buf_iter)
1860 return; 1865 return;
1861 1866
1862 buf_iter = iter->buffer_iter[cpu];
1863 ring_buffer_iter_reset(buf_iter); 1867 ring_buffer_iter_reset(buf_iter);
1864 1868
1865 /* 1869 /*
@@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2205 2209
2206int trace_empty(struct trace_iterator *iter) 2210int trace_empty(struct trace_iterator *iter)
2207{ 2211{
2212 struct ring_buffer_iter *buf_iter;
2208 int cpu; 2213 int cpu;
2209 2214
2210 /* If we are looking at one CPU buffer, only check that one */ 2215 /* If we are looking at one CPU buffer, only check that one */
2211 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2216 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
2212 cpu = iter->cpu_file; 2217 cpu = iter->cpu_file;
2213 if (iter->buffer_iter[cpu]) { 2218 buf_iter = trace_buffer_iter(iter, cpu);
2214 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2219 if (buf_iter) {
2220 if (!ring_buffer_iter_empty(buf_iter))
2215 return 0; 2221 return 0;
2216 } else { 2222 } else {
2217 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2223 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter)
2221 } 2227 }
2222 2228
2223 for_each_tracing_cpu(cpu) { 2229 for_each_tracing_cpu(cpu) {
2224 if (iter->buffer_iter[cpu]) { 2230 buf_iter = trace_buffer_iter(iter, cpu);
2225 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2231 if (buf_iter) {
2232 if (!ring_buffer_iter_empty(buf_iter))
2226 return 0; 2233 return 0;
2227 } else { 2234 } else {
2228 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2235 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file)
2381 if (!iter) 2388 if (!iter)
2382 return ERR_PTR(-ENOMEM); 2389 return ERR_PTR(-ENOMEM);
2383 2390
2391 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
2392 GFP_KERNEL);
2393 if (!iter->buffer_iter)
2394 goto release;
2395
2384 /* 2396 /*
2385 * We make a copy of the current tracer to avoid concurrent 2397 * We make a copy of the current tracer to avoid concurrent
2386 * changes on it while we are reading. 2398 * changes on it while we are reading.
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file)
2441 fail: 2453 fail:
2442 mutex_unlock(&trace_types_lock); 2454 mutex_unlock(&trace_types_lock);
2443 kfree(iter->trace); 2455 kfree(iter->trace);
2456 kfree(iter->buffer_iter);
2457release:
2444 seq_release_private(inode, file); 2458 seq_release_private(inode, file);
2445 return ERR_PTR(-ENOMEM); 2459 return ERR_PTR(-ENOMEM);
2446} 2460}
@@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2481 mutex_destroy(&iter->mutex); 2495 mutex_destroy(&iter->mutex);
2482 free_cpumask_var(iter->started); 2496 free_cpumask_var(iter->started);
2483 kfree(iter->trace); 2497 kfree(iter->trace);
2498 kfree(iter->buffer_iter);
2484 seq_release_private(inode, file); 2499 seq_release_private(inode, file);
2485 return 0; 2500 return 0;
2486} 2501}
@@ -3172,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
3172 } 3187 }
3173 destroy_trace_option_files(topts); 3188 destroy_trace_option_files(topts);
3174 3189
3175 current_trace = t; 3190 current_trace = &nop_trace;
3176 3191
3177 topts = create_trace_option_files(current_trace); 3192 topts = create_trace_option_files(t);
3178 if (current_trace->use_max_tr) { 3193 if (t->use_max_tr) {
3179 int cpu; 3194 int cpu;
3180 /* we need to make per cpu buffer sizes equivalent */ 3195 /* we need to make per cpu buffer sizes equivalent */
3181 for_each_tracing_cpu(cpu) { 3196 for_each_tracing_cpu(cpu) {
@@ -3195,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
3195 goto out; 3210 goto out;
3196 } 3211 }
3197 3212
3213 current_trace = t;
3198 trace_branch_enable(tr); 3214 trace_branch_enable(tr);
3199 out: 3215 out:
3200 mutex_unlock(&trace_types_lock); 3216 mutex_unlock(&trace_types_lock);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5aec220d2de0..55e1f7f0db12 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -317,6 +317,14 @@ struct tracer {
317 317
318#define TRACE_PIPE_ALL_CPU -1 318#define TRACE_PIPE_ALL_CPU -1
319 319
320static inline struct ring_buffer_iter *
321trace_buffer_iter(struct trace_iterator *iter, int cpu)
322{
323 if (iter->buffer_iter && iter->buffer_iter[cpu])
324 return iter->buffer_iter[cpu];
325 return NULL;
326}
327
320int tracer_init(struct tracer *t, struct trace_array *tr); 328int tracer_init(struct tracer *t, struct trace_array *tr);
321int tracing_is_enabled(void); 329int tracing_is_enabled(void);
322void trace_wake_up(void); 330void trace_wake_up(void);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index fee3752ae8f6..8a6d2ee2086c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
281 281
282 head = this_cpu_ptr(event_function.perf_events); 282 head = this_cpu_ptr(event_function.perf_events);
283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
284 1, &regs, head); 284 1, &regs, head, NULL);
285 285
286#undef ENTRY_SIZE 286#undef ENTRY_SIZE
287} 287}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c7b0c6a7db09..a426f410c060 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
@@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
74 preempt_enable_notrace(); 75 preempt_enable_notrace();
75} 76}
76 77
78/* Our two options */
79enum {
80 TRACE_FUNC_OPT_STACK = 0x1,
81 TRACE_FUNC_OPT_PSTORE = 0x2,
82};
83
84static struct tracer_flags func_flags;
85
77static void 86static void
78function_trace_call(unsigned long ip, unsigned long parent_ip) 87function_trace_call(unsigned long ip, unsigned long parent_ip)
79{ 88{
@@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
97 disabled = atomic_inc_return(&data->disabled); 106 disabled = atomic_inc_return(&data->disabled);
98 107
99 if (likely(disabled == 1)) { 108 if (likely(disabled == 1)) {
109 /*
110 * So far tracing doesn't support multiple buffers, so
111 * we make an explicit call for now.
112 */
113 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
114 pstore_ftrace_call(ip, parent_ip);
100 pc = preempt_count(); 115 pc = preempt_count();
101 trace_function(tr, ip, parent_ip, flags, pc); 116 trace_function(tr, ip, parent_ip, flags, pc);
102 } 117 }
@@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
158 .flags = FTRACE_OPS_FL_GLOBAL, 173 .flags = FTRACE_OPS_FL_GLOBAL,
159}; 174};
160 175
161/* Our two options */
162enum {
163 TRACE_FUNC_OPT_STACK = 0x1,
164};
165
166static struct tracer_opt func_opts[] = { 176static struct tracer_opt func_opts[] = {
167#ifdef CONFIG_STACKTRACE 177#ifdef CONFIG_STACKTRACE
168 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 178 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
169#endif 179#endif
180#ifdef CONFIG_PSTORE_FTRACE
181 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
182#endif
170 { } /* Always set a last empty entry */ 183 { } /* Always set a last empty entry */
171}; 184};
172 185
@@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void)
204 217
205static int func_set_flag(u32 old_flags, u32 bit, int set) 218static int func_set_flag(u32 old_flags, u32 bit, int set)
206{ 219{
207 if (bit == TRACE_FUNC_OPT_STACK) { 220 switch (bit) {
221 case TRACE_FUNC_OPT_STACK:
208 /* do nothing if already set */ 222 /* do nothing if already set */
209 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 223 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
210 return 0; 224 break;
211 225
212 if (set) { 226 if (set) {
213 unregister_ftrace_function(&trace_ops); 227 unregister_ftrace_function(&trace_ops);
@@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
217 register_ftrace_function(&trace_ops); 231 register_ftrace_function(&trace_ops);
218 } 232 }
219 233
220 return 0; 234 break;
235 case TRACE_FUNC_OPT_PSTORE:
236 break;
237 default:
238 return -EINVAL;
221 } 239 }
222 240
223 return -EINVAL; 241 return 0;
224} 242}
225 243
226static struct tracer function_trace __read_mostly = 244static struct tracer function_trace __read_mostly =
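
func_set_flag() above ends up distinguishing two kinds of options: TRACE_FUNC_OPT_STACK swaps the registered callback, while TRACE_FUNC_OPT_PSTORE only records a bit that function_trace_call() consults at trace time. A hedged userspace sketch of that split, with every name here (opt_val, set_flag, trace_call, ...) invented for illustration rather than taken from the kernel:

#include <stdio.h>

enum {
    OPT_STACK  = 0x1,   /* swap in the stack-tracing callback */
    OPT_PSTORE = 0x2,   /* only checked lazily in the trace path */
};

static unsigned int opt_val;

static void plain_call(void) { printf("trace\n"); }
static void stack_call(void) { printf("trace + stack\n"); }

static void (*active_call)(void) = plain_call;

static int set_flag(unsigned int bit, int set)
{
    switch (bit) {
    case OPT_STACK:
        /* nothing to do if the bit already has the requested value */
        if (!!set == !!(opt_val & OPT_STACK))
            break;
        active_call = set ? stack_call : plain_call;
        break;
    case OPT_PSTORE:
        break;              /* just record the bit below */
    default:
        return -1;
    }

    if (set)
        opt_val |= bit;
    else
        opt_val &= ~bit;
    return 0;
}

static void trace_call(void)
{
    if (opt_val & OPT_PSTORE)
        printf("(also mirrored to pstore)\n");
    active_call();
}

int main(void)
{
    trace_call();
    set_flag(OPT_STACK, 1);
    set_flag(OPT_PSTORE, 1);
    trace_call();
    return 0;
}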
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7d2a4c653d8..ce27c8ba8d31 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
538 next = &data->ret; 538 next = &data->ret;
539 } else { 539 } else {
540 540
541 ring_iter = iter->buffer_iter[iter->cpu]; 541 ring_iter = trace_buffer_iter(iter, iter->cpu);
542 542
543 /* First peek to compare current entry and the next one */ 543 /* First peek to compare current entry and the next one */
544 if (ring_iter) 544 if (ring_iter)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5699fe..1a2117043bb1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1003 1003
1004 head = this_cpu_ptr(call->perf_events); 1004 head = this_cpu_ptr(call->perf_events);
1005 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1005 perf_trace_buf_submit(entry, size, rctx,
1006 entry->ip, 1, regs, head, NULL);
1006} 1007}
1007 1008
1008/* Kretprobe profile handler */ 1009/* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1033 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1034 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1034 1035
1035 head = this_cpu_ptr(call->perf_events); 1036 head = this_cpu_ptr(call->perf_events);
1036 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1037 perf_trace_buf_submit(entry, size, rctx,
1038 entry->ret_ip, 1, regs, head, NULL);
1037} 1039}
1038#endif /* CONFIG_PERF_EVENTS */ 1040#endif /* CONFIG_PERF_EVENTS */
1039 1041
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df611a0e76c5..123b189c732c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1325,4 +1325,4 @@ __init static int init_events(void)
1325 1325
1326 return 0; 1326 return 0;
1327} 1327}
1328device_initcall(init_events); 1328early_initcall(init_events);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc73369099..6b245f64c8dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
506 int size; 506 int size;
507 507
508 syscall_nr = syscall_get_nr(current, regs); 508 syscall_nr = syscall_get_nr(current, regs);
509 if (syscall_nr < 0)
510 return;
509 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 511 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
510 return; 512 return;
511 513
@@ -532,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
532 (unsigned long *)&rec->args); 534 (unsigned long *)&rec->args);
533 535
534 head = this_cpu_ptr(sys_data->enter_event->perf_events); 536 head = this_cpu_ptr(sys_data->enter_event->perf_events);
535 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
536} 538}
537 539
538int perf_sysenter_enable(struct ftrace_event_call *call) 540int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
580 int size; 582 int size;
581 583
582 syscall_nr = syscall_get_nr(current, regs); 584 syscall_nr = syscall_get_nr(current, regs);
585 if (syscall_nr < 0)
586 return;
583 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 587 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
584 return; 588 return;
585 589
@@ -608,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
608 rec->ret = syscall_get_return_value(current, regs); 612 rec->ret = syscall_get_return_value(current, regs);
609 613
610 head = this_cpu_ptr(sys_data->exit_event->perf_events); 614 head = this_cpu_ptr(sys_data->exit_event->perf_events);
611 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
612} 616}
613 617
614int perf_sysexit_enable(struct ftrace_event_call *call) 618int perf_sysexit_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2b36ac68549e..03003cd7dd96 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671 671
672 head = this_cpu_ptr(call->perf_events); 672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
674 674
675 out: 675 out:
676 preempt_enable(); 676 preempt_enable();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..1e1373bcb3e3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,42 @@
45#include "workqueue_sched.h" 45#include "workqueue_sched.h"
46 46
47enum { 47enum {
48 /* global_cwq flags */ 48 /*
49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 49 * global_cwq flags
50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ 50 *
51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 51 * A bound gcwq is either associated or disassociated with its CPU.
52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */ 52 * While associated (!DISASSOCIATED), all workers are bound to the
53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ 53 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect.
55 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 *
60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress.
63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
54 70
55 /* worker flags */ 71 /* worker flags */
56 WORKER_STARTED = 1 << 0, /* started */ 72 WORKER_STARTED = 1 << 0, /* started */
57 WORKER_DIE = 1 << 1, /* die die die */ 73 WORKER_DIE = 1 << 1, /* die die die */
58 WORKER_IDLE = 1 << 2, /* is idle */ 74 WORKER_IDLE = 1 << 2, /* is idle */
59 WORKER_PREP = 1 << 3, /* preparing to run works */ 75 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */ 76 WORKER_REBIND = 1 << 5, /* mom is home, come back */
62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 77 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 78 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
64 79
65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | 80 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND, 81 WORKER_CPU_INTENSIVE,
67 82
68 /* gcwq->trustee_state */ 83 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
74 84
75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 85 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, 86 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +94,13 @@ enum {
84 (min two ticks) */ 94 (min two ticks) */
85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 95 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
86 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 96 CREATE_COOLDOWN = HZ, /* time to breath after fail */
87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
88 97
89 /* 98 /*
90 * Rescue workers are used only on emergencies and shared by 99 * Rescue workers are used only on emergencies and shared by
91 * all cpus. Give -20. 100 * all cpus. Give -20.
92 */ 101 */
93 RESCUER_NICE_LEVEL = -20, 102 RESCUER_NICE_LEVEL = -20,
103 HIGHPRI_NICE_LEVEL = -20,
94}; 104};
95 105
96/* 106/*
@@ -115,6 +125,8 @@ enum {
115 */ 125 */
116 126
117struct global_cwq; 127struct global_cwq;
128struct worker_pool;
129struct idle_rebind;
118 130
119/* 131/*
120 * The poor guys doing the actual heavy lifting. All on-duty workers 132 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -131,12 +143,31 @@ struct worker {
131 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ 143 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
132 struct list_head scheduled; /* L: scheduled works */ 144 struct list_head scheduled; /* L: scheduled works */
133 struct task_struct *task; /* I: worker task */ 145 struct task_struct *task; /* I: worker task */
134 struct global_cwq *gcwq; /* I: the associated gcwq */ 146 struct worker_pool *pool; /* I: the associated pool */
135 /* 64 bytes boundary on 64bit, 32 on 32bit */ 147 /* 64 bytes boundary on 64bit, 32 on 32bit */
136 unsigned long last_active; /* L: last active timestamp */ 148 unsigned long last_active; /* L: last active timestamp */
137 unsigned int flags; /* X: flags */ 149 unsigned int flags; /* X: flags */
138 int id; /* I: worker id */ 150 int id; /* I: worker id */
139 struct work_struct rebind_work; /* L: rebind worker to cpu */ 151
152 /* for rebinding worker to CPU */
153 struct idle_rebind *idle_rebind; /* L: for idle worker */
154 struct work_struct rebind_work; /* L: for busy worker */
155};
156
157struct worker_pool {
158 struct global_cwq *gcwq; /* I: the owning gcwq */
159 unsigned int flags; /* X: flags */
160
161 struct list_head worklist; /* L: list of pending works */
162 int nr_workers; /* L: total number of workers */
163 int nr_idle; /* L: currently idle ones */
164
165 struct list_head idle_list; /* X: list of idle workers */
166 struct timer_list idle_timer; /* L: worker idle timeout */
167 struct timer_list mayday_timer; /* L: SOS timer for workers */
168
169 struct mutex manager_mutex; /* mutex manager should hold */
170 struct ida worker_ida; /* L: for worker IDs */
140}; 171};
141 172
142/* 173/*
@@ -146,27 +177,16 @@ struct worker {
146 */ 177 */
147struct global_cwq { 178struct global_cwq {
148 spinlock_t lock; /* the gcwq lock */ 179 spinlock_t lock; /* the gcwq lock */
149 struct list_head worklist; /* L: list of pending works */
150 unsigned int cpu; /* I: the associated cpu */ 180 unsigned int cpu; /* I: the associated cpu */
151 unsigned int flags; /* L: GCWQ_* flags */ 181 unsigned int flags; /* L: GCWQ_* flags */
152 182
153 int nr_workers; /* L: total number of workers */ 183 /* workers are chained either in busy_hash or pool idle_list */
154 int nr_idle; /* L: currently idle ones */
155
156 /* workers are chained either in the idle_list or busy_hash */
157 struct list_head idle_list; /* X: list of idle workers */
158 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 184 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
159 /* L: hash of busy workers */ 185 /* L: hash of busy workers */
160 186
161 struct timer_list idle_timer; /* L: worker idle timeout */ 187 struct worker_pool pools[2]; /* normal and highpri pools */
162 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
163
164 struct ida worker_ida; /* L: for worker IDs */
165 188
166 struct task_struct *trustee; /* L: for gcwq shutdown */ 189 wait_queue_head_t rebind_hold; /* rebind hold wait */
167 unsigned int trustee_state; /* L: trustee state */
168 wait_queue_head_t trustee_wait; /* trustee wait */
169 struct worker *first_idle; /* L: first idle worker */
170} ____cacheline_aligned_in_smp; 190} ____cacheline_aligned_in_smp;
171 191
172/* 192/*
@@ -175,7 +195,7 @@ struct global_cwq {
175 * aligned at two's power of the number of flag bits. 195 * aligned at two's power of the number of flag bits.
176 */ 196 */
177struct cpu_workqueue_struct { 197struct cpu_workqueue_struct {
178 struct global_cwq *gcwq; /* I: the associated gcwq */ 198 struct worker_pool *pool; /* I: the associated pool */
179 struct workqueue_struct *wq; /* I: the owning workqueue */ 199 struct workqueue_struct *wq; /* I: the owning workqueue */
180 int work_color; /* L: current color */ 200 int work_color; /* L: current color */
181 int flush_color; /* L: flushing color */ 201 int flush_color; /* L: flushing color */
@@ -264,6 +284,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
264#define CREATE_TRACE_POINTS 284#define CREATE_TRACE_POINTS
265#include <trace/events/workqueue.h> 285#include <trace/events/workqueue.h>
266 286
287#define for_each_worker_pool(pool, gcwq) \
288 for ((pool) = &(gcwq)->pools[0]; \
289 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
290
267#define for_each_busy_worker(worker, i, pos, gcwq) \ 291#define for_each_busy_worker(worker, i, pos, gcwq) \
268 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 292 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
269 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 293 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +468,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
444 * try_to_wake_up(). Put it in a separate cacheline. 468 * try_to_wake_up(). Put it in a separate cacheline.
445 */ 469 */
446static DEFINE_PER_CPU(struct global_cwq, global_cwq); 470static DEFINE_PER_CPU(struct global_cwq, global_cwq);
447static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); 471static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
448 472
449/* 473/*
450 * Global cpu workqueue and nr_running counter for unbound gcwq. The 474 * Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -452,10 +476,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
452 * workers have WORKER_UNBOUND set. 476 * workers have WORKER_UNBOUND set.
453 */ 477 */
454static struct global_cwq unbound_global_cwq; 478static struct global_cwq unbound_global_cwq;
455static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ 479static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
480 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
481};
456 482
457static int worker_thread(void *__worker); 483static int worker_thread(void *__worker);
458 484
485static int worker_pool_pri(struct worker_pool *pool)
486{
487 return pool - pool->gcwq->pools;
488}
489
459static struct global_cwq *get_gcwq(unsigned int cpu) 490static struct global_cwq *get_gcwq(unsigned int cpu)
460{ 491{
461 if (cpu != WORK_CPU_UNBOUND) 492 if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +495,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
464 return &unbound_global_cwq; 495 return &unbound_global_cwq;
465} 496}
466 497
467static atomic_t *get_gcwq_nr_running(unsigned int cpu) 498static atomic_t *get_pool_nr_running(struct worker_pool *pool)
468{ 499{
500 int cpu = pool->gcwq->cpu;
501 int idx = worker_pool_pri(pool);
502
469 if (cpu != WORK_CPU_UNBOUND) 503 if (cpu != WORK_CPU_UNBOUND)
470 return &per_cpu(gcwq_nr_running, cpu); 504 return &per_cpu(pool_nr_running, cpu)[idx];
471 else 505 else
472 return &unbound_gcwq_nr_running; 506 return &unbound_pool_nr_running[idx];
473} 507}
474 508
475static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 509static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
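
worker_pool_pri() above recovers a pool's index (0 = normal, 1 = highpri) by subtracting the base of the gcwq's embedded pools[] array, and for_each_worker_pool() walks that array. A compilable sketch of the same layout under simplified, made-up struct names:

#include <stdio.h>

#define NR_POOLS 2   /* normal and highpri, like NR_WORKER_POOLS */

struct gcwq_sim;

struct pool_sim {
    struct gcwq_sim *gcwq;
    int nr_workers;
};

struct gcwq_sim {
    struct pool_sim pools[NR_POOLS];
};

/* index of a pool inside its owning gcwq, via pointer subtraction */
static int pool_pri(struct pool_sim *pool)
{
    return pool - pool->gcwq->pools;
}

#define for_each_pool(pool, gcwq) \
    for ((pool) = &(gcwq)->pools[0]; \
         (pool) < &(gcwq)->pools[NR_POOLS]; (pool)++)

int main(void)
{
    struct gcwq_sim gcwq = { 0 };
    struct pool_sim *pool;

    for_each_pool(pool, &gcwq)
        pool->gcwq = &gcwq;

    for_each_pool(pool, &gcwq)
        printf("pool %d (%s)\n", pool_pri(pool),
               pool_pri(pool) ? "highpri" : "normal");
    return 0;
}

The same index is what get_pool_nr_running() above uses to pick the per-CPU (or unbound) nr_running counter for a given pool.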
@@ -555,7 +589,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
555 589
556 if (data & WORK_STRUCT_CWQ) 590 if (data & WORK_STRUCT_CWQ)
557 return ((struct cpu_workqueue_struct *) 591 return ((struct cpu_workqueue_struct *)
558 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; 592 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
559 593
560 cpu = data >> WORK_STRUCT_FLAG_BITS; 594 cpu = data >> WORK_STRUCT_FLAG_BITS;
561 if (cpu == WORK_CPU_NONE) 595 if (cpu == WORK_CPU_NONE)
@@ -566,60 +600,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
566} 600}
567 601
568/* 602/*
569 * Policy functions. These define the policies on how the global 603 * Policy functions. These define the policies on how the global worker
570 * worker pool is managed. Unless noted otherwise, these functions 604 * pools are managed. Unless noted otherwise, these functions assume that
571 * assume that they're being called with gcwq->lock held. 605 * they're being called with gcwq->lock held.
572 */ 606 */
573 607
574static bool __need_more_worker(struct global_cwq *gcwq) 608static bool __need_more_worker(struct worker_pool *pool)
575{ 609{
576 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || 610 return !atomic_read(get_pool_nr_running(pool));
577 gcwq->flags & GCWQ_HIGHPRI_PENDING;
578} 611}
579 612
580/* 613/*
581 * Need to wake up a worker? Called from anything but currently 614 * Need to wake up a worker? Called from anything but currently
582 * running workers. 615 * running workers.
616 *
617 * Note that, because unbound workers never contribute to nr_running, this
618 * function will always return %true for unbound gcwq as long as the
619 * worklist isn't empty.
583 */ 620 */
584static bool need_more_worker(struct global_cwq *gcwq) 621static bool need_more_worker(struct worker_pool *pool)
585{ 622{
586 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); 623 return !list_empty(&pool->worklist) && __need_more_worker(pool);
587} 624}
588 625
589/* Can I start working? Called from busy but !running workers. */ 626/* Can I start working? Called from busy but !running workers. */
590static bool may_start_working(struct global_cwq *gcwq) 627static bool may_start_working(struct worker_pool *pool)
591{ 628{
592 return gcwq->nr_idle; 629 return pool->nr_idle;
593} 630}
594 631
595/* Do I need to keep working? Called from currently running workers. */ 632/* Do I need to keep working? Called from currently running workers. */
596static bool keep_working(struct global_cwq *gcwq) 633static bool keep_working(struct worker_pool *pool)
597{ 634{
598 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 635 atomic_t *nr_running = get_pool_nr_running(pool);
599 636
600 return !list_empty(&gcwq->worklist) && 637 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
601 (atomic_read(nr_running) <= 1 ||
602 gcwq->flags & GCWQ_HIGHPRI_PENDING);
603} 638}
604 639
605/* Do we need a new worker? Called from manager. */ 640/* Do we need a new worker? Called from manager. */
606static bool need_to_create_worker(struct global_cwq *gcwq) 641static bool need_to_create_worker(struct worker_pool *pool)
607{ 642{
608 return need_more_worker(gcwq) && !may_start_working(gcwq); 643 return need_more_worker(pool) && !may_start_working(pool);
609} 644}
610 645
611/* Do I need to be the manager? */ 646/* Do I need to be the manager? */
612static bool need_to_manage_workers(struct global_cwq *gcwq) 647static bool need_to_manage_workers(struct worker_pool *pool)
613{ 648{
614 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; 649 return need_to_create_worker(pool) ||
650 (pool->flags & POOL_MANAGE_WORKERS);
615} 651}
616 652
617/* Do we have too many workers and should some go away? */ 653/* Do we have too many workers and should some go away? */
618static bool too_many_workers(struct global_cwq *gcwq) 654static bool too_many_workers(struct worker_pool *pool)
619{ 655{
620 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; 656 bool managing = pool->flags & POOL_MANAGING_WORKERS;
621 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ 657 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
622 int nr_busy = gcwq->nr_workers - nr_idle; 658 int nr_busy = pool->nr_workers - nr_idle;
623 659
624 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 660 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
625} 661}
@@ -629,26 +665,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
629 */ 665 */
630 666
631/* Return the first worker. Safe with preemption disabled */ 667/* Return the first worker. Safe with preemption disabled */
632static struct worker *first_worker(struct global_cwq *gcwq) 668static struct worker *first_worker(struct worker_pool *pool)
633{ 669{
634 if (unlikely(list_empty(&gcwq->idle_list))) 670 if (unlikely(list_empty(&pool->idle_list)))
635 return NULL; 671 return NULL;
636 672
637 return list_first_entry(&gcwq->idle_list, struct worker, entry); 673 return list_first_entry(&pool->idle_list, struct worker, entry);
638} 674}
639 675
640/** 676/**
641 * wake_up_worker - wake up an idle worker 677 * wake_up_worker - wake up an idle worker
642 * @gcwq: gcwq to wake worker for 678 * @pool: worker pool to wake worker from
643 * 679 *
644 * Wake up the first idle worker of @gcwq. 680 * Wake up the first idle worker of @pool.
645 * 681 *
646 * CONTEXT: 682 * CONTEXT:
647 * spin_lock_irq(gcwq->lock). 683 * spin_lock_irq(gcwq->lock).
648 */ 684 */
649static void wake_up_worker(struct global_cwq *gcwq) 685static void wake_up_worker(struct worker_pool *pool)
650{ 686{
651 struct worker *worker = first_worker(gcwq); 687 struct worker *worker = first_worker(pool);
652 688
653 if (likely(worker)) 689 if (likely(worker))
654 wake_up_process(worker->task); 690 wake_up_process(worker->task);
@@ -670,7 +706,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
670 struct worker *worker = kthread_data(task); 706 struct worker *worker = kthread_data(task);
671 707
672 if (!(worker->flags & WORKER_NOT_RUNNING)) 708 if (!(worker->flags & WORKER_NOT_RUNNING))
673 atomic_inc(get_gcwq_nr_running(cpu)); 709 atomic_inc(get_pool_nr_running(worker->pool));
674} 710}
675 711
676/** 712/**
@@ -692,8 +728,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
692 unsigned int cpu) 728 unsigned int cpu)
693{ 729{
694 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 730 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
695 struct global_cwq *gcwq = get_gcwq(cpu); 731 struct worker_pool *pool = worker->pool;
696 atomic_t *nr_running = get_gcwq_nr_running(cpu); 732 atomic_t *nr_running = get_pool_nr_running(pool);
697 733
698 if (worker->flags & WORKER_NOT_RUNNING) 734 if (worker->flags & WORKER_NOT_RUNNING)
699 return NULL; 735 return NULL;
@@ -706,14 +742,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
706 * worklist not empty test sequence is in insert_work(). 742 * worklist not empty test sequence is in insert_work().
707 * Please read comment there. 743 * Please read comment there.
708 * 744 *
709 * NOT_RUNNING is clear. This means that trustee is not in 745 * NOT_RUNNING is clear. This means that we're bound to and
710 * charge and we're running on the local cpu w/ rq lock held 746 * running on the local cpu w/ rq lock held and preemption
711 * and preemption disabled, which in turn means that none else 747 * disabled, which in turn means that none else could be
712 * could be manipulating idle_list, so dereferencing idle_list 748 * manipulating idle_list, so dereferencing idle_list without gcwq
713 * without gcwq lock is safe. 749 * lock is safe.
714 */ 750 */
715 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) 751 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
716 to_wakeup = first_worker(gcwq); 752 to_wakeup = first_worker(pool);
717 return to_wakeup ? to_wakeup->task : NULL; 753 return to_wakeup ? to_wakeup->task : NULL;
718} 754}
719 755
@@ -733,7 +769,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
733static inline void worker_set_flags(struct worker *worker, unsigned int flags, 769static inline void worker_set_flags(struct worker *worker, unsigned int flags,
734 bool wakeup) 770 bool wakeup)
735{ 771{
736 struct global_cwq *gcwq = worker->gcwq; 772 struct worker_pool *pool = worker->pool;
737 773
738 WARN_ON_ONCE(worker->task != current); 774 WARN_ON_ONCE(worker->task != current);
739 775
@@ -744,12 +780,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
744 */ 780 */
745 if ((flags & WORKER_NOT_RUNNING) && 781 if ((flags & WORKER_NOT_RUNNING) &&
746 !(worker->flags & WORKER_NOT_RUNNING)) { 782 !(worker->flags & WORKER_NOT_RUNNING)) {
747 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 783 atomic_t *nr_running = get_pool_nr_running(pool);
748 784
749 if (wakeup) { 785 if (wakeup) {
750 if (atomic_dec_and_test(nr_running) && 786 if (atomic_dec_and_test(nr_running) &&
751 !list_empty(&gcwq->worklist)) 787 !list_empty(&pool->worklist))
752 wake_up_worker(gcwq); 788 wake_up_worker(pool);
753 } else 789 } else
754 atomic_dec(nr_running); 790 atomic_dec(nr_running);
755 } 791 }
@@ -769,7 +805,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
769 */ 805 */
770static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 806static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
771{ 807{
772 struct global_cwq *gcwq = worker->gcwq; 808 struct worker_pool *pool = worker->pool;
773 unsigned int oflags = worker->flags; 809 unsigned int oflags = worker->flags;
774 810
775 WARN_ON_ONCE(worker->task != current); 811 WARN_ON_ONCE(worker->task != current);
@@ -783,7 +819,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
783 */ 819 */
784 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 820 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
785 if (!(worker->flags & WORKER_NOT_RUNNING)) 821 if (!(worker->flags & WORKER_NOT_RUNNING))
786 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 822 atomic_inc(get_pool_nr_running(pool));
787} 823}
788 824
789/** 825/**
@@ -867,43 +903,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
867} 903}
868 904
869/** 905/**
870 * gcwq_determine_ins_pos - find insertion position
871 * @gcwq: gcwq of interest
872 * @cwq: cwq a work is being queued for
873 *
874 * A work for @cwq is about to be queued on @gcwq, determine insertion
875 * position for the work. If @cwq is for HIGHPRI wq, the work is
876 * queued at the head of the queue but in FIFO order with respect to
877 * other HIGHPRI works; otherwise, at the end of the queue. This
878 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
879 * there are HIGHPRI works pending.
880 *
881 * CONTEXT:
882 * spin_lock_irq(gcwq->lock).
883 *
884 * RETURNS:
885 * Pointer to inserstion position.
886 */
887static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
888 struct cpu_workqueue_struct *cwq)
889{
890 struct work_struct *twork;
891
892 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
893 return &gcwq->worklist;
894
895 list_for_each_entry(twork, &gcwq->worklist, entry) {
896 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
897
898 if (!(tcwq->wq->flags & WQ_HIGHPRI))
899 break;
900 }
901
902 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
903 return &twork->entry;
904}
905
906/**
907 * insert_work - insert a work into gcwq 906 * insert_work - insert a work into gcwq
908 * @cwq: cwq @work belongs to 907 * @cwq: cwq @work belongs to
909 * @work: work to insert 908 * @work: work to insert
@@ -920,7 +919,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
920 struct work_struct *work, struct list_head *head, 919 struct work_struct *work, struct list_head *head,
921 unsigned int extra_flags) 920 unsigned int extra_flags)
922{ 921{
923 struct global_cwq *gcwq = cwq->gcwq; 922 struct worker_pool *pool = cwq->pool;
924 923
925 /* we own @work, set data and link */ 924 /* we own @work, set data and link */
926 set_work_cwq(work, cwq, extra_flags); 925 set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +939,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
940 */ 939 */
941 smp_mb(); 940 smp_mb();
942 941
943 if (__need_more_worker(gcwq)) 942 if (__need_more_worker(pool))
944 wake_up_worker(gcwq); 943 wake_up_worker(pool);
945} 944}
946 945
947/* 946/*
@@ -1043,7 +1042,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1043 if (likely(cwq->nr_active < cwq->max_active)) { 1042 if (likely(cwq->nr_active < cwq->max_active)) {
1044 trace_workqueue_activate_work(work); 1043 trace_workqueue_activate_work(work);
1045 cwq->nr_active++; 1044 cwq->nr_active++;
1046 worklist = gcwq_determine_ins_pos(gcwq, cwq); 1045 worklist = &cwq->pool->worklist;
1047 } else { 1046 } else {
1048 work_flags |= WORK_STRUCT_DELAYED; 1047 work_flags |= WORK_STRUCT_DELAYED;
1049 worklist = &cwq->delayed_works; 1048 worklist = &cwq->delayed_works;
@@ -1192,7 +1191,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1192 */ 1191 */
1193static void worker_enter_idle(struct worker *worker) 1192static void worker_enter_idle(struct worker *worker)
1194{ 1193{
1195 struct global_cwq *gcwq = worker->gcwq; 1194 struct worker_pool *pool = worker->pool;
1195 struct global_cwq *gcwq = pool->gcwq;
1196 1196
1197 BUG_ON(worker->flags & WORKER_IDLE); 1197 BUG_ON(worker->flags & WORKER_IDLE);
1198 BUG_ON(!list_empty(&worker->entry) && 1198 BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1200,24 @@ static void worker_enter_idle(struct worker *worker)
1200 1200
1201 /* can't use worker_set_flags(), also called from start_worker() */ 1201 /* can't use worker_set_flags(), also called from start_worker() */
1202 worker->flags |= WORKER_IDLE; 1202 worker->flags |= WORKER_IDLE;
1203 gcwq->nr_idle++; 1203 pool->nr_idle++;
1204 worker->last_active = jiffies; 1204 worker->last_active = jiffies;
1205 1205
1206 /* idle_list is LIFO */ 1206 /* idle_list is LIFO */
1207 list_add(&worker->entry, &gcwq->idle_list); 1207 list_add(&worker->entry, &pool->idle_list);
1208 1208
1209 if (likely(!(worker->flags & WORKER_ROGUE))) { 1209 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1210 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) 1210 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1211 mod_timer(&gcwq->idle_timer,
1212 jiffies + IDLE_WORKER_TIMEOUT);
1213 } else
1214 wake_up_all(&gcwq->trustee_wait);
1215 1211
1216 /* 1212 /*
1217 * Sanity check nr_running. Because trustee releases gcwq->lock 1213 * Sanity check nr_running. Because gcwq_unbind_fn() releases
1218 * between setting %WORKER_ROGUE and zapping nr_running, the 1214 * gcwq->lock between setting %WORKER_UNBOUND and zapping
1219 * warning may trigger spuriously. Check iff trustee is idle. 1215 * nr_running, the warning may trigger spuriously. Check iff
1216 * unbind is not in progress.
1220 */ 1217 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && 1218 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1222 gcwq->nr_workers == gcwq->nr_idle && 1219 pool->nr_workers == pool->nr_idle &&
1223 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1220 atomic_read(get_pool_nr_running(pool)));
1224} 1221}
1225 1222
1226/** 1223/**
@@ -1234,11 +1231,11 @@ static void worker_enter_idle(struct worker *worker)
1234 */ 1231 */
1235static void worker_leave_idle(struct worker *worker) 1232static void worker_leave_idle(struct worker *worker)
1236{ 1233{
1237 struct global_cwq *gcwq = worker->gcwq; 1234 struct worker_pool *pool = worker->pool;
1238 1235
1239 BUG_ON(!(worker->flags & WORKER_IDLE)); 1236 BUG_ON(!(worker->flags & WORKER_IDLE));
1240 worker_clr_flags(worker, WORKER_IDLE); 1237 worker_clr_flags(worker, WORKER_IDLE);
1241 gcwq->nr_idle--; 1238 pool->nr_idle--;
1242 list_del_init(&worker->entry); 1239 list_del_init(&worker->entry);
1243} 1240}
1244 1241
@@ -1258,11 +1255,11 @@ static void worker_leave_idle(struct worker *worker)
1258 * verbatim as it's best effort and blocking and gcwq may be 1255 * verbatim as it's best effort and blocking and gcwq may be
1259 * [dis]associated in the meantime. 1256 * [dis]associated in the meantime.
1260 * 1257 *
1261 * This function tries set_cpus_allowed() and locks gcwq and verifies 1258 * This function tries set_cpus_allowed() and locks gcwq and verifies the
1262 * the binding against GCWQ_DISASSOCIATED which is set during 1259 * binding against %GCWQ_DISASSOCIATED which is set during
1263 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters 1260 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1264 * idle state or fetches works without dropping lock, it can guarantee 1261 * enters idle state or fetches works without dropping lock, it can
1265 * the scheduling requirement described in the first paragraph. 1262 * guarantee the scheduling requirement described in the first paragraph.
1266 * 1263 *
1267 * CONTEXT: 1264 * CONTEXT:
1268 * Might sleep. Called without any lock but returns with gcwq->lock 1265 * Might sleep. Called without any lock but returns with gcwq->lock
@@ -1275,7 +1272,7 @@ static void worker_leave_idle(struct worker *worker)
1275static bool worker_maybe_bind_and_lock(struct worker *worker) 1272static bool worker_maybe_bind_and_lock(struct worker *worker)
1276__acquires(&gcwq->lock) 1273__acquires(&gcwq->lock)
1277{ 1274{
1278 struct global_cwq *gcwq = worker->gcwq; 1275 struct global_cwq *gcwq = worker->pool->gcwq;
1279 struct task_struct *task = worker->task; 1276 struct task_struct *task = worker->task;
1280 1277
1281 while (true) { 1278 while (true) {
@@ -1308,16 +1305,49 @@ __acquires(&gcwq->lock)
1308 } 1305 }
1309} 1306}
1310 1307
1308struct idle_rebind {
1309 int cnt; /* # workers to be rebound */
1310 struct completion done; /* all workers rebound */
1311};
1312
1311/* 1313/*
1312 * Function for worker->rebind_work used to rebind rogue busy workers 1314 * Rebind an idle @worker to its CPU. During CPU onlining, this has to
1313 * to the associated cpu which is coming back online. This is 1315 * happen synchronously for idle workers. worker_thread() will test
1314 * scheduled by cpu up but can race with other cpu hotplug operations 1316 * %WORKER_REBIND before leaving idle and call this function.
1315 * and may be executed twice without intervening cpu down.
1316 */ 1317 */
1317static void worker_rebind_fn(struct work_struct *work) 1318static void idle_worker_rebind(struct worker *worker)
1319{
1320 struct global_cwq *gcwq = worker->pool->gcwq;
1321
1322 /* CPU must be online at this point */
1323 WARN_ON(!worker_maybe_bind_and_lock(worker));
1324 if (!--worker->idle_rebind->cnt)
1325 complete(&worker->idle_rebind->done);
1326 spin_unlock_irq(&worker->pool->gcwq->lock);
1327
1328 /* we did our part, wait for rebind_workers() to finish up */
1329 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
1330
1331 /*
1332 * rebind_workers() shouldn't finish until all workers passed the
1333 * above WORKER_REBIND wait. Tell it when done.
1334 */
1335 spin_lock_irq(&worker->pool->gcwq->lock);
1336 if (!--worker->idle_rebind->cnt)
1337 complete(&worker->idle_rebind->done);
1338 spin_unlock_irq(&worker->pool->gcwq->lock);
1339}
1340
1341/*
1342 * Function for @worker->rebind.work used to rebind unbound busy workers to
1343 * the associated cpu which is coming back online. This is scheduled by
1344 * cpu up but can race with other cpu hotplug operations and may be
1345 * executed twice without intervening cpu down.
1346 */
1347static void busy_worker_rebind_fn(struct work_struct *work)
1318{ 1348{
1319 struct worker *worker = container_of(work, struct worker, rebind_work); 1349 struct worker *worker = container_of(work, struct worker, rebind_work);
1320 struct global_cwq *gcwq = worker->gcwq; 1350 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1351
1322 if (worker_maybe_bind_and_lock(worker)) 1352 if (worker_maybe_bind_and_lock(worker))
1323 worker_clr_flags(worker, WORKER_REBIND); 1353 worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1355,133 @@ static void worker_rebind_fn(struct work_struct *work)
1325 spin_unlock_irq(&gcwq->lock); 1355 spin_unlock_irq(&gcwq->lock);
1326} 1356}
1327 1357
1358/**
1359 * rebind_workers - rebind all workers of a gcwq to the associated CPU
1360 * @gcwq: gcwq of interest
1361 *
1362 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1363 * is different for idle and busy ones.
1364 *
1365 * The idle ones should be rebound synchronously and idle rebinding should
1366 * be complete before any worker starts executing work items with
1367 * concurrency management enabled; otherwise, scheduler may oops trying to
1368 * wake up non-local idle worker from wq_worker_sleeping().
1369 *
1370 * This is achieved by repeatedly requesting rebinding until all idle
1371 * workers are known to have been rebound under @gcwq->lock and holding all
1372 * idle workers from becoming busy until idle rebinding is complete.
1373 *
1374 * Once idle workers are rebound, busy workers can be rebound as they
1375 * finish executing their current work items. Queueing the rebind work at
1376 * the head of their scheduled lists is enough. Note that nr_running will
 1377 * be properly bumped as busy workers rebind.
1378 *
1379 * On return, all workers are guaranteed to either be bound or have rebind
1380 * work item scheduled.
1381 */
1382static void rebind_workers(struct global_cwq *gcwq)
1383 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1384{
1385 struct idle_rebind idle_rebind;
1386 struct worker_pool *pool;
1387 struct worker *worker;
1388 struct hlist_node *pos;
1389 int i;
1390
1391 lockdep_assert_held(&gcwq->lock);
1392
1393 for_each_worker_pool(pool, gcwq)
1394 lockdep_assert_held(&pool->manager_mutex);
1395
1396 /*
1397 * Rebind idle workers. Interlocked both ways. We wait for
1398 * workers to rebind via @idle_rebind.done. Workers will wait for
1399 * us to finish up by watching %WORKER_REBIND.
1400 */
1401 init_completion(&idle_rebind.done);
1402retry:
1403 idle_rebind.cnt = 1;
1404 INIT_COMPLETION(idle_rebind.done);
1405
1406 /* set REBIND and kick idle ones, we'll wait for these later */
1407 for_each_worker_pool(pool, gcwq) {
1408 list_for_each_entry(worker, &pool->idle_list, entry) {
1409 unsigned long worker_flags = worker->flags;
1410
1411 if (worker->flags & WORKER_REBIND)
1412 continue;
1413
1414 /* morph UNBOUND to REBIND atomically */
1415 worker_flags &= ~WORKER_UNBOUND;
1416 worker_flags |= WORKER_REBIND;
1417 ACCESS_ONCE(worker->flags) = worker_flags;
1418
1419 idle_rebind.cnt++;
1420 worker->idle_rebind = &idle_rebind;
1421
1422 /* worker_thread() will call idle_worker_rebind() */
1423 wake_up_process(worker->task);
1424 }
1425 }
1426
1427 if (--idle_rebind.cnt) {
1428 spin_unlock_irq(&gcwq->lock);
1429 wait_for_completion(&idle_rebind.done);
1430 spin_lock_irq(&gcwq->lock);
1431 /* busy ones might have become idle while waiting, retry */
1432 goto retry;
1433 }
1434
1435 /* all idle workers are rebound, rebind busy workers */
1436 for_each_busy_worker(worker, i, pos, gcwq) {
1437 struct work_struct *rebind_work = &worker->rebind_work;
1438 unsigned long worker_flags = worker->flags;
1439
1440 /* morph UNBOUND to REBIND atomically */
1441 worker_flags &= ~WORKER_UNBOUND;
1442 worker_flags |= WORKER_REBIND;
1443 ACCESS_ONCE(worker->flags) = worker_flags;
1444
1445 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1446 work_data_bits(rebind_work)))
1447 continue;
1448
1449 /* wq doesn't matter, use the default one */
1450 debug_work_activate(rebind_work);
1451 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1452 worker->scheduled.next,
1453 work_color_to_flags(WORK_NO_COLOR));
1454 }
1455
1456 /*
1457 * All idle workers are rebound and waiting for %WORKER_REBIND to
1458 * be cleared inside idle_worker_rebind(). Clear and release.
1459 * Clearing %WORKER_REBIND from this foreign context is safe
1460 * because these workers are still guaranteed to be idle.
1461 *
1462 * We need to make sure all idle workers passed WORKER_REBIND wait
1463 * in idle_worker_rebind() before returning; otherwise, workers can
1464 * get stuck at the wait if hotplug cycle repeats.
1465 */
1466 idle_rebind.cnt = 1;
1467 INIT_COMPLETION(idle_rebind.done);
1468
1469 for_each_worker_pool(pool, gcwq) {
1470 list_for_each_entry(worker, &pool->idle_list, entry) {
1471 worker->flags &= ~WORKER_REBIND;
1472 idle_rebind.cnt++;
1473 }
1474 }
1475
1476 wake_up_all(&gcwq->rebind_hold);
1477
1478 if (--idle_rebind.cnt) {
1479 spin_unlock_irq(&gcwq->lock);
1480 wait_for_completion(&idle_rebind.done);
1481 spin_lock_irq(&gcwq->lock);
1482 }
1483}
1484
1328static struct worker *alloc_worker(void) 1485static struct worker *alloc_worker(void)
1329{ 1486{
1330 struct worker *worker; 1487 struct worker *worker;
@@ -1333,7 +1490,7 @@ static struct worker *alloc_worker(void)
1333 if (worker) { 1490 if (worker) {
1334 INIT_LIST_HEAD(&worker->entry); 1491 INIT_LIST_HEAD(&worker->entry);
1335 INIT_LIST_HEAD(&worker->scheduled); 1492 INIT_LIST_HEAD(&worker->scheduled);
1336 INIT_WORK(&worker->rebind_work, worker_rebind_fn); 1493 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1337 /* on creation a worker is in !idle && prep state */ 1494 /* on creation a worker is in !idle && prep state */
1338 worker->flags = WORKER_PREP; 1495 worker->flags = WORKER_PREP;
1339 } 1496 }
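
rebind_workers() above waits for the idle workers with a counted completion: the coordinator holds one reference of its own, each idle worker drops one from idle_worker_rebind(), and whichever side brings the count to zero completes the wait. A userspace pthreads analog of that handshake, with invented names (idle_rebind_sim, idle_worker) and the retry loop and WORKER_REBIND release steps deliberately left out:

#include <pthread.h>
#include <stdio.h>

struct idle_rebind_sim {
    int cnt;                    /* # parties still to check in */
    pthread_mutex_t lock;
    pthread_cond_t done;
};

static struct idle_rebind_sim rebind = {
    .cnt  = 0,
    .lock = PTHREAD_MUTEX_INITIALIZER,
    .done = PTHREAD_COND_INITIALIZER,
};

/* Each "idle worker" checks in once it has rebound itself. */
static void *idle_worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&rebind.lock);
    if (!--rebind.cnt)
        pthread_cond_signal(&rebind.done);
    pthread_mutex_unlock(&rebind.lock);
    return NULL;
}

int main(void)
{
    enum { NR_IDLE = 4 };
    pthread_t tid[NR_IDLE];
    int i;

    rebind.cnt = 1 + NR_IDLE;   /* one extra reference for the coordinator */

    for (i = 0; i < NR_IDLE; i++)
        pthread_create(&tid[i], NULL, idle_worker, NULL);

    /* drop our reference and wait until every worker has checked in */
    pthread_mutex_lock(&rebind.lock);
    rebind.cnt--;
    while (rebind.cnt)
        pthread_cond_wait(&rebind.done, &rebind.lock);
    pthread_mutex_unlock(&rebind.lock);

    printf("all idle workers rebound\n");

    for (i = 0; i < NR_IDLE; i++)
        pthread_join(tid[i], NULL);
    return 0;
}

The real function additionally retries the whole pass if new idle workers appeared while it slept, and repeats the count a second time to make sure every worker has passed its WORKER_REBIND wait before the flag is released.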
@@ -1342,10 +1499,9 @@ static struct worker *alloc_worker(void)
1342 1499
1343/** 1500/**
1344 * create_worker - create a new workqueue worker 1501 * create_worker - create a new workqueue worker
1345 * @gcwq: gcwq the new worker will belong to 1502 * @pool: pool the new worker will belong to
1346 * @bind: whether to set affinity to @cpu or not
1347 * 1503 *
1348 * Create a new worker which is bound to @gcwq. The returned worker 1504 * Create a new worker which is bound to @pool. The returned worker
1349 * can be started by calling start_worker() or destroyed using 1505 * can be started by calling start_worker() or destroyed using
1350 * destroy_worker(). 1506 * destroy_worker().
1351 * 1507 *
@@ -1355,16 +1511,17 @@ static struct worker *alloc_worker(void)
1355 * RETURNS: 1511 * RETURNS:
1356 * Pointer to the newly created worker. 1512 * Pointer to the newly created worker.
1357 */ 1513 */
1358static struct worker *create_worker(struct global_cwq *gcwq, bool bind) 1514static struct worker *create_worker(struct worker_pool *pool)
1359{ 1515{
1360 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; 1516 struct global_cwq *gcwq = pool->gcwq;
1517 const char *pri = worker_pool_pri(pool) ? "H" : "";
1361 struct worker *worker = NULL; 1518 struct worker *worker = NULL;
1362 int id = -1; 1519 int id = -1;
1363 1520
1364 spin_lock_irq(&gcwq->lock); 1521 spin_lock_irq(&gcwq->lock);
1365 while (ida_get_new(&gcwq->worker_ida, &id)) { 1522 while (ida_get_new(&pool->worker_ida, &id)) {
1366 spin_unlock_irq(&gcwq->lock); 1523 spin_unlock_irq(&gcwq->lock);
1367 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) 1524 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1368 goto fail; 1525 goto fail;
1369 spin_lock_irq(&gcwq->lock); 1526 spin_lock_irq(&gcwq->lock);
1370 } 1527 }
@@ -1374,38 +1531,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1374 if (!worker) 1531 if (!worker)
1375 goto fail; 1532 goto fail;
1376 1533
1377 worker->gcwq = gcwq; 1534 worker->pool = pool;
1378 worker->id = id; 1535 worker->id = id;
1379 1536
1380 if (!on_unbound_cpu) 1537 if (gcwq->cpu != WORK_CPU_UNBOUND)
1381 worker->task = kthread_create_on_node(worker_thread, 1538 worker->task = kthread_create_on_node(worker_thread,
1382 worker, 1539 worker, cpu_to_node(gcwq->cpu),
1383 cpu_to_node(gcwq->cpu), 1540 "kworker/%u:%d%s", gcwq->cpu, id, pri);
1384 "kworker/%u:%d", gcwq->cpu, id);
1385 else 1541 else
1386 worker->task = kthread_create(worker_thread, worker, 1542 worker->task = kthread_create(worker_thread, worker,
1387 "kworker/u:%d", id); 1543 "kworker/u:%d%s", id, pri);
1388 if (IS_ERR(worker->task)) 1544 if (IS_ERR(worker->task))
1389 goto fail; 1545 goto fail;
1390 1546
1547 if (worker_pool_pri(pool))
1548 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1549
1391 /* 1550 /*
1392 * A rogue worker will become a regular one if CPU comes 1551 * Determine CPU binding of the new worker depending on
1393 * online later on. Make sure every worker has 1552 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
1394 * PF_THREAD_BOUND set. 1553 * flag remains stable across this function. See the comments
1554 * above the flag definition for details.
1555 *
1556 * As an unbound worker may later become a regular one if CPU comes
1557 * online, make sure every worker has %PF_THREAD_BOUND set.
1395 */ 1558 */
1396 if (bind && !on_unbound_cpu) 1559 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
1397 kthread_bind(worker->task, gcwq->cpu); 1560 kthread_bind(worker->task, gcwq->cpu);
1398 else { 1561 } else {
1399 worker->task->flags |= PF_THREAD_BOUND; 1562 worker->task->flags |= PF_THREAD_BOUND;
1400 if (on_unbound_cpu) 1563 worker->flags |= WORKER_UNBOUND;
1401 worker->flags |= WORKER_UNBOUND;
1402 } 1564 }
1403 1565
1404 return worker; 1566 return worker;
1405fail: 1567fail:
1406 if (id >= 0) { 1568 if (id >= 0) {
1407 spin_lock_irq(&gcwq->lock); 1569 spin_lock_irq(&gcwq->lock);
1408 ida_remove(&gcwq->worker_ida, id); 1570 ida_remove(&pool->worker_ida, id);
1409 spin_unlock_irq(&gcwq->lock); 1571 spin_unlock_irq(&gcwq->lock);
1410 } 1572 }
1411 kfree(worker); 1573 kfree(worker);
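A side effect of the per-gcwq pool split is visible in thread names: workers of the highpri pool get an "H" suffix, e.g. "kworker/3:0H". A trivial stand-alone illustration of how that format string expands (the values are made up):

#include <stdio.h>

int main(void)
{
	unsigned int cpu = 3;			/* gcwq->cpu */
	int id = 0;				/* worker id from the pool's ida */
	int highpri = 1;			/* worker_pool_pri(pool) */
	const char *pri = highpri ? "H" : "";
	char name[32];

	/* same format create_worker() hands to kthread_create_on_node() */
	snprintf(name, sizeof(name), "kworker/%u:%d%s", cpu, id, pri);
	puts(name);				/* prints "kworker/3:0H" */
	return 0;
}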
@@ -1424,7 +1586,7 @@ fail:
1424static void start_worker(struct worker *worker) 1586static void start_worker(struct worker *worker)
1425{ 1587{
1426 worker->flags |= WORKER_STARTED; 1588 worker->flags |= WORKER_STARTED;
1427 worker->gcwq->nr_workers++; 1589 worker->pool->nr_workers++;
1428 worker_enter_idle(worker); 1590 worker_enter_idle(worker);
1429 wake_up_process(worker->task); 1591 wake_up_process(worker->task);
1430} 1592}
@@ -1440,7 +1602,8 @@ static void start_worker(struct worker *worker)
1440 */ 1602 */
1441static void destroy_worker(struct worker *worker) 1603static void destroy_worker(struct worker *worker)
1442{ 1604{
1443 struct global_cwq *gcwq = worker->gcwq; 1605 struct worker_pool *pool = worker->pool;
1606 struct global_cwq *gcwq = pool->gcwq;
1444 int id = worker->id; 1607 int id = worker->id;
1445 1608
1446 /* sanity check frenzy */ 1609 /* sanity check frenzy */
@@ -1448,9 +1611,9 @@ static void destroy_worker(struct worker *worker)
1448 BUG_ON(!list_empty(&worker->scheduled)); 1611 BUG_ON(!list_empty(&worker->scheduled));
1449 1612
1450 if (worker->flags & WORKER_STARTED) 1613 if (worker->flags & WORKER_STARTED)
1451 gcwq->nr_workers--; 1614 pool->nr_workers--;
1452 if (worker->flags & WORKER_IDLE) 1615 if (worker->flags & WORKER_IDLE)
1453 gcwq->nr_idle--; 1616 pool->nr_idle--;
1454 1617
1455 list_del_init(&worker->entry); 1618 list_del_init(&worker->entry);
1456 worker->flags |= WORKER_DIE; 1619 worker->flags |= WORKER_DIE;
@@ -1461,29 +1624,30 @@ static void destroy_worker(struct worker *worker)
1461 kfree(worker); 1624 kfree(worker);
1462 1625
1463 spin_lock_irq(&gcwq->lock); 1626 spin_lock_irq(&gcwq->lock);
1464 ida_remove(&gcwq->worker_ida, id); 1627 ida_remove(&pool->worker_ida, id);
1465} 1628}
1466 1629
1467static void idle_worker_timeout(unsigned long __gcwq) 1630static void idle_worker_timeout(unsigned long __pool)
1468{ 1631{
1469 struct global_cwq *gcwq = (void *)__gcwq; 1632 struct worker_pool *pool = (void *)__pool;
1633 struct global_cwq *gcwq = pool->gcwq;
1470 1634
1471 spin_lock_irq(&gcwq->lock); 1635 spin_lock_irq(&gcwq->lock);
1472 1636
1473 if (too_many_workers(gcwq)) { 1637 if (too_many_workers(pool)) {
1474 struct worker *worker; 1638 struct worker *worker;
1475 unsigned long expires; 1639 unsigned long expires;
1476 1640
1477 /* idle_list is kept in LIFO order, check the last one */ 1641 /* idle_list is kept in LIFO order, check the last one */
1478 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1642 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1479 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1643 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1480 1644
1481 if (time_before(jiffies, expires)) 1645 if (time_before(jiffies, expires))
1482 mod_timer(&gcwq->idle_timer, expires); 1646 mod_timer(&pool->idle_timer, expires);
1483 else { 1647 else {
1484 /* it's been idle for too long, wake up manager */ 1648 /* it's been idle for too long, wake up manager */
1485 gcwq->flags |= GCWQ_MANAGE_WORKERS; 1649 pool->flags |= POOL_MANAGE_WORKERS;
1486 wake_up_worker(gcwq); 1650 wake_up_worker(pool);
1487 } 1651 }
1488 } 1652 }
1489 1653
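idle_worker_timeout() relies on time_before() for its expiry check, which stays correct even when the tick counter wraps around. A self-contained sketch of the same wrap-safe comparison follows; the helper name is invented, while the kernel's own macro lives in <linux/jiffies.h>.

#include <stdio.h>

/* "a is strictly before b" on an unsigned tick counter, tolerating
 * wraparound -- the same idea as the kernel's time_before().
 */
static int ticks_before(unsigned long a, unsigned long b)
{
	return (long)(b - a) > 0;
}

int main(void)
{
	unsigned long jiffies = (unsigned long)-10;	/* counter about to wrap */
	unsigned long expires = jiffies + 25;		/* wraps past zero */

	/* wrap-aware check still reports the deadline as being in the future */
	printf("wrap-aware: %d\n", ticks_before(jiffies, expires));
	printf("naive:      %d\n", jiffies < expires);
	return 0;
}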
@@ -1500,7 +1664,7 @@ static bool send_mayday(struct work_struct *work)
1500 return false; 1664 return false;
1501 1665
1502 /* mayday mayday mayday */ 1666 /* mayday mayday mayday */
1503 cpu = cwq->gcwq->cpu; 1667 cpu = cwq->pool->gcwq->cpu;
1504 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1668 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1505 if (cpu == WORK_CPU_UNBOUND) 1669 if (cpu == WORK_CPU_UNBOUND)
1506 cpu = 0; 1670 cpu = 0;
@@ -1509,37 +1673,38 @@ static bool send_mayday(struct work_struct *work)
1509 return true; 1673 return true;
1510} 1674}
1511 1675
1512static void gcwq_mayday_timeout(unsigned long __gcwq) 1676static void gcwq_mayday_timeout(unsigned long __pool)
1513{ 1677{
1514 struct global_cwq *gcwq = (void *)__gcwq; 1678 struct worker_pool *pool = (void *)__pool;
1679 struct global_cwq *gcwq = pool->gcwq;
1515 struct work_struct *work; 1680 struct work_struct *work;
1516 1681
1517 spin_lock_irq(&gcwq->lock); 1682 spin_lock_irq(&gcwq->lock);
1518 1683
1519 if (need_to_create_worker(gcwq)) { 1684 if (need_to_create_worker(pool)) {
1520 /* 1685 /*
1521 * We've been trying to create a new worker but 1686 * We've been trying to create a new worker but
1522 * haven't been successful. We might be hitting an 1687 * haven't been successful. We might be hitting an
1523 * allocation deadlock. Send distress signals to 1688 * allocation deadlock. Send distress signals to
1524 * rescuers. 1689 * rescuers.
1525 */ 1690 */
1526 list_for_each_entry(work, &gcwq->worklist, entry) 1691 list_for_each_entry(work, &pool->worklist, entry)
1527 send_mayday(work); 1692 send_mayday(work);
1528 } 1693 }
1529 1694
1530 spin_unlock_irq(&gcwq->lock); 1695 spin_unlock_irq(&gcwq->lock);
1531 1696
1532 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); 1697 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1533} 1698}
1534 1699
1535/** 1700/**
1536 * maybe_create_worker - create a new worker if necessary 1701 * maybe_create_worker - create a new worker if necessary
1537 * @gcwq: gcwq to create a new worker for 1702 * @pool: pool to create a new worker for
1538 * 1703 *
1539 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to 1704 * Create a new worker for @pool if necessary. @pool is guaranteed to
1540 * have at least one idle worker on return from this function. If 1705 * have at least one idle worker on return from this function. If
1541 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 1706 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1542 * sent to all rescuers with works scheduled on @gcwq to resolve 1707 * sent to all rescuers with works scheduled on @pool to resolve
1543 * possible allocation deadlock. 1708 * possible allocation deadlock.
1544 * 1709 *
1545 * On return, need_to_create_worker() is guaranteed to be false and 1710 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +1719,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
1554 * false if no action was taken and gcwq->lock stayed locked, true 1719 * false if no action was taken and gcwq->lock stayed locked, true
1555 * otherwise. 1720 * otherwise.
1556 */ 1721 */
1557static bool maybe_create_worker(struct global_cwq *gcwq) 1722static bool maybe_create_worker(struct worker_pool *pool)
1558__releases(&gcwq->lock) 1723__releases(&gcwq->lock)
1559__acquires(&gcwq->lock) 1724__acquires(&gcwq->lock)
1560{ 1725{
1561 if (!need_to_create_worker(gcwq)) 1726 struct global_cwq *gcwq = pool->gcwq;
1727
1728 if (!need_to_create_worker(pool))
1562 return false; 1729 return false;
1563restart: 1730restart:
1564 spin_unlock_irq(&gcwq->lock); 1731 spin_unlock_irq(&gcwq->lock);
1565 1732
1566 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1733 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1567 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1734 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1568 1735
1569 while (true) { 1736 while (true) {
1570 struct worker *worker; 1737 struct worker *worker;
1571 1738
1572 worker = create_worker(gcwq, true); 1739 worker = create_worker(pool);
1573 if (worker) { 1740 if (worker) {
1574 del_timer_sync(&gcwq->mayday_timer); 1741 del_timer_sync(&pool->mayday_timer);
1575 spin_lock_irq(&gcwq->lock); 1742 spin_lock_irq(&gcwq->lock);
1576 start_worker(worker); 1743 start_worker(worker);
1577 BUG_ON(need_to_create_worker(gcwq)); 1744 BUG_ON(need_to_create_worker(pool));
1578 return true; 1745 return true;
1579 } 1746 }
1580 1747
1581 if (!need_to_create_worker(gcwq)) 1748 if (!need_to_create_worker(pool))
1582 break; 1749 break;
1583 1750
1584 __set_current_state(TASK_INTERRUPTIBLE); 1751 __set_current_state(TASK_INTERRUPTIBLE);
1585 schedule_timeout(CREATE_COOLDOWN); 1752 schedule_timeout(CREATE_COOLDOWN);
1586 1753
1587 if (!need_to_create_worker(gcwq)) 1754 if (!need_to_create_worker(pool))
1588 break; 1755 break;
1589 } 1756 }
1590 1757
1591 del_timer_sync(&gcwq->mayday_timer); 1758 del_timer_sync(&pool->mayday_timer);
1592 spin_lock_irq(&gcwq->lock); 1759 spin_lock_irq(&gcwq->lock);
1593 if (need_to_create_worker(gcwq)) 1760 if (need_to_create_worker(pool))
1594 goto restart; 1761 goto restart;
1595 return true; 1762 return true;
1596} 1763}
1597 1764
1598/** 1765/**
1599 * maybe_destroy_worker - destroy workers which have been idle for a while 1766 * maybe_destroy_worker - destroy workers which have been idle for a while
1600 * @gcwq: gcwq to destroy workers for 1767 * @pool: pool to destroy workers for
1601 * 1768 *
1602 * Destroy @gcwq workers which have been idle for longer than 1769 * Destroy @pool workers which have been idle for longer than
1603 * IDLE_WORKER_TIMEOUT. 1770 * IDLE_WORKER_TIMEOUT.
1604 * 1771 *
1605 * LOCKING: 1772 * LOCKING:
@@ -1610,19 +1777,19 @@ restart:
1610 * false if no action was taken and gcwq->lock stayed locked, true 1777 * false if no action was taken and gcwq->lock stayed locked, true
1611 * otherwise. 1778 * otherwise.
1612 */ 1779 */
1613static bool maybe_destroy_workers(struct global_cwq *gcwq) 1780static bool maybe_destroy_workers(struct worker_pool *pool)
1614{ 1781{
1615 bool ret = false; 1782 bool ret = false;
1616 1783
1617 while (too_many_workers(gcwq)) { 1784 while (too_many_workers(pool)) {
1618 struct worker *worker; 1785 struct worker *worker;
1619 unsigned long expires; 1786 unsigned long expires;
1620 1787
1621 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1788 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1622 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1789 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1623 1790
1624 if (time_before(jiffies, expires)) { 1791 if (time_before(jiffies, expires)) {
1625 mod_timer(&gcwq->idle_timer, expires); 1792 mod_timer(&pool->idle_timer, expires);
1626 break; 1793 break;
1627 } 1794 }
1628 1795
@@ -1655,31 +1822,59 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
1655 */ 1822 */
1656static bool manage_workers(struct worker *worker) 1823static bool manage_workers(struct worker *worker)
1657{ 1824{
1658 struct global_cwq *gcwq = worker->gcwq; 1825 struct worker_pool *pool = worker->pool;
1659 bool ret = false; 1826 bool ret = false;
1660 1827
1661 if (gcwq->flags & GCWQ_MANAGING_WORKERS) 1828 if (pool->flags & POOL_MANAGING_WORKERS)
1662 return ret; 1829 return ret;
1663 1830
1664 gcwq->flags &= ~GCWQ_MANAGE_WORKERS; 1831 pool->flags |= POOL_MANAGING_WORKERS;
1665 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1666 1832
1667 /* 1833 /*
1668 * Destroy and then create so that may_start_working() is true 1834 * To simplify both worker management and CPU hotplug, hold off
1669 * on return. 1835 * management while hotplug is in progress. CPU hotplug path can't
1836 * grab %POOL_MANAGING_WORKERS to achieve this because that can
1837 * lead to idle worker depletion (all become busy thinking someone
1838 * else is managing) which in turn can result in deadlock under
1839 * extreme circumstances. Use @pool->manager_mutex to synchronize
1840 * manager against CPU hotplug.
1841 *
1842 * manager_mutex would always be free unless CPU hotplug is in
1843 * progress. trylock first without dropping @gcwq->lock.
1670 */ 1844 */
1671 ret |= maybe_destroy_workers(gcwq); 1845 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
1672 ret |= maybe_create_worker(gcwq); 1846 spin_unlock_irq(&pool->gcwq->lock);
1847 mutex_lock(&pool->manager_mutex);
1848 /*
1849 * CPU hotplug could have happened while we were waiting
1850 * for manager_mutex. Hotplug itself can't handle us
1851 * because manager isn't either on idle or busy list, and
1852 * @gcwq's state and ours could have deviated.
1853 *
1854 * As hotplug is now excluded via manager_mutex, we can
1855 * simply try to bind. It will succeed or fail depending
1856 * on @gcwq's current state. Try it and adjust
1857 * %WORKER_UNBOUND accordingly.
1858 */
1859 if (worker_maybe_bind_and_lock(worker))
1860 worker->flags &= ~WORKER_UNBOUND;
1861 else
1862 worker->flags |= WORKER_UNBOUND;
1673 1863
1674 gcwq->flags &= ~GCWQ_MANAGING_WORKERS; 1864 ret = true;
1865 }
1866
1867 pool->flags &= ~POOL_MANAGE_WORKERS;
1675 1868
1676 /* 1869 /*
1677 * The trustee might be waiting to take over the manager 1870 * Destroy and then create so that may_start_working() is true
1678 * position, tell it we're done. 1871 * on return.
1679 */ 1872 */
1680 if (unlikely(gcwq->trustee)) 1873 ret |= maybe_destroy_workers(pool);
1681 wake_up_all(&gcwq->trustee_wait); 1874 ret |= maybe_create_worker(pool);
1682 1875
1876 pool->flags &= ~POOL_MANAGING_WORKERS;
1877 mutex_unlock(&pool->manager_mutex);
1683 return ret; 1878 return ret;
1684} 1879}
1685 1880
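The manager_mutex handling above follows a common fast-path/slow-path shape: try the sleeping lock opportunistically while the spinlock is held, and only on contention drop the spinlock, block, and treat the world as possibly changed afterwards. A hedged userspace analogue with pthreads (the names and return convention are invented for the sketch):

#include <pthread.h>
#include <stdbool.h>

static pthread_spinlock_t gcwq_lock;
static pthread_mutex_t manager_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Returns true when the spinlock had to be dropped, i.e. the caller
 * must re-read any state derived while holding it -- the analogue of
 * manage_workers() setting ret = true on the slow path.
 */
static bool claim_manager(void)
{
	if (pthread_mutex_trylock(&manager_mutex) == 0)
		return false;			/* fast path: uncontended */

	pthread_spin_unlock(&gcwq_lock);	/* can't sleep under a spinlock */
	pthread_mutex_lock(&manager_mutex);
	pthread_spin_lock(&gcwq_lock);
	return true;				/* state may have changed */
}

int main(void)
{
	pthread_spin_init(&gcwq_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_lock(&gcwq_lock);

	bool recheck = claim_manager();
	(void)recheck;		/* in the kernel this drives the UNBOUND fixup */

	pthread_mutex_unlock(&manager_mutex);
	pthread_spin_unlock(&gcwq_lock);
	return 0;
}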
@@ -1728,10 +1923,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1728{ 1923{
1729 struct work_struct *work = list_first_entry(&cwq->delayed_works, 1924 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1730 struct work_struct, entry); 1925 struct work_struct, entry);
1731 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1732 1926
1733 trace_workqueue_activate_work(work); 1927 trace_workqueue_activate_work(work);
1734 move_linked_works(work, pos, NULL); 1928 move_linked_works(work, &cwq->pool->worklist, NULL);
1735 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1929 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1736 cwq->nr_active++; 1930 cwq->nr_active++;
1737} 1931}
@@ -1804,7 +1998,8 @@ __releases(&gcwq->lock)
1804__acquires(&gcwq->lock) 1998__acquires(&gcwq->lock)
1805{ 1999{
1806 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 2000 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1807 struct global_cwq *gcwq = cwq->gcwq; 2001 struct worker_pool *pool = worker->pool;
2002 struct global_cwq *gcwq = pool->gcwq;
1808 struct hlist_head *bwh = busy_worker_head(gcwq, work); 2003 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1809 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; 2004 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1810 work_func_t f = work->func; 2005 work_func_t f = work->func;
@@ -1823,6 +2018,15 @@ __acquires(&gcwq->lock)
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 2018 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1824#endif 2019#endif
1825 /* 2020 /*
2021 * Ensure we're on the correct CPU. DISASSOCIATED test is
2022 * necessary to avoid spurious warnings from rescuers servicing the
2023 * unbound or a disassociated gcwq.
2024 */
2025 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
2026 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2027 raw_smp_processor_id() != gcwq->cpu);
2028
2029 /*
1826 * A single work shouldn't be executed concurrently by 2030 * A single work shouldn't be executed concurrently by
1827 * multiple workers on a single cpu. Check whether anyone is 2031 * multiple workers on a single cpu. Check whether anyone is
1828 * already processing the work. If so, defer the work to the 2032 * already processing the work. If so, defer the work to the
@@ -1846,27 +2050,19 @@ __acquires(&gcwq->lock)
1846 list_del_init(&work->entry); 2050 list_del_init(&work->entry);
1847 2051
1848 /* 2052 /*
1849 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1850 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1851 */
1852 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1853 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1854 struct work_struct, entry);
1855
1856 if (!list_empty(&gcwq->worklist) &&
1857 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1858 wake_up_worker(gcwq);
1859 else
1860 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1861 }
1862
1863 /*
1864 * CPU intensive works don't participate in concurrency 2053 * CPU intensive works don't participate in concurrency
1865 * management. They're the scheduler's responsibility. 2054 * management. They're the scheduler's responsibility.
1866 */ 2055 */
1867 if (unlikely(cpu_intensive)) 2056 if (unlikely(cpu_intensive))
1868 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 2057 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1869 2058
2059 /*
2060 * Unbound gcwq isn't concurrency managed and work items should be
2061 * executed ASAP. Wake up another worker if necessary.
2062 */
2063 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2064 wake_up_worker(pool);
2065
1870 spin_unlock_irq(&gcwq->lock); 2066 spin_unlock_irq(&gcwq->lock);
1871 2067
1872 work_clear_pending(work); 2068 work_clear_pending(work);
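The new WARN_ON_ONCE() encodes the invariant that an associated, bound worker must execute on its gcwq's CPU. The Linux-specific userspace snippet below pins a thread and checks the equivalent condition; it only illustrates the invariant, not the kernel check itself.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <assert.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu = 0;			/* stand-in for gcwq->cpu */

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (pthread_setaffinity_np(pthread_self(), sizeof(set), &set))
		return 1;

	/* a bound worker that isn't UNBOUND/REBIND must be on its CPU */
	assert(sched_getcpu() == cpu);
	printf("running on cpu %d as expected\n", sched_getcpu());
	return 0;
}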
@@ -1939,28 +2135,38 @@ static void process_scheduled_works(struct worker *worker)
1939static int worker_thread(void *__worker) 2135static int worker_thread(void *__worker)
1940{ 2136{
1941 struct worker *worker = __worker; 2137 struct worker *worker = __worker;
1942 struct global_cwq *gcwq = worker->gcwq; 2138 struct worker_pool *pool = worker->pool;
2139 struct global_cwq *gcwq = pool->gcwq;
1943 2140
1944 /* tell the scheduler that this is a workqueue worker */ 2141 /* tell the scheduler that this is a workqueue worker */
1945 worker->task->flags |= PF_WQ_WORKER; 2142 worker->task->flags |= PF_WQ_WORKER;
1946woke_up: 2143woke_up:
1947 spin_lock_irq(&gcwq->lock); 2144 spin_lock_irq(&gcwq->lock);
1948 2145
1949 /* DIE can be set only while we're idle, checking here is enough */ 2146 /*
1950 if (worker->flags & WORKER_DIE) { 2147 * DIE can be set only while idle and REBIND set while busy has
2148 * @worker->rebind_work scheduled. Checking here is enough.
2149 */
2150 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
1951 spin_unlock_irq(&gcwq->lock); 2151 spin_unlock_irq(&gcwq->lock);
1952 worker->task->flags &= ~PF_WQ_WORKER; 2152
1953 return 0; 2153 if (worker->flags & WORKER_DIE) {
2154 worker->task->flags &= ~PF_WQ_WORKER;
2155 return 0;
2156 }
2157
2158 idle_worker_rebind(worker);
2159 goto woke_up;
1954 } 2160 }
1955 2161
1956 worker_leave_idle(worker); 2162 worker_leave_idle(worker);
1957recheck: 2163recheck:
1958 /* no more worker necessary? */ 2164 /* no more worker necessary? */
1959 if (!need_more_worker(gcwq)) 2165 if (!need_more_worker(pool))
1960 goto sleep; 2166 goto sleep;
1961 2167
1962 /* do we need to manage? */ 2168 /* do we need to manage? */
1963 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) 2169 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
1964 goto recheck; 2170 goto recheck;
1965 2171
1966 /* 2172 /*
@@ -1979,7 +2185,7 @@ recheck:
1979 2185
1980 do { 2186 do {
1981 struct work_struct *work = 2187 struct work_struct *work =
1982 list_first_entry(&gcwq->worklist, 2188 list_first_entry(&pool->worklist,
1983 struct work_struct, entry); 2189 struct work_struct, entry);
1984 2190
1985 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 2191 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2197,11 @@ recheck:
1991 move_linked_works(work, &worker->scheduled, NULL); 2197 move_linked_works(work, &worker->scheduled, NULL);
1992 process_scheduled_works(worker); 2198 process_scheduled_works(worker);
1993 } 2199 }
1994 } while (keep_working(gcwq)); 2200 } while (keep_working(pool));
1995 2201
1996 worker_set_flags(worker, WORKER_PREP, false); 2202 worker_set_flags(worker, WORKER_PREP, false);
1997sleep: 2203sleep:
1998 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) 2204 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
1999 goto recheck; 2205 goto recheck;
2000 2206
2001 /* 2207 /*
@@ -2053,14 +2259,15 @@ repeat:
2053 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2259 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2054 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2260 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2055 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2261 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2056 struct global_cwq *gcwq = cwq->gcwq; 2262 struct worker_pool *pool = cwq->pool;
2263 struct global_cwq *gcwq = pool->gcwq;
2057 struct work_struct *work, *n; 2264 struct work_struct *work, *n;
2058 2265
2059 __set_current_state(TASK_RUNNING); 2266 __set_current_state(TASK_RUNNING);
2060 mayday_clear_cpu(cpu, wq->mayday_mask); 2267 mayday_clear_cpu(cpu, wq->mayday_mask);
2061 2268
2062 /* migrate to the target cpu if possible */ 2269 /* migrate to the target cpu if possible */
2063 rescuer->gcwq = gcwq; 2270 rescuer->pool = pool;
2064 worker_maybe_bind_and_lock(rescuer); 2271 worker_maybe_bind_and_lock(rescuer);
2065 2272
2066 /* 2273 /*
@@ -2068,7 +2275,7 @@ repeat:
2068 * process'em. 2275 * process'em.
2069 */ 2276 */
2070 BUG_ON(!list_empty(&rescuer->scheduled)); 2277 BUG_ON(!list_empty(&rescuer->scheduled));
2071 list_for_each_entry_safe(work, n, &gcwq->worklist, entry) 2278 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2072 if (get_work_cwq(work) == cwq) 2279 if (get_work_cwq(work) == cwq)
2073 move_linked_works(work, scheduled, &n); 2280 move_linked_works(work, scheduled, &n);
2074 2281
@@ -2079,8 +2286,8 @@ repeat:
2079 * regular worker; otherwise, we end up with 0 concurrency 2286 * regular worker; otherwise, we end up with 0 concurrency
2080 * and stalling the execution. 2287 * and stalling the execution.
2081 */ 2288 */
2082 if (keep_working(gcwq)) 2289 if (keep_working(pool))
2083 wake_up_worker(gcwq); 2290 wake_up_worker(pool);
2084 2291
2085 spin_unlock_irq(&gcwq->lock); 2292 spin_unlock_irq(&gcwq->lock);
2086 } 2293 }
@@ -2205,7 +2412,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2205 2412
2206 for_each_cwq_cpu(cpu, wq) { 2413 for_each_cwq_cpu(cpu, wq) {
2207 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2208 struct global_cwq *gcwq = cwq->gcwq; 2415 struct global_cwq *gcwq = cwq->pool->gcwq;
2209 2416
2210 spin_lock_irq(&gcwq->lock); 2417 spin_lock_irq(&gcwq->lock);
2211 2418
@@ -2421,9 +2628,9 @@ reflush:
2421 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2628 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2422 bool drained; 2629 bool drained;
2423 2630
2424 spin_lock_irq(&cwq->gcwq->lock); 2631 spin_lock_irq(&cwq->pool->gcwq->lock);
2425 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2632 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2426 spin_unlock_irq(&cwq->gcwq->lock); 2633 spin_unlock_irq(&cwq->pool->gcwq->lock);
2427 2634
2428 if (drained) 2635 if (drained)
2429 continue; 2636 continue;
@@ -2463,7 +2670,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2463 */ 2670 */
2464 smp_rmb(); 2671 smp_rmb();
2465 cwq = get_work_cwq(work); 2672 cwq = get_work_cwq(work);
2466 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2673 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2467 goto already_gone; 2674 goto already_gone;
2468 } else if (wait_executing) { 2675 } else if (wait_executing) {
2469 worker = find_worker_executing_work(gcwq, work); 2676 worker = find_worker_executing_work(gcwq, work);
@@ -2984,13 +3191,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2984 if (flags & WQ_MEM_RECLAIM) 3191 if (flags & WQ_MEM_RECLAIM)
2985 flags |= WQ_RESCUER; 3192 flags |= WQ_RESCUER;
2986 3193
2987 /*
2988 * Unbound workqueues aren't concurrency managed and should be
2989 * dispatched to workers immediately.
2990 */
2991 if (flags & WQ_UNBOUND)
2992 flags |= WQ_HIGHPRI;
2993
2994 max_active = max_active ?: WQ_DFL_ACTIVE; 3194 max_active = max_active ?: WQ_DFL_ACTIVE;
2995 max_active = wq_clamp_max_active(max_active, flags, wq->name); 3195 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2996 3196
@@ -3011,9 +3211,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3011 for_each_cwq_cpu(cpu, wq) { 3211 for_each_cwq_cpu(cpu, wq) {
3012 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3212 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3013 struct global_cwq *gcwq = get_gcwq(cpu); 3213 struct global_cwq *gcwq = get_gcwq(cpu);
3214 int pool_idx = (bool)(flags & WQ_HIGHPRI);
3014 3215
3015 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3216 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3016 cwq->gcwq = gcwq; 3217 cwq->pool = &gcwq->pools[pool_idx];
3017 cwq->wq = wq; 3218 cwq->wq = wq;
3018 cwq->flush_color = -1; 3219 cwq->flush_color = -1;
3019 cwq->max_active = max_active; 3220 cwq->max_active = max_active;
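With HIGHPRI now implemented as a separate worker pool rather than a queueing rule, each cwq simply indexes into a two-element pool array. A minimal illustration of that flag-to-index mapping; the structures are pared down and the flag value is illustrative, not taken from the headers.

#include <stdbool.h>
#include <stdio.h>

#define WQ_HIGHPRI (1 << 4)		/* flag value is illustrative */

struct worker_pool { const char *name; };

struct global_cwq {
	struct worker_pool pools[2];	/* [0] normal, [1] highpri */
};

int main(void)
{
	struct global_cwq gcwq = {
		.pools = { { "normal" }, { "highpri" } },
	};
	unsigned int flags = WQ_HIGHPRI;

	/* the (bool) cast collapses any non-zero bit test to exactly 1 */
	int pool_idx = (bool)(flags & WQ_HIGHPRI);

	printf("work items go to the %s pool\n", gcwq.pools[pool_idx].name);
	return 0;
}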
@@ -3225,369 +3426,143 @@ EXPORT_SYMBOL_GPL(work_busy);
3225 * gcwqs serve mix of short, long and very long running works making 3426 * gcwqs serve mix of short, long and very long running works making
3226 * blocked draining impractical. 3427 * blocked draining impractical.
3227 * 3428 *
3228 * This is solved by allowing a gcwq to be detached from CPU, running 3429 * This is solved by allowing a gcwq to be disassociated from the CPU
3229 * it with unbound (rogue) workers and allowing it to be reattached 3430 * running as an unbound one and allowing it to be reattached later if the
3230 * later if the cpu comes back online. A separate thread is created 3431 * cpu comes back online.
3231 * to govern a gcwq in such state and is called the trustee of the
3232 * gcwq.
3233 *
3234 * Trustee states and their descriptions.
3235 *
3236 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3237 * new trustee is started with this state.
3238 *
3239 * IN_CHARGE Once started, trustee will enter this state after
3240 * assuming the manager role and making all existing
3241 * workers rogue. DOWN_PREPARE waits for trustee to
3242 * enter this state. After reaching IN_CHARGE, trustee
3243 * tries to execute the pending worklist until it's empty
3244 * and the state is set to BUTCHER, or the state is set
3245 * to RELEASE.
3246 *
3247 * BUTCHER Command state which is set by the cpu callback after
3248 * the cpu has went down. Once this state is set trustee
3249 * knows that there will be no new works on the worklist
3250 * and once the worklist is empty it can proceed to
3251 * killing idle workers.
3252 *
3253 * RELEASE Command state which is set by the cpu callback if the
3254 * cpu down has been canceled or it has come online
3255 * again. After recognizing this state, trustee stops
3256 * trying to drain or butcher and clears ROGUE, rebinds
3257 * all remaining workers back to the cpu and releases
3258 * manager role.
3259 *
3260 * DONE Trustee will enter this state after BUTCHER or RELEASE
3261 * is complete.
3262 *
3263 * trustee CPU draining
3264 * took over down complete
3265 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3266 * | | ^
3267 * | CPU is back online v return workers |
3268 * ----------------> RELEASE --------------
3269 */ 3432 */
3270 3433
3271/** 3434/* claim manager positions of all pools */
3272 * trustee_wait_event_timeout - timed event wait for trustee 3435static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
3273 * @cond: condition to wait for
3274 * @timeout: timeout in jiffies
3275 *
3276 * wait_event_timeout() for trustee to use. Handles locking and
3277 * checks for RELEASE request.
3278 *
3279 * CONTEXT:
3280 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3281 * multiple times. To be used by trustee.
3282 *
3283 * RETURNS:
3284 * Positive indicating left time if @cond is satisfied, 0 if timed
3285 * out, -1 if canceled.
3286 */
3287#define trustee_wait_event_timeout(cond, timeout) ({ \
3288 long __ret = (timeout); \
3289 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3290 __ret) { \
3291 spin_unlock_irq(&gcwq->lock); \
3292 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3293 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3294 __ret); \
3295 spin_lock_irq(&gcwq->lock); \
3296 } \
3297 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3298})
3299
3300/**
3301 * trustee_wait_event - event wait for trustee
3302 * @cond: condition to wait for
3303 *
3304 * wait_event() for trustee to use. Automatically handles locking and
3305 * checks for CANCEL request.
3306 *
3307 * CONTEXT:
3308 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3309 * multiple times. To be used by trustee.
3310 *
3311 * RETURNS:
3312 * 0 if @cond is satisfied, -1 if canceled.
3313 */
3314#define trustee_wait_event(cond) ({ \
3315 long __ret1; \
3316 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3317 __ret1 < 0 ? -1 : 0; \
3318})
3319
3320static int __cpuinit trustee_thread(void *__gcwq)
3321{ 3436{
3322 struct global_cwq *gcwq = __gcwq; 3437 struct worker_pool *pool;
3323 struct worker *worker;
3324 struct work_struct *work;
3325 struct hlist_node *pos;
3326 long rc;
3327 int i;
3328
3329 BUG_ON(gcwq->cpu != smp_processor_id());
3330 3438
3439 for_each_worker_pool(pool, gcwq)
3440 mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
3331 spin_lock_irq(&gcwq->lock); 3441 spin_lock_irq(&gcwq->lock);
3332 /* 3442}
3333 * Claim the manager position and make all workers rogue.
3334 * Trustee must be bound to the target cpu and can't be
3335 * cancelled.
3336 */
3337 BUG_ON(gcwq->cpu != smp_processor_id());
3338 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3339 BUG_ON(rc < 0);
3340
3341 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3342
3343 list_for_each_entry(worker, &gcwq->idle_list, entry)
3344 worker->flags |= WORKER_ROGUE;
3345 3443
3346 for_each_busy_worker(worker, i, pos, gcwq) 3444/* release manager positions */
3347 worker->flags |= WORKER_ROGUE; 3445static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
3446{
3447 struct worker_pool *pool;
3348 3448
3349 /*
3350 * Call schedule() so that we cross rq->lock and thus can
3351 * guarantee sched callbacks see the rogue flag. This is
3352 * necessary as scheduler callbacks may be invoked from other
3353 * cpus.
3354 */
3355 spin_unlock_irq(&gcwq->lock); 3449 spin_unlock_irq(&gcwq->lock);
3356 schedule(); 3450 for_each_worker_pool(pool, gcwq)
3357 spin_lock_irq(&gcwq->lock); 3451 mutex_unlock(&pool->manager_mutex);
3452}
3358 3453
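gcwq_claim_management_and_lock() takes the per-pool manager mutexes in pool-array order before the gcwq lock; the fixed ordering is what keeps it deadlock-free, and the pool index doubles as the lockdep subclass for mutex_lock_nested(). A small userspace sketch of the same ordered acquisition, with the structures reduced to what the example needs:

#include <pthread.h>

#define NR_POOLS 2

struct worker_pool { pthread_mutex_t manager_mutex; };

struct global_cwq {
	pthread_mutex_t lock;			/* stand-in for the spinlock */
	struct worker_pool pools[NR_POOLS];
};

/* Claim every pool's manager mutex in array order, then the gcwq lock. */
static void claim_management_and_lock(struct global_cwq *gcwq)
{
	for (int i = 0; i < NR_POOLS; i++)
		pthread_mutex_lock(&gcwq->pools[i].manager_mutex);
	pthread_mutex_lock(&gcwq->lock);
}

static void release_management_and_unlock(struct global_cwq *gcwq)
{
	pthread_mutex_unlock(&gcwq->lock);
	for (int i = 0; i < NR_POOLS; i++)
		pthread_mutex_unlock(&gcwq->pools[i].manager_mutex);
}

int main(void)
{
	struct global_cwq gcwq;

	pthread_mutex_init(&gcwq.lock, NULL);
	for (int i = 0; i < NR_POOLS; i++)
		pthread_mutex_init(&gcwq.pools[i].manager_mutex, NULL);

	claim_management_and_lock(&gcwq);
	/* ... unbind or rebind workers of every pool here ... */
	release_management_and_unlock(&gcwq);
	return 0;
}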
3359 /* 3454static void gcwq_unbind_fn(struct work_struct *work)
3360 * Sched callbacks are disabled now. Zap nr_running. After 3455{
3361 * this, nr_running stays zero and need_more_worker() and 3456 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3362 * keep_working() are always true as long as the worklist is 3457 struct worker_pool *pool;
3363 * not empty. 3458 struct worker *worker;
3364 */ 3459 struct hlist_node *pos;
3365 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); 3460 int i;
3366 3461
3367 spin_unlock_irq(&gcwq->lock); 3462 BUG_ON(gcwq->cpu != smp_processor_id());
3368 del_timer_sync(&gcwq->idle_timer);
3369 spin_lock_irq(&gcwq->lock);
3370 3463
3371 /* 3464 gcwq_claim_management_and_lock(gcwq);
3372 * We're now in charge. Notify and proceed to drain. We need
3373 * to keep the gcwq running during the whole CPU down
3374 * procedure as other cpu hotunplug callbacks may need to
3375 * flush currently running tasks.
3376 */
3377 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3378 wake_up_all(&gcwq->trustee_wait);
3379 3465
3380 /* 3466 /*
3381 * The original cpu is in the process of dying and may go away 3467 * We've claimed all manager positions. Make all workers unbound
3382 * anytime now. When that happens, we and all workers would 3468 * and set DISASSOCIATED. Before this, all workers except for the
3383 * be migrated to other cpus. Try draining any left work. We 3469 * ones which are still executing works from before the last CPU
3384 * want to get it over with ASAP - spam rescuers, wake up as 3470 * down must be on the cpu. After this, they may become diasporas.
3385 * many idlers as necessary and create new ones till the
3386 * worklist is empty. Note that if the gcwq is frozen, there
3387 * may be frozen works in freezable cwqs. Don't declare
3388 * completion while frozen.
3389 */ 3471 */
3390 while (gcwq->nr_workers != gcwq->nr_idle || 3472 for_each_worker_pool(pool, gcwq)
3391 gcwq->flags & GCWQ_FREEZING || 3473 list_for_each_entry(worker, &pool->idle_list, entry)
3392 gcwq->trustee_state == TRUSTEE_IN_CHARGE) { 3474 worker->flags |= WORKER_UNBOUND;
3393 int nr_works = 0;
3394
3395 list_for_each_entry(work, &gcwq->worklist, entry) {
3396 send_mayday(work);
3397 nr_works++;
3398 }
3399 3475
3400 list_for_each_entry(worker, &gcwq->idle_list, entry) { 3476 for_each_busy_worker(worker, i, pos, gcwq)
3401 if (!nr_works--) 3477 worker->flags |= WORKER_UNBOUND;
3402 break;
3403 wake_up_process(worker->task);
3404 }
3405 3478
3406 if (need_to_create_worker(gcwq)) { 3479 gcwq->flags |= GCWQ_DISASSOCIATED;
3407 spin_unlock_irq(&gcwq->lock);
3408 worker = create_worker(gcwq, false);
3409 spin_lock_irq(&gcwq->lock);
3410 if (worker) {
3411 worker->flags |= WORKER_ROGUE;
3412 start_worker(worker);
3413 }
3414 }
3415 3480
3416 /* give a breather */ 3481 gcwq_release_management_and_unlock(gcwq);
3417 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3418 break;
3419 }
3420 3482
3421 /* 3483 /*
3422 * Either all works have been scheduled and cpu is down, or 3484 * Call schedule() so that we cross rq->lock and thus can guarantee
3423 * cpu down has already been canceled. Wait for and butcher 3485 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
3424 * all workers till we're canceled. 3486 * as scheduler callbacks may be invoked from other cpus.
3425 */ 3487 */
3426 do { 3488 schedule();
3427 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3428 while (!list_empty(&gcwq->idle_list))
3429 destroy_worker(list_first_entry(&gcwq->idle_list,
3430 struct worker, entry));
3431 } while (gcwq->nr_workers && rc >= 0);
3432 3489
3433 /* 3490 /*
3434 * At this point, either draining has completed and no worker 3491 * Sched callbacks are disabled now. Zap nr_running. After this,
3435 * is left, or cpu down has been canceled or the cpu is being 3492 * nr_running stays zero and need_more_worker() and keep_working()
3436 * brought back up. There shouldn't be any idle one left. 3493 * are always true as long as the worklist is not empty. @gcwq now
3437 * Tell the remaining busy ones to rebind once it finishes the 3494 * behaves as unbound (in terms of concurrency management) gcwq
3438 * currently scheduled works by scheduling the rebind_work. 3495 * which is served by workers tied to the CPU.
3496 *
3497 * On return from this function, the current worker would trigger
3498 * unbound chain execution of pending work items if other workers
3499 * didn't already.
3439 */ 3500 */
3440 WARN_ON(!list_empty(&gcwq->idle_list)); 3501 for_each_worker_pool(pool, gcwq)
3441 3502 atomic_set(get_pool_nr_running(pool), 0);
3442 for_each_busy_worker(worker, i, pos, gcwq) {
3443 struct work_struct *rebind_work = &worker->rebind_work;
3444
3445 /*
3446 * Rebind_work may race with future cpu hotplug
3447 * operations. Use a separate flag to mark that
3448 * rebinding is scheduled.
3449 */
3450 worker->flags |= WORKER_REBIND;
3451 worker->flags &= ~WORKER_ROGUE;
3452
3453 /* queue rebind_work, wq doesn't matter, use the default one */
3454 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3455 work_data_bits(rebind_work)))
3456 continue;
3457
3458 debug_work_activate(rebind_work);
3459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3460 worker->scheduled.next,
3461 work_color_to_flags(WORK_NO_COLOR));
3462 }
3463
3464 /* relinquish manager role */
3465 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3466
3467 /* notify completion */
3468 gcwq->trustee = NULL;
3469 gcwq->trustee_state = TRUSTEE_DONE;
3470 wake_up_all(&gcwq->trustee_wait);
3471 spin_unlock_irq(&gcwq->lock);
3472 return 0;
3473} 3503}
3474 3504
3475/** 3505/*
3476 * wait_trustee_state - wait for trustee to enter the specified state 3506 * Workqueues should be brought up before normal priority CPU notifiers.
3477 * @gcwq: gcwq the trustee of interest belongs to 3507 * This will be registered high priority CPU notifier.
3478 * @state: target state to wait for
3479 *
3480 * Wait for the trustee to reach @state. DONE is already matched.
3481 *
3482 * CONTEXT:
3483 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3484 * multiple times. To be used by cpu_callback.
3485 */ 3508 */
3486static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) 3509static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3487__releases(&gcwq->lock) 3510 unsigned long action,
3488__acquires(&gcwq->lock) 3511 void *hcpu)
3489{
3490 if (!(gcwq->trustee_state == state ||
3491 gcwq->trustee_state == TRUSTEE_DONE)) {
3492 spin_unlock_irq(&gcwq->lock);
3493 __wait_event(gcwq->trustee_wait,
3494 gcwq->trustee_state == state ||
3495 gcwq->trustee_state == TRUSTEE_DONE);
3496 spin_lock_irq(&gcwq->lock);
3497 }
3498}
3499
3500static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3501 unsigned long action,
3502 void *hcpu)
3503{ 3512{
3504 unsigned int cpu = (unsigned long)hcpu; 3513 unsigned int cpu = (unsigned long)hcpu;
3505 struct global_cwq *gcwq = get_gcwq(cpu); 3514 struct global_cwq *gcwq = get_gcwq(cpu);
3506 struct task_struct *new_trustee = NULL; 3515 struct worker_pool *pool;
3507 struct worker *uninitialized_var(new_worker);
3508 unsigned long flags;
3509
3510 action &= ~CPU_TASKS_FROZEN;
3511 3516
3512 switch (action) { 3517 switch (action & ~CPU_TASKS_FROZEN) {
3513 case CPU_DOWN_PREPARE:
3514 new_trustee = kthread_create(trustee_thread, gcwq,
3515 "workqueue_trustee/%d\n", cpu);
3516 if (IS_ERR(new_trustee))
3517 return notifier_from_errno(PTR_ERR(new_trustee));
3518 kthread_bind(new_trustee, cpu);
3519 /* fall through */
3520 case CPU_UP_PREPARE: 3518 case CPU_UP_PREPARE:
3521 BUG_ON(gcwq->first_idle); 3519 for_each_worker_pool(pool, gcwq) {
3522 new_worker = create_worker(gcwq, false); 3520 struct worker *worker;
3523 if (!new_worker) {
3524 if (new_trustee)
3525 kthread_stop(new_trustee);
3526 return NOTIFY_BAD;
3527 }
3528 }
3529
3530 /* some are called w/ irq disabled, don't disturb irq status */
3531 spin_lock_irqsave(&gcwq->lock, flags);
3532 3521
3533 switch (action) { 3522 if (pool->nr_workers)
3534 case CPU_DOWN_PREPARE: 3523 continue;
3535 /* initialize trustee and tell it to acquire the gcwq */
3536 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3537 gcwq->trustee = new_trustee;
3538 gcwq->trustee_state = TRUSTEE_START;
3539 wake_up_process(gcwq->trustee);
3540 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3541 /* fall through */
3542 case CPU_UP_PREPARE:
3543 BUG_ON(gcwq->first_idle);
3544 gcwq->first_idle = new_worker;
3545 break;
3546 3524
3547 case CPU_DYING: 3525 worker = create_worker(pool);
3548 /* 3526 if (!worker)
3549 * Before this, the trustee and all workers except for 3527 return NOTIFY_BAD;
3550 * the ones which are still executing works from
3551 * before the last CPU down must be on the cpu. After
3552 * this, they'll all be diasporas.
3553 */
3554 gcwq->flags |= GCWQ_DISASSOCIATED;
3555 break;
3556 3528
3557 case CPU_POST_DEAD: 3529 spin_lock_irq(&gcwq->lock);
3558 gcwq->trustee_state = TRUSTEE_BUTCHER; 3530 start_worker(worker);
3559 /* fall through */ 3531 spin_unlock_irq(&gcwq->lock);
3560 case CPU_UP_CANCELED: 3532 }
3561 destroy_worker(gcwq->first_idle);
3562 gcwq->first_idle = NULL;
3563 break; 3533 break;
3564 3534
3565 case CPU_DOWN_FAILED: 3535 case CPU_DOWN_FAILED:
3566 case CPU_ONLINE: 3536 case CPU_ONLINE:
3537 gcwq_claim_management_and_lock(gcwq);
3567 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3538 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3568 if (gcwq->trustee_state != TRUSTEE_DONE) { 3539 rebind_workers(gcwq);
3569 gcwq->trustee_state = TRUSTEE_RELEASE; 3540 gcwq_release_management_and_unlock(gcwq);
3570 wake_up_process(gcwq->trustee);
3571 wait_trustee_state(gcwq, TRUSTEE_DONE);
3572 }
3573
3574 /*
3575 * Trustee is done and there might be no worker left.
3576 * Put the first_idle in and request a real manager to
3577 * take a look.
3578 */
3579 spin_unlock_irq(&gcwq->lock);
3580 kthread_bind(gcwq->first_idle->task, cpu);
3581 spin_lock_irq(&gcwq->lock);
3582 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3583 start_worker(gcwq->first_idle);
3584 gcwq->first_idle = NULL;
3585 break; 3541 break;
3586 } 3542 }
3543 return NOTIFY_OK;
3544}
3587 3545
3588 spin_unlock_irqrestore(&gcwq->lock, flags); 3546/*
3547 * Workqueues should be brought down after normal priority CPU notifiers.
3548 * This will be registered as low priority CPU notifier.
3549 */
3550static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3551 unsigned long action,
3552 void *hcpu)
3553{
3554 unsigned int cpu = (unsigned long)hcpu;
3555 struct work_struct unbind_work;
3589 3556
3590 return notifier_from_errno(0); 3557 switch (action & ~CPU_TASKS_FROZEN) {
3558 case CPU_DOWN_PREPARE:
3559 /* unbinding should happen on the local CPU */
3560 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
3561 schedule_work_on(cpu, &unbind_work);
3562 flush_work(&unbind_work);
3563 break;
3564 }
3565 return NOTIFY_OK;
3591} 3566}
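workqueue_cpu_down_callback() runs the unbind step on the dying CPU itself by queueing an on-stack work item there and flushing it. The userspace sketch below reproduces the submit-and-wait shape with a single worker thread and a condition variable; it is only an analogue of the pattern, not of schedule_work_on()'s CPU targeting, and every name in it is invented.

#include <pthread.h>
#include <stdio.h>

/* A stack-allocated "work item": run func(arg) on the worker thread,
 * then let the submitter flush (wait for) its completion.
 */
struct work {
	void (*func)(void *);
	void *arg;
	int done;
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static struct work *pending;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t queue_cond = PTHREAD_COND_INITIALIZER;

static void *worker(void *unused)
{
	struct work *w;

	pthread_mutex_lock(&queue_lock);
	while (!pending)
		pthread_cond_wait(&queue_cond, &queue_lock);
	w = pending;
	pthread_mutex_unlock(&queue_lock);

	w->func(w->arg);			/* runs "on" the worker */

	pthread_mutex_lock(&w->lock);
	w->done = 1;
	pthread_cond_signal(&w->cond);		/* wake the flusher */
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

static void unbind_fn(void *arg)
{
	printf("unbinding pools of cpu %d\n", *(int *)arg);
}

int main(void)
{
	pthread_t tid;
	int cpu = 1;
	struct work w = { .func = unbind_fn, .arg = &cpu };

	pthread_mutex_init(&w.lock, NULL);	/* INIT_WORK_ONSTACK() */
	pthread_cond_init(&w.cond, NULL);
	pthread_create(&tid, NULL, worker, NULL);

	pthread_mutex_lock(&queue_lock);	/* schedule_work_on() */
	pending = &w;
	pthread_cond_signal(&queue_cond);
	pthread_mutex_unlock(&queue_lock);

	pthread_mutex_lock(&w.lock);		/* flush_work() */
	while (!w.done)
		pthread_cond_wait(&w.cond, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_join(tid, NULL);
	return 0;
}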
3592 3567
3593#ifdef CONFIG_SMP 3568#ifdef CONFIG_SMP
@@ -3746,6 +3721,7 @@ void thaw_workqueues(void)
3746 3721
3747 for_each_gcwq_cpu(cpu) { 3722 for_each_gcwq_cpu(cpu) {
3748 struct global_cwq *gcwq = get_gcwq(cpu); 3723 struct global_cwq *gcwq = get_gcwq(cpu);
3724 struct worker_pool *pool;
3749 struct workqueue_struct *wq; 3725 struct workqueue_struct *wq;
3750 3726
3751 spin_lock_irq(&gcwq->lock); 3727 spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3743,8 @@ void thaw_workqueues(void)
3767 cwq_activate_first_delayed(cwq); 3743 cwq_activate_first_delayed(cwq);
3768 } 3744 }
3769 3745
3770 wake_up_worker(gcwq); 3746 for_each_worker_pool(pool, gcwq)
3747 wake_up_worker(pool);
3771 3748
3772 spin_unlock_irq(&gcwq->lock); 3749 spin_unlock_irq(&gcwq->lock);
3773 } 3750 }
@@ -3783,46 +3760,57 @@ static int __init init_workqueues(void)
3783 unsigned int cpu; 3760 unsigned int cpu;
3784 int i; 3761 int i;
3785 3762
3786 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); 3763 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3764 cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3787 3765
3788 /* initialize gcwqs */ 3766 /* initialize gcwqs */
3789 for_each_gcwq_cpu(cpu) { 3767 for_each_gcwq_cpu(cpu) {
3790 struct global_cwq *gcwq = get_gcwq(cpu); 3768 struct global_cwq *gcwq = get_gcwq(cpu);
3769 struct worker_pool *pool;
3791 3770
3792 spin_lock_init(&gcwq->lock); 3771 spin_lock_init(&gcwq->lock);
3793 INIT_LIST_HEAD(&gcwq->worklist);
3794 gcwq->cpu = cpu; 3772 gcwq->cpu = cpu;
3795 gcwq->flags |= GCWQ_DISASSOCIATED; 3773 gcwq->flags |= GCWQ_DISASSOCIATED;
3796 3774
3797 INIT_LIST_HEAD(&gcwq->idle_list);
3798 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) 3775 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3799 INIT_HLIST_HEAD(&gcwq->busy_hash[i]); 3776 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3800 3777
3801 init_timer_deferrable(&gcwq->idle_timer); 3778 for_each_worker_pool(pool, gcwq) {
3802 gcwq->idle_timer.function = idle_worker_timeout; 3779 pool->gcwq = gcwq;
3803 gcwq->idle_timer.data = (unsigned long)gcwq; 3780 INIT_LIST_HEAD(&pool->worklist);
3781 INIT_LIST_HEAD(&pool->idle_list);
3804 3782
3805 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, 3783 init_timer_deferrable(&pool->idle_timer);
3806 (unsigned long)gcwq); 3784 pool->idle_timer.function = idle_worker_timeout;
3785 pool->idle_timer.data = (unsigned long)pool;
3807 3786
3808 ida_init(&gcwq->worker_ida); 3787 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
3788 (unsigned long)pool);
3809 3789
3810 gcwq->trustee_state = TRUSTEE_DONE; 3790 mutex_init(&pool->manager_mutex);
3811 init_waitqueue_head(&gcwq->trustee_wait); 3791 ida_init(&pool->worker_ida);
3792 }
3793
3794 init_waitqueue_head(&gcwq->rebind_hold);
3812 } 3795 }
3813 3796
3814 /* create the initial worker */ 3797 /* create the initial worker */
3815 for_each_online_gcwq_cpu(cpu) { 3798 for_each_online_gcwq_cpu(cpu) {
3816 struct global_cwq *gcwq = get_gcwq(cpu); 3799 struct global_cwq *gcwq = get_gcwq(cpu);
3817 struct worker *worker; 3800 struct worker_pool *pool;
3818 3801
3819 if (cpu != WORK_CPU_UNBOUND) 3802 if (cpu != WORK_CPU_UNBOUND)
3820 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3803 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3821 worker = create_worker(gcwq, true); 3804
3822 BUG_ON(!worker); 3805 for_each_worker_pool(pool, gcwq) {
3823 spin_lock_irq(&gcwq->lock); 3806 struct worker *worker;
3824 start_worker(worker); 3807
3825 spin_unlock_irq(&gcwq->lock); 3808 worker = create_worker(pool);
3809 BUG_ON(!worker);
3810 spin_lock_irq(&gcwq->lock);
3811 start_worker(worker);
3812 spin_unlock_irq(&gcwq->lock);
3813 }
3826 } 3814 }
3827 3815
3828 system_wq = alloc_workqueue("events", 0, 0); 3816 system_wq = alloc_workqueue("events", 0, 0);