aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorGrant Likely <grant.likely@secretlab.ca>2012-07-25 00:31:09 -0400
committerGrant Likely <grant.likely@secretlab.ca>2012-07-25 00:34:40 -0400
commit6aeea3ecc33b1f36dbc3b80461d15a7052ae424f (patch)
treebbd273e3e0ca76094aed8e9c77e5adfe2b07f779 /kernel
parent9844a5524ec532aee826c35e3031637c7fc8287b (diff)
parentbdc0077af574800d24318b6945cf2344e8dbb050 (diff)
Merge remote-tracking branch 'origin' into irqdomain/next
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c76
-rw-r--r--kernel/audit.c30
-rw-r--r--kernel/audit_tree.c10
-rw-r--r--kernel/audit_watch.c25
-rw-r--r--kernel/cgroup.c76
-rw-r--r--kernel/debug/kdb/kdb_main.c91
-rw-r--r--kernel/debug/kdb/kdb_private.h1
-rw-r--r--kernel/events/core.c49
-rw-r--r--kernel/events/uprobes.c461
-rw-r--r--kernel/exit.c6
-rw-r--r--kernel/fork.c13
-rw-r--r--kernel/hrtimer.c53
-rw-r--r--kernel/irq/manage.c6
-rw-r--r--kernel/kthread.c88
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/hibernate.c50
-rw-r--r--kernel/power/main.c45
-rw-r--r--kernel/power/power.h3
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/power/swap.c82
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/power/wakelock.c7
-rw-r--r--kernel/printk.c285
-rw-r--r--kernel/rcupdate.c44
-rw-r--r--kernel/rcutiny.c4
-rw-r--r--kernel/rcutiny_plugin.h56
-rw-r--r--kernel/rcutorture.c72
-rw-r--r--kernel/rcutree.c479
-rw-r--r--kernel/rcutree.h47
-rw-r--r--kernel/rcutree_plugin.h237
-rw-r--r--kernel/rcutree_trace.c148
-rw-r--r--kernel/resource.c13
-rw-r--r--kernel/sched/core.c276
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/signal.c15
-rw-r--r--kernel/smp.c20
-rw-r--r--kernel/smpboot.h2
-rw-r--r--kernel/sys.c16
-rw-r--r--kernel/task_work.c94
-rw-r--r--kernel/time/ntp.c8
-rw-r--r--kernel/time/tick-sched.c194
-rw-r--r--kernel/time/timekeeping.c511
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/timer.c110
-rw-r--r--kernel/trace/ftrace.c8
-rw-r--r--kernel/trace/ring_buffer.c10
-rw-r--r--kernel/trace/trace.c33
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/workqueue.c1144
52 files changed, 2748 insertions, 2278 deletions
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bbe..9d3118384858 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1;
62#define MAX_WORK 32768 62#define MAX_WORK 32768
63 63
64static LIST_HEAD(async_pending); 64static LIST_HEAD(async_pending);
65static LIST_HEAD(async_running); 65static ASYNC_DOMAIN(async_running);
66static LIST_HEAD(async_domains);
66static DEFINE_SPINLOCK(async_lock); 67static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
67 69
68struct async_entry { 70struct async_entry {
69 struct list_head list; 71 struct list_head list;
@@ -71,7 +73,7 @@ struct async_entry {
71 async_cookie_t cookie; 73 async_cookie_t cookie;
72 async_func_ptr *func; 74 async_func_ptr *func;
73 void *data; 75 void *data;
74 struct list_head *running; 76 struct async_domain *running;
75}; 77};
76 78
77static DECLARE_WAIT_QUEUE_HEAD(async_done); 79static DECLARE_WAIT_QUEUE_HEAD(async_done);
@@ -82,13 +84,12 @@ static atomic_t entry_count;
82/* 84/*
83 * MUST be called with the lock held! 85 * MUST be called with the lock held!
84 */ 86 */
85static async_cookie_t __lowest_in_progress(struct list_head *running) 87static async_cookie_t __lowest_in_progress(struct async_domain *running)
86{ 88{
87 struct async_entry *entry; 89 struct async_entry *entry;
88 90
89 if (!list_empty(running)) { 91 if (!list_empty(&running->domain)) {
90 entry = list_first_entry(running, 92 entry = list_first_entry(&running->domain, typeof(*entry), list);
91 struct async_entry, list);
92 return entry->cookie; 93 return entry->cookie;
93 } 94 }
94 95
@@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running)
99 return next_cookie; /* "infinity" value */ 100 return next_cookie; /* "infinity" value */
100} 101}
101 102
102static async_cookie_t lowest_in_progress(struct list_head *running) 103static async_cookie_t lowest_in_progress(struct async_domain *running)
103{ 104{
104 unsigned long flags; 105 unsigned long flags;
105 async_cookie_t ret; 106 async_cookie_t ret;
@@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
119 container_of(work, struct async_entry, work); 120 container_of(work, struct async_entry, work);
120 unsigned long flags; 121 unsigned long flags;
121 ktime_t uninitialized_var(calltime), delta, rettime; 122 ktime_t uninitialized_var(calltime), delta, rettime;
123 struct async_domain *running = entry->running;
122 124
123 /* 1) move self to the running queue */ 125 /* 1) move self to the running queue */
124 spin_lock_irqsave(&async_lock, flags); 126 spin_lock_irqsave(&async_lock, flags);
125 list_move_tail(&entry->list, entry->running); 127 list_move_tail(&entry->list, &running->domain);
126 spin_unlock_irqrestore(&async_lock, flags); 128 spin_unlock_irqrestore(&async_lock, flags);
127 129
128 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
@@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work)
145 /* 3) remove self from the running queue */ 147 /* 3) remove self from the running queue */
146 spin_lock_irqsave(&async_lock, flags); 148 spin_lock_irqsave(&async_lock, flags);
147 list_del(&entry->list); 149 list_del(&entry->list);
150 if (running->registered && --running->count == 0)
151 list_del_init(&running->node);
148 152
149 /* 4) free the entry */ 153 /* 4) free the entry */
150 kfree(entry); 154 kfree(entry);
@@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work)
156 wake_up(&async_done); 160 wake_up(&async_done);
157} 161}
158 162
159static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) 163static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
160{ 164{
161 struct async_entry *entry; 165 struct async_entry *entry;
162 unsigned long flags; 166 unsigned long flags;
@@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
187 spin_lock_irqsave(&async_lock, flags); 191 spin_lock_irqsave(&async_lock, flags);
188 newcookie = entry->cookie = next_cookie++; 192 newcookie = entry->cookie = next_cookie++;
189 list_add_tail(&entry->list, &async_pending); 193 list_add_tail(&entry->list, &async_pending);
194 if (running->registered && running->count++ == 0)
195 list_add_tail(&running->node, &async_domains);
190 atomic_inc(&entry_count); 196 atomic_inc(&entry_count);
191 spin_unlock_irqrestore(&async_lock, flags); 197 spin_unlock_irqrestore(&async_lock, flags);
192 198
@@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
223 * Note: This function may be called from atomic or non-atomic contexts. 229 * Note: This function may be called from atomic or non-atomic contexts.
224 */ 230 */
225async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 231async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
226 struct list_head *running) 232 struct async_domain *running)
227{ 233{
228 return __async_schedule(ptr, data, running); 234 return __async_schedule(ptr, data, running);
229} 235}
@@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
236 */ 242 */
237void async_synchronize_full(void) 243void async_synchronize_full(void)
238{ 244{
245 mutex_lock(&async_register_mutex);
239 do { 246 do {
240 async_synchronize_cookie(next_cookie); 247 struct async_domain *domain = NULL;
241 } while (!list_empty(&async_running) || !list_empty(&async_pending)); 248
249 spin_lock_irq(&async_lock);
250 if (!list_empty(&async_domains))
251 domain = list_first_entry(&async_domains, typeof(*domain), node);
252 spin_unlock_irq(&async_lock);
253
254 async_synchronize_cookie_domain(next_cookie, domain);
255 } while (!list_empty(&async_domains));
256 mutex_unlock(&async_register_mutex);
242} 257}
243EXPORT_SYMBOL_GPL(async_synchronize_full); 258EXPORT_SYMBOL_GPL(async_synchronize_full);
244 259
245/** 260/**
261 * async_unregister_domain - ensure no more anonymous waiters on this domain
262 * @domain: idle domain to flush out of any async_synchronize_full instances
263 *
264 * async_synchronize_{cookie|full}_domain() are not flushed since callers
265 * of these routines should know the lifetime of @domain
266 *
267 * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
268 */
269void async_unregister_domain(struct async_domain *domain)
270{
271 mutex_lock(&async_register_mutex);
272 spin_lock_irq(&async_lock);
273 WARN_ON(!domain->registered || !list_empty(&domain->node) ||
274 !list_empty(&domain->domain));
275 domain->registered = 0;
276 spin_unlock_irq(&async_lock);
277 mutex_unlock(&async_register_mutex);
278}
279EXPORT_SYMBOL_GPL(async_unregister_domain);
280
281/**
246 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 282 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
247 * @list: running list to synchronize on 283 * @domain: running list to synchronize on
248 * 284 *
249 * This function waits until all asynchronous function calls for the 285 * This function waits until all asynchronous function calls for the
250 * synchronization domain specified by the running list @list have been done. 286 * synchronization domain specified by the running list @domain have been done.
251 */ 287 */
252void async_synchronize_full_domain(struct list_head *list) 288void async_synchronize_full_domain(struct async_domain *domain)
253{ 289{
254 async_synchronize_cookie_domain(next_cookie, list); 290 async_synchronize_cookie_domain(next_cookie, domain);
255} 291}
256EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 292EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
257 293
@@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
261 * @running: running list to synchronize on 297 * @running: running list to synchronize on
262 * 298 *
263 * This function waits until all asynchronous function calls for the 299 * This function waits until all asynchronous function calls for the
264 * synchronization domain specified by the running list @list submitted 300 * synchronization domain specified by running list @running submitted
265 * prior to @cookie have been done. 301 * prior to @cookie have been done.
266 */ 302 */
267void async_synchronize_cookie_domain(async_cookie_t cookie, 303void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
268 struct list_head *running)
269{ 304{
270 ktime_t uninitialized_var(starttime), delta, endtime; 305 ktime_t uninitialized_var(starttime), delta, endtime;
271 306
307 if (!running)
308 return;
309
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 310 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 311 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 312 starttime = ktime_get();
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c61416b..4a3f28d2ca65 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
384static void audit_printk_skb(struct sk_buff *skb) 384static void audit_printk_skb(struct sk_buff *skb)
385{ 385{
386 struct nlmsghdr *nlh = nlmsg_hdr(skb); 386 struct nlmsghdr *nlh = nlmsg_hdr(skb);
387 char *data = NLMSG_DATA(nlh); 387 char *data = nlmsg_data(nlh);
388 388
389 if (nlh->nlmsg_type != AUDIT_EOE) { 389 if (nlh->nlmsg_type != AUDIT_EOE) {
390 if (printk_ratelimit()) 390 if (printk_ratelimit())
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
516 if (!skb) 516 if (!skb)
517 return NULL; 517 return NULL;
518 518
519 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); 519 nlh = nlmsg_put(skb, pid, seq, t, size, flags);
520 data = NLMSG_DATA(nlh); 520 if (!nlh)
521 goto out_kfree_skb;
522 data = nlmsg_data(nlh);
521 memcpy(data, payload, size); 523 memcpy(data, payload, size);
522 return skb; 524 return skb;
523 525
524nlmsg_failure: /* Used by NLMSG_NEW */ 526out_kfree_skb:
525 if (skb) 527 kfree_skb(skb);
526 kfree_skb(skb);
527 return NULL; 528 return NULL;
528} 529}
529 530
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
680 sessionid = audit_get_sessionid(current); 681 sessionid = audit_get_sessionid(current);
681 security_task_getsecid(current, &sid); 682 security_task_getsecid(current, &sid);
682 seq = nlh->nlmsg_seq; 683 seq = nlh->nlmsg_seq;
683 data = NLMSG_DATA(nlh); 684 data = nlmsg_data(nlh);
684 685
685 switch (msg_type) { 686 switch (msg_type) {
686 case AUDIT_GET: 687 case AUDIT_GET:
@@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb)
961static int __init audit_init(void) 962static int __init audit_init(void)
962{ 963{
963 int i; 964 int i;
965 struct netlink_kernel_cfg cfg = {
966 .input = audit_receive,
967 };
964 968
965 if (audit_initialized == AUDIT_DISABLED) 969 if (audit_initialized == AUDIT_DISABLED)
966 return 0; 970 return 0;
967 971
968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 972 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
969 audit_default ? "enabled" : "disabled"); 973 audit_default ? "enabled" : "disabled");
970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, 974 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
971 audit_receive, NULL, THIS_MODULE); 975 THIS_MODULE, &cfg);
972 if (!audit_sock) 976 if (!audit_sock)
973 audit_panic("cannot initialize netlink socket"); 977 audit_panic("cannot initialize netlink socket");
974 else 978 else
@@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1060 1064
1061 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); 1065 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1062 if (!ab->skb) 1066 if (!ab->skb)
1063 goto nlmsg_failure; 1067 goto err;
1064 1068
1065 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); 1069 nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
1070 if (!nlh)
1071 goto out_kfree_skb;
1066 1072
1067 return ab; 1073 return ab;
1068 1074
1069nlmsg_failure: /* Used by NLMSG_NEW */ 1075out_kfree_skb:
1070 kfree_skb(ab->skb); 1076 kfree_skb(ab->skb);
1071 ab->skb = NULL; 1077 ab->skb = NULL;
1072err: 1078err:
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e7..3a5ca582ba1e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -595,7 +595,7 @@ void audit_trim_trees(void)
595 595
596 root_mnt = collect_mounts(&path); 596 root_mnt = collect_mounts(&path);
597 path_put(&path); 597 path_put(&path);
598 if (!root_mnt) 598 if (IS_ERR(root_mnt))
599 goto skip_it; 599 goto skip_it;
600 600
601 spin_lock(&hash_lock); 601 spin_lock(&hash_lock);
@@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
669 goto Err; 669 goto Err;
670 mnt = collect_mounts(&path); 670 mnt = collect_mounts(&path);
671 path_put(&path); 671 path_put(&path);
672 if (!mnt) { 672 if (IS_ERR(mnt)) {
673 err = -ENOMEM; 673 err = PTR_ERR(mnt);
674 goto Err; 674 goto Err;
675 } 675 }
676 676
@@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
719 return err; 719 return err;
720 tagged = collect_mounts(&path2); 720 tagged = collect_mounts(&path2);
721 path_put(&path2); 721 path_put(&path2);
722 if (!tagged) 722 if (IS_ERR(tagged))
723 return -ENOMEM; 723 return PTR_ERR(tagged);
724 724
725 err = kern_path(old, 0, &path1); 725 err = kern_path(old, 0, &path1);
726 if (err) { 726 if (err) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d9..3823281401b5 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(struct audit_watch *watch, struct path *parent) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata nd; 358 struct dentry *d = kern_path_locked(watch->path, parent);
359 struct dentry *d; 359 if (IS_ERR(d))
360 int err;
361
362 err = kern_path_parent(watch->path, &nd);
363 if (err)
364 return err;
365
366 if (nd.last_type != LAST_NORM) {
367 path_put(&nd.path);
368 return -EINVAL;
369 }
370
371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 if (IS_ERR(d)) {
374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 path_put(&nd.path);
376 return PTR_ERR(d); 360 return PTR_ERR(d);
377 } 361 mutex_unlock(&parent->dentry->d_inode->i_mutex);
378 if (d->d_inode) { 362 if (d->d_inode) {
379 /* update watch filter fields */ 363 /* update watch filter fields */
380 watch->dev = d->d_inode->i_sb->s_dev; 364 watch->dev = d->d_inode->i_sb->s_dev;
381 watch->ino = d->d_inode->i_ino; 365 watch->ino = d->d_inode->i_ino;
382 } 366 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
384
385 *parent = nd.path;
386 dput(d); 367 dput(d);
387 return 0; 368 return 0;
388} 369}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2097684cf194..79818507e444 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
822 */ 822 */
823 823
824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
827static int cgroup_populate_dir(struct cgroup *cgrp); 827static int cgroup_populate_dir(struct cgroup *cgrp);
828static const struct inode_operations cgroup_dir_inode_operations; 828static const struct inode_operations cgroup_dir_inode_operations;
@@ -901,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
901 mutex_unlock(&cgroup_mutex); 901 mutex_unlock(&cgroup_mutex);
902 902
903 /* 903 /*
904 * We want to drop the active superblock reference from the 904 * Drop the active superblock reference that we took when we
905 * cgroup creation after all the dentry refs are gone - 905 * created the cgroup
906 * kill_sb gets mighty unhappy otherwise. Mark
907 * dentry->d_fsdata with cgroup_diput() to tell
908 * cgroup_d_release() to call deactivate_super().
909 */ 906 */
910 dentry->d_fsdata = cgroup_diput; 907 deactivate_super(cgrp->root->sb);
911 908
912 /* 909 /*
913 * if we're getting rid of the cgroup, refcount should ensure 910 * if we're getting rid of the cgroup, refcount should ensure
@@ -933,13 +930,6 @@ static int cgroup_delete(const struct dentry *d)
933 return 1; 930 return 1;
934} 931}
935 932
936static void cgroup_d_release(struct dentry *dentry)
937{
938 /* did cgroup_diput() tell me to deactivate super? */
939 if (dentry->d_fsdata == cgroup_diput)
940 deactivate_super(dentry->d_sb);
941}
942
943static void remove_dir(struct dentry *d) 933static void remove_dir(struct dentry *d)
944{ 934{
945 struct dentry *parent = dget(d->d_parent); 935 struct dentry *parent = dget(d->d_parent);
@@ -964,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
964 954
965 dget(d); 955 dget(d);
966 d_delete(d); 956 d_delete(d);
967 simple_unlink(d->d_inode, d); 957 simple_unlink(cgrp->dentry->d_inode, d);
968 list_del_init(&cfe->node); 958 list_del_init(&cfe->node);
969 dput(d); 959 dput(d);
970 960
@@ -1078,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1078 BUG_ON(cgrp->subsys[i]); 1068 BUG_ON(cgrp->subsys[i]);
1079 BUG_ON(!dummytop->subsys[i]); 1069 BUG_ON(!dummytop->subsys[i]);
1080 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 1070 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
1081 mutex_lock(&ss->hierarchy_mutex);
1082 cgrp->subsys[i] = dummytop->subsys[i]; 1071 cgrp->subsys[i] = dummytop->subsys[i];
1083 cgrp->subsys[i]->cgroup = cgrp; 1072 cgrp->subsys[i]->cgroup = cgrp;
1084 list_move(&ss->sibling, &root->subsys_list); 1073 list_move(&ss->sibling, &root->subsys_list);
1085 ss->root = root; 1074 ss->root = root;
1086 if (ss->bind) 1075 if (ss->bind)
1087 ss->bind(cgrp); 1076 ss->bind(cgrp);
1088 mutex_unlock(&ss->hierarchy_mutex);
1089 /* refcount was already taken, and we're keeping it */ 1077 /* refcount was already taken, and we're keeping it */
1090 } else if (bit & removed_bits) { 1078 } else if (bit & removed_bits) {
1091 /* We're removing this subsystem */ 1079 /* We're removing this subsystem */
1092 BUG_ON(ss == NULL); 1080 BUG_ON(ss == NULL);
1093 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1081 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1094 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1082 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1095 mutex_lock(&ss->hierarchy_mutex);
1096 if (ss->bind) 1083 if (ss->bind)
1097 ss->bind(dummytop); 1084 ss->bind(dummytop);
1098 dummytop->subsys[i]->cgroup = dummytop; 1085 dummytop->subsys[i]->cgroup = dummytop;
1099 cgrp->subsys[i] = NULL; 1086 cgrp->subsys[i] = NULL;
1100 subsys[i]->root = &rootnode; 1087 subsys[i]->root = &rootnode;
1101 list_move(&ss->sibling, &rootnode.subsys_list); 1088 list_move(&ss->sibling, &rootnode.subsys_list);
1102 mutex_unlock(&ss->hierarchy_mutex);
1103 /* subsystem is now free - drop reference on module */ 1089 /* subsystem is now free - drop reference on module */
1104 module_put(ss->module); 1090 module_put(ss->module);
1105 } else if (bit & final_bits) { 1091 } else if (bit & final_bits) {
@@ -1547,7 +1533,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1547 static const struct dentry_operations cgroup_dops = { 1533 static const struct dentry_operations cgroup_dops = {
1548 .d_iput = cgroup_diput, 1534 .d_iput = cgroup_diput,
1549 .d_delete = cgroup_delete, 1535 .d_delete = cgroup_delete,
1550 .d_release = cgroup_d_release,
1551 }; 1536 };
1552 1537
1553 struct inode *inode = 1538 struct inode *inode =
@@ -1598,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1598 opts.new_root = new_root; 1583 opts.new_root = new_root;
1599 1584
1600 /* Locate an existing or new sb for this hierarchy */ 1585 /* Locate an existing or new sb for this hierarchy */
1601 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); 1586 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1602 if (IS_ERR(sb)) { 1587 if (IS_ERR(sb)) {
1603 ret = PTR_ERR(sb); 1588 ret = PTR_ERR(sb);
1604 cgroup_drop_root(opts.new_root); 1589 cgroup_drop_root(opts.new_root);
@@ -2581,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2581 .rename = cgroup_rename, 2566 .rename = cgroup_rename,
2582}; 2567};
2583 2568
2584static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2585{ 2570{
2586 if (dentry->d_name.len > NAME_MAX) 2571 if (dentry->d_name.len > NAME_MAX)
2587 return ERR_PTR(-ENAMETOOLONG); 2572 return ERR_PTR(-ENAMETOOLONG);
@@ -3894,8 +3879,12 @@ static void css_dput_fn(struct work_struct *work)
3894{ 3879{
3895 struct cgroup_subsys_state *css = 3880 struct cgroup_subsys_state *css =
3896 container_of(work, struct cgroup_subsys_state, dput_work); 3881 container_of(work, struct cgroup_subsys_state, dput_work);
3882 struct dentry *dentry = css->cgroup->dentry;
3883 struct super_block *sb = dentry->d_sb;
3897 3884
3898 dput(css->cgroup->dentry); 3885 atomic_inc(&sb->s_active);
3886 dput(dentry);
3887 deactivate_super(sb);
3899} 3888}
3900 3889
3901static void init_cgroup_css(struct cgroup_subsys_state *css, 3890static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -3922,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3922 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 3911 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3923} 3912}
3924 3913
3925static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
3926{
3927 /* We need to take each hierarchy_mutex in a consistent order */
3928 int i;
3929
3930 /*
3931 * No worry about a race with rebind_subsystems that might mess up the
3932 * locking order, since both parties are under cgroup_mutex.
3933 */
3934 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3935 struct cgroup_subsys *ss = subsys[i];
3936 if (ss == NULL)
3937 continue;
3938 if (ss->root == root)
3939 mutex_lock(&ss->hierarchy_mutex);
3940 }
3941}
3942
3943static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3944{
3945 int i;
3946
3947 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3948 struct cgroup_subsys *ss = subsys[i];
3949 if (ss == NULL)
3950 continue;
3951 if (ss->root == root)
3952 mutex_unlock(&ss->hierarchy_mutex);
3953 }
3954}
3955
3956/* 3914/*
3957 * cgroup_create - create a cgroup 3915 * cgroup_create - create a cgroup
3958 * @parent: cgroup that will be parent of the new cgroup 3916 * @parent: cgroup that will be parent of the new cgroup
@@ -4013,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4013 ss->post_clone(cgrp); 3971 ss->post_clone(cgrp);
4014 } 3972 }
4015 3973
4016 cgroup_lock_hierarchy(root);
4017 list_add(&cgrp->sibling, &cgrp->parent->children); 3974 list_add(&cgrp->sibling, &cgrp->parent->children);
4018 cgroup_unlock_hierarchy(root);
4019 root->number_of_cgroups++; 3975 root->number_of_cgroups++;
4020 3976
4021 err = cgroup_create_dir(cgrp, dentry, mode); 3977 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -4042,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4042 3998
4043 err_remove: 3999 err_remove:
4044 4000
4045 cgroup_lock_hierarchy(root);
4046 list_del(&cgrp->sibling); 4001 list_del(&cgrp->sibling);
4047 cgroup_unlock_hierarchy(root);
4048 root->number_of_cgroups--; 4002 root->number_of_cgroups--;
4049 4003
4050 err_destroy: 4004 err_destroy:
@@ -4252,10 +4206,8 @@ again:
4252 list_del_init(&cgrp->release_list); 4206 list_del_init(&cgrp->release_list);
4253 raw_spin_unlock(&release_list_lock); 4207 raw_spin_unlock(&release_list_lock);
4254 4208
4255 cgroup_lock_hierarchy(cgrp->root);
4256 /* delete this cgroup from parent->children */ 4209 /* delete this cgroup from parent->children */
4257 list_del_init(&cgrp->sibling); 4210 list_del_init(&cgrp->sibling);
4258 cgroup_unlock_hierarchy(cgrp->root);
4259 4211
4260 list_del_init(&cgrp->allcg_node); 4212 list_del_init(&cgrp->allcg_node);
4261 4213
@@ -4329,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4329 * need to invoke fork callbacks here. */ 4281 * need to invoke fork callbacks here. */
4330 BUG_ON(!list_empty(&init_task.tasks)); 4282 BUG_ON(!list_empty(&init_task.tasks));
4331 4283
4332 mutex_init(&ss->hierarchy_mutex);
4333 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4334 ss->active = 1; 4284 ss->active = 1;
4335 4285
4336 /* this function shouldn't be used with modular subsystems, since they 4286 /* this function shouldn't be used with modular subsystems, since they
@@ -4457,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4457 } 4407 }
4458 write_unlock(&css_set_lock); 4408 write_unlock(&css_set_lock);
4459 4409
4460 mutex_init(&ss->hierarchy_mutex);
4461 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4462 ss->active = 1; 4410 ss->active = 1;
4463 4411
4464 /* success! */ 4412 /* success! */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..1f91413edb87 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/kmsg_dump.h>
17#include <linux/reboot.h> 18#include <linux/reboot.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/sysrq.h> 20#include <linux/sysrq.h>
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv)
2040 */ 2041 */
2041static int kdb_dmesg(int argc, const char **argv) 2042static int kdb_dmesg(int argc, const char **argv)
2042{ 2043{
2043 char *syslog_data[4], *start, *end, c = '\0', *p; 2044 int diag;
2044 int diag, logging, logsize, lines = 0, adjust = 0, n; 2045 int logging;
2046 int lines = 0;
2047 int adjust = 0;
2048 int n = 0;
2049 int skip = 0;
2050 struct kmsg_dumper dumper = { .active = 1 };
2051 size_t len;
2052 char buf[201];
2045 2053
2046 if (argc > 2) 2054 if (argc > 2)
2047 return KDB_ARGCOUNT; 2055 return KDB_ARGCOUNT;
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv)
2064 kdb_set(2, setargs); 2072 kdb_set(2, setargs);
2065 } 2073 }
2066 2074
2067 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] 2075 kmsg_dump_rewind_nolock(&dumper);
2068 * logical start, end+1. */ 2076 while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
2069 kdb_syslog_data(syslog_data); 2077 n++;
2070 if (syslog_data[2] == syslog_data[3]) 2078
2071 return 0;
2072 logsize = syslog_data[1] - syslog_data[0];
2073 start = syslog_data[2];
2074 end = syslog_data[3];
2075#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
2076 for (n = 0, p = start; p < end; ++p) {
2077 c = *KDB_WRAP(p);
2078 if (c == '\n')
2079 ++n;
2080 }
2081 if (c != '\n')
2082 ++n;
2083 if (lines < 0) { 2079 if (lines < 0) {
2084 if (adjust >= n) 2080 if (adjust >= n)
2085 kdb_printf("buffer only contains %d lines, nothing " 2081 kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv)
2087 else if (adjust - lines >= n) 2083 else if (adjust - lines >= n)
2088 kdb_printf("buffer only contains %d lines, last %d " 2084 kdb_printf("buffer only contains %d lines, last %d "
2089 "lines printed\n", n, n - adjust); 2085 "lines printed\n", n, n - adjust);
2090 if (adjust) { 2086 skip = adjust;
2091 for (; start < end && adjust; ++start) { 2087 lines = abs(lines);
2092 if (*KDB_WRAP(start) == '\n')
2093 --adjust;
2094 }
2095 if (start < end)
2096 ++start;
2097 }
2098 for (p = start; p < end && lines; ++p) {
2099 if (*KDB_WRAP(p) == '\n')
2100 ++lines;
2101 }
2102 end = p;
2103 } else if (lines > 0) { 2088 } else if (lines > 0) {
2104 int skip = n - (adjust + lines); 2089 skip = n - lines - adjust;
2090 lines = abs(lines);
2105 if (adjust >= n) { 2091 if (adjust >= n) {
2106 kdb_printf("buffer only contains %d lines, " 2092 kdb_printf("buffer only contains %d lines, "
2107 "nothing printed\n", n); 2093 "nothing printed\n", n);
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv)
2112 kdb_printf("buffer only contains %d lines, first " 2098 kdb_printf("buffer only contains %d lines, first "
2113 "%d lines printed\n", n, lines); 2099 "%d lines printed\n", n, lines);
2114 } 2100 }
2115 for (; start < end && skip; ++start) { 2101 } else {
2116 if (*KDB_WRAP(start) == '\n') 2102 lines = n;
2117 --skip;
2118 }
2119 for (p = start; p < end && lines; ++p) {
2120 if (*KDB_WRAP(p) == '\n')
2121 --lines;
2122 }
2123 end = p;
2124 } 2103 }
2125 /* Do a line at a time (max 200 chars) to reduce protocol overhead */ 2104
2126 c = '\n'; 2105 if (skip >= n || skip < 0)
2127 while (start != end) { 2106 return 0;
2128 char buf[201]; 2107
2129 p = buf; 2108 kmsg_dump_rewind_nolock(&dumper);
2130 if (KDB_FLAG(CMD_INTERRUPT)) 2109 while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
2131 return 0; 2110 if (skip) {
2132 while (start < end && (c = *KDB_WRAP(start)) && 2111 skip--;
2133 (p - buf) < sizeof(buf)-1) { 2112 continue;
2134 ++start;
2135 *p++ = c;
2136 if (c == '\n')
2137 break;
2138 } 2113 }
2139 *p = '\0'; 2114 if (!lines--)
2140 kdb_printf("%s", buf); 2115 break;
2116
2117 kdb_printf("%.*s\n", (int)len - 1, buf);
2141 } 2118 }
2142 if (c != '\n')
2143 kdb_printf("\n");
2144 2119
2145 return 0; 2120 return 0;
2146} 2121}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
205extern int kdb_grep_leading; 205extern int kdb_grep_leading;
206extern int kdb_grep_trailing; 206extern int kdb_grep_trailing;
207extern char *kdb_cmds[]; 207extern char *kdb_cmds[];
208extern void kdb_syslog_data(char *syslog_data[]);
209extern unsigned long kdb_task_state_string(const char *); 208extern unsigned long kdb_task_state_string(const char *);
210extern char kdb_task_state_char (const struct task_struct *); 209extern char kdb_task_state_char (const struct task_struct *);
211extern unsigned long kdb_task_state(const struct task_struct *p, 210extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6ec972..f1cf0edeb39a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1645 lockdep_assert_held(&ctx->mutex); 1645 lockdep_assert_held(&ctx->mutex);
1646 1646
1647 event->ctx = ctx; 1647 event->ctx = ctx;
1648 if (event->cpu != -1)
1649 event->cpu = cpu;
1648 1650
1649 if (!task) { 1651 if (!task) {
1650 /* 1652 /*
@@ -6252,6 +6254,8 @@ SYSCALL_DEFINE5(perf_event_open,
6252 } 6254 }
6253 } 6255 }
6254 6256
6257 get_online_cpus();
6258
6255 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 6259 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6256 NULL, NULL); 6260 NULL, NULL);
6257 if (IS_ERR(event)) { 6261 if (IS_ERR(event)) {
@@ -6304,7 +6308,7 @@ SYSCALL_DEFINE5(perf_event_open,
6304 /* 6308 /*
6305 * Get the target context (task or percpu): 6309 * Get the target context (task or percpu):
6306 */ 6310 */
6307 ctx = find_get_context(pmu, task, cpu); 6311 ctx = find_get_context(pmu, task, event->cpu);
6308 if (IS_ERR(ctx)) { 6312 if (IS_ERR(ctx)) {
6309 err = PTR_ERR(ctx); 6313 err = PTR_ERR(ctx);
6310 goto err_alloc; 6314 goto err_alloc;
@@ -6377,20 +6381,23 @@ SYSCALL_DEFINE5(perf_event_open,
6377 mutex_lock(&ctx->mutex); 6381 mutex_lock(&ctx->mutex);
6378 6382
6379 if (move_group) { 6383 if (move_group) {
6380 perf_install_in_context(ctx, group_leader, cpu); 6384 synchronize_rcu();
6385 perf_install_in_context(ctx, group_leader, event->cpu);
6381 get_ctx(ctx); 6386 get_ctx(ctx);
6382 list_for_each_entry(sibling, &group_leader->sibling_list, 6387 list_for_each_entry(sibling, &group_leader->sibling_list,
6383 group_entry) { 6388 group_entry) {
6384 perf_install_in_context(ctx, sibling, cpu); 6389 perf_install_in_context(ctx, sibling, event->cpu);
6385 get_ctx(ctx); 6390 get_ctx(ctx);
6386 } 6391 }
6387 } 6392 }
6388 6393
6389 perf_install_in_context(ctx, event, cpu); 6394 perf_install_in_context(ctx, event, event->cpu);
6390 ++ctx->generation; 6395 ++ctx->generation;
6391 perf_unpin_context(ctx); 6396 perf_unpin_context(ctx);
6392 mutex_unlock(&ctx->mutex); 6397 mutex_unlock(&ctx->mutex);
6393 6398
6399 put_online_cpus();
6400
6394 event->owner = current; 6401 event->owner = current;
6395 6402
6396 mutex_lock(&current->perf_event_mutex); 6403 mutex_lock(&current->perf_event_mutex);
@@ -6419,6 +6426,7 @@ err_context:
6419err_alloc: 6426err_alloc:
6420 free_event(event); 6427 free_event(event);
6421err_task: 6428err_task:
6429 put_online_cpus();
6422 if (task) 6430 if (task)
6423 put_task_struct(task); 6431 put_task_struct(task);
6424err_group_fd: 6432err_group_fd:
@@ -6479,6 +6487,39 @@ err:
6479} 6487}
6480EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6488EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6481 6489
6490void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
6491{
6492 struct perf_event_context *src_ctx;
6493 struct perf_event_context *dst_ctx;
6494 struct perf_event *event, *tmp;
6495 LIST_HEAD(events);
6496
6497 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
6498 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
6499
6500 mutex_lock(&src_ctx->mutex);
6501 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
6502 event_entry) {
6503 perf_remove_from_context(event);
6504 put_ctx(src_ctx);
6505 list_add(&event->event_entry, &events);
6506 }
6507 mutex_unlock(&src_ctx->mutex);
6508
6509 synchronize_rcu();
6510
6511 mutex_lock(&dst_ctx->mutex);
6512 list_for_each_entry_safe(event, tmp, &events, event_entry) {
6513 list_del(&event->event_entry);
6514 if (event->state >= PERF_EVENT_STATE_OFF)
6515 event->state = PERF_EVENT_STATE_INACTIVE;
6516 perf_install_in_context(dst_ctx, event, dst_cpu);
6517 get_ctx(dst_ctx);
6518 }
6519 mutex_unlock(&dst_ctx->mutex);
6520}
6521EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
6522
6482static void sync_child_event(struct perf_event *child_event, 6523static void sync_child_event(struct perf_event *child_event,
6483 struct task_struct *child) 6524 struct task_struct *child)
6484{ 6525{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 985be4d80fe8..f93532748bca 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -38,13 +38,29 @@
38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) 38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
40 40
41static struct srcu_struct uprobes_srcu;
42static struct rb_root uprobes_tree = RB_ROOT; 41static struct rb_root uprobes_tree = RB_ROOT;
43 42
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ 43static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45 44
46#define UPROBES_HASH_SZ 13 45#define UPROBES_HASH_SZ 13
47 46
47/*
48 * We need separate register/unregister and mmap/munmap lock hashes because
49 * of mmap_sem nesting.
50 *
51 * uprobe_register() needs to install probes on (potentially) all processes
52 * and thus needs to acquire multiple mmap_sems (consequtively, not
53 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
54 * for the particular process doing the mmap.
55 *
56 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
57 * because of lock order against i_mmap_mutex. This means there's a hole in
58 * the register vma iteration where a mmap() can happen.
59 *
60 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
61 * install a probe where one is already installed.
62 */
63
48/* serialize (un)register */ 64/* serialize (un)register */
49static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; 65static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
50 66
@@ -61,17 +77,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
61 */ 77 */
62static atomic_t uprobe_events = ATOMIC_INIT(0); 78static atomic_t uprobe_events = ATOMIC_INIT(0);
63 79
64/*
65 * Maintain a temporary per vma info that can be used to search if a vma
66 * has already been handled. This structure is introduced since extending
67 * vm_area_struct wasnt recommended.
68 */
69struct vma_info {
70 struct list_head probe_list;
71 struct mm_struct *mm;
72 loff_t vaddr;
73};
74
75struct uprobe { 80struct uprobe {
76 struct rb_node rb_node; /* node in the rb tree */ 81 struct rb_node rb_node; /* node in the rb tree */
77 atomic_t ref; 82 atomic_t ref;
@@ -100,7 +105,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
100 if (!is_register) 105 if (!is_register)
101 return true; 106 return true;
102 107
103 if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) 108 if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
109 == (VM_READ|VM_EXEC))
104 return true; 110 return true;
105 111
106 return false; 112 return false;
@@ -129,33 +135,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
129static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) 135static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
130{ 136{
131 struct mm_struct *mm = vma->vm_mm; 137 struct mm_struct *mm = vma->vm_mm;
132 pgd_t *pgd;
133 pud_t *pud;
134 pmd_t *pmd;
135 pte_t *ptep;
136 spinlock_t *ptl;
137 unsigned long addr; 138 unsigned long addr;
138 int err = -EFAULT; 139 spinlock_t *ptl;
140 pte_t *ptep;
139 141
140 addr = page_address_in_vma(page, vma); 142 addr = page_address_in_vma(page, vma);
141 if (addr == -EFAULT) 143 if (addr == -EFAULT)
142 goto out; 144 return -EFAULT;
143
144 pgd = pgd_offset(mm, addr);
145 if (!pgd_present(*pgd))
146 goto out;
147
148 pud = pud_offset(pgd, addr);
149 if (!pud_present(*pud))
150 goto out;
151
152 pmd = pmd_offset(pud, addr);
153 if (!pmd_present(*pmd))
154 goto out;
155 145
156 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 146 ptep = page_check_address(page, mm, addr, &ptl, 0);
157 if (!ptep) 147 if (!ptep)
158 goto out; 148 return -EAGAIN;
159 149
160 get_page(kpage); 150 get_page(kpage);
161 page_add_new_anon_rmap(kpage, vma, addr); 151 page_add_new_anon_rmap(kpage, vma, addr);
@@ -174,10 +164,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
174 try_to_free_swap(page); 164 try_to_free_swap(page);
175 put_page(page); 165 put_page(page);
176 pte_unmap_unlock(ptep, ptl); 166 pte_unmap_unlock(ptep, ptl);
177 err = 0;
178 167
179out: 168 return 0;
180 return err;
181} 169}
182 170
183/** 171/**
@@ -222,9 +210,8 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
222 void *vaddr_old, *vaddr_new; 210 void *vaddr_old, *vaddr_new;
223 struct vm_area_struct *vma; 211 struct vm_area_struct *vma;
224 struct uprobe *uprobe; 212 struct uprobe *uprobe;
225 loff_t addr;
226 int ret; 213 int ret;
227 214retry:
228 /* Read the page with vaddr into memory */ 215 /* Read the page with vaddr into memory */
229 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 216 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
230 if (ret <= 0) 217 if (ret <= 0)
@@ -246,10 +233,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
246 if (mapping != vma->vm_file->f_mapping) 233 if (mapping != vma->vm_file->f_mapping)
247 goto put_out; 234 goto put_out;
248 235
249 addr = vma_address(vma, uprobe->offset);
250 if (vaddr != (unsigned long)addr)
251 goto put_out;
252
253 ret = -ENOMEM; 236 ret = -ENOMEM;
254 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 237 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
255 if (!new_page) 238 if (!new_page)
@@ -267,11 +250,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
267 vaddr_new = kmap_atomic(new_page); 250 vaddr_new = kmap_atomic(new_page);
268 251
269 memcpy(vaddr_new, vaddr_old, PAGE_SIZE); 252 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
270 253 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
271 /* poke the new insn in, ASSUMES we don't cross page boundary */
272 vaddr &= ~PAGE_MASK;
273 BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
274 memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
275 254
276 kunmap_atomic(vaddr_new); 255 kunmap_atomic(vaddr_new);
277 kunmap_atomic(vaddr_old); 256 kunmap_atomic(vaddr_old);
@@ -291,6 +270,8 @@ unlock_out:
291put_out: 270put_out:
292 put_page(old_page); 271 put_page(old_page);
293 272
273 if (unlikely(ret == -EAGAIN))
274 goto retry;
294 return ret; 275 return ret;
295} 276}
296 277
@@ -312,7 +293,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
312 void *vaddr_new; 293 void *vaddr_new;
313 int ret; 294 int ret;
314 295
315 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); 296 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
316 if (ret <= 0) 297 if (ret <= 0)
317 return ret; 298 return ret;
318 299
@@ -333,10 +314,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
333 uprobe_opcode_t opcode; 314 uprobe_opcode_t opcode;
334 int result; 315 int result;
335 316
317 if (current->mm == mm) {
318 pagefault_disable();
319 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
320 sizeof(opcode));
321 pagefault_enable();
322
323 if (likely(result == 0))
324 goto out;
325 }
326
336 result = read_opcode(mm, vaddr, &opcode); 327 result = read_opcode(mm, vaddr, &opcode);
337 if (result) 328 if (result)
338 return result; 329 return result;
339 330out:
340 if (is_swbp_insn(&opcode)) 331 if (is_swbp_insn(&opcode))
341 return 1; 332 return 1;
342 333
@@ -355,7 +346,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
355int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 346int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
356{ 347{
357 int result; 348 int result;
358 349 /*
350 * See the comment near uprobes_hash().
351 */
359 result = is_swbp_at_addr(mm, vaddr); 352 result = is_swbp_at_addr(mm, vaddr);
360 if (result == 1) 353 if (result == 1)
361 return -EEXIST; 354 return -EEXIST;
@@ -520,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
520 uprobe->inode = igrab(inode); 513 uprobe->inode = igrab(inode);
521 uprobe->offset = offset; 514 uprobe->offset = offset;
522 init_rwsem(&uprobe->consumer_rwsem); 515 init_rwsem(&uprobe->consumer_rwsem);
523 INIT_LIST_HEAD(&uprobe->pending_list);
524 516
525 /* add to uprobes_tree, sorted on inode:offset */ 517 /* add to uprobes_tree, sorted on inode:offset */
526 cur_uprobe = insert_uprobe(uprobe); 518 cur_uprobe = insert_uprobe(uprobe);
@@ -588,20 +580,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
588} 580}
589 581
590static int 582static int
591__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, 583__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
592 unsigned long nbytes, unsigned long offset) 584 unsigned long nbytes, loff_t offset)
593{ 585{
594 struct file *filp = vma->vm_file;
595 struct page *page; 586 struct page *page;
596 void *vaddr; 587 void *vaddr;
597 unsigned long off1; 588 unsigned long off;
598 unsigned long idx; 589 pgoff_t idx;
599 590
600 if (!filp) 591 if (!filp)
601 return -EINVAL; 592 return -EINVAL;
602 593
603 idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); 594 if (!mapping->a_ops->readpage)
604 off1 = offset &= ~PAGE_MASK; 595 return -EIO;
596
597 idx = offset >> PAGE_CACHE_SHIFT;
598 off = offset & ~PAGE_MASK;
605 599
606 /* 600 /*
607 * Ensure that the page that has the original instruction is 601 * Ensure that the page that has the original instruction is
@@ -612,22 +606,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins
612 return PTR_ERR(page); 606 return PTR_ERR(page);
613 607
614 vaddr = kmap_atomic(page); 608 vaddr = kmap_atomic(page);
615 memcpy(insn, vaddr + off1, nbytes); 609 memcpy(insn, vaddr + off, nbytes);
616 kunmap_atomic(vaddr); 610 kunmap_atomic(vaddr);
617 page_cache_release(page); 611 page_cache_release(page);
618 612
619 return 0; 613 return 0;
620} 614}
621 615
622static int 616static int copy_insn(struct uprobe *uprobe, struct file *filp)
623copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
624{ 617{
625 struct address_space *mapping; 618 struct address_space *mapping;
626 unsigned long nbytes; 619 unsigned long nbytes;
627 int bytes; 620 int bytes;
628 621
629 addr &= ~PAGE_MASK; 622 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
630 nbytes = PAGE_SIZE - addr;
631 mapping = uprobe->inode->i_mapping; 623 mapping = uprobe->inode->i_mapping;
632 624
633 /* Instruction at end of binary; copy only available bytes */ 625 /* Instruction at end of binary; copy only available bytes */
@@ -638,13 +630,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
638 630
639 /* Instruction at the page-boundary; copy bytes in second page */ 631 /* Instruction at the page-boundary; copy bytes in second page */
640 if (nbytes < bytes) { 632 if (nbytes < bytes) {
641 if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, 633 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
642 bytes - nbytes, uprobe->offset + nbytes)) 634 bytes - nbytes, uprobe->offset + nbytes);
643 return -ENOMEM; 635 if (err)
644 636 return err;
645 bytes = nbytes; 637 bytes = nbytes;
646 } 638 }
647 return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); 639 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
648} 640}
649 641
650/* 642/*
@@ -672,9 +664,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
672 */ 664 */
673static int 665static int
674install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, 666install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
675 struct vm_area_struct *vma, loff_t vaddr) 667 struct vm_area_struct *vma, unsigned long vaddr)
676{ 668{
677 unsigned long addr;
678 int ret; 669 int ret;
679 670
680 /* 671 /*
@@ -687,20 +678,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
687 if (!uprobe->consumers) 678 if (!uprobe->consumers)
688 return -EEXIST; 679 return -EEXIST;
689 680
690 addr = (unsigned long)vaddr;
691
692 if (!(uprobe->flags & UPROBE_COPY_INSN)) { 681 if (!(uprobe->flags & UPROBE_COPY_INSN)) {
693 ret = copy_insn(uprobe, vma, addr); 682 ret = copy_insn(uprobe, vma->vm_file);
694 if (ret) 683 if (ret)
695 return ret; 684 return ret;
696 685
697 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 686 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
698 return -EEXIST; 687 return -ENOTSUPP;
699 688
700 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); 689 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
701 if (ret) 690 if (ret)
702 return ret; 691 return ret;
703 692
693 /* write_opcode() assumes we don't cross page boundary */
694 BUG_ON((uprobe->offset & ~PAGE_MASK) +
695 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
696
704 uprobe->flags |= UPROBE_COPY_INSN; 697 uprobe->flags |= UPROBE_COPY_INSN;
705 } 698 }
706 699
@@ -713,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
713 * Hence increment before and decrement on failure. 706 * Hence increment before and decrement on failure.
714 */ 707 */
715 atomic_inc(&mm->uprobes_state.count); 708 atomic_inc(&mm->uprobes_state.count);
716 ret = set_swbp(&uprobe->arch, mm, addr); 709 ret = set_swbp(&uprobe->arch, mm, vaddr);
717 if (ret) 710 if (ret)
718 atomic_dec(&mm->uprobes_state.count); 711 atomic_dec(&mm->uprobes_state.count);
719 712
@@ -721,27 +714,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
721} 714}
722 715
723static void 716static void
724remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) 717remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
725{ 718{
726 if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) 719 if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
727 atomic_dec(&mm->uprobes_state.count); 720 atomic_dec(&mm->uprobes_state.count);
728} 721}
729 722
730/* 723/*
731 * There could be threads that have hit the breakpoint and are entering the 724 * There could be threads that have already hit the breakpoint. They
732 * notifier code and trying to acquire the uprobes_treelock. The thread 725 * will recheck the current insn and restart if find_uprobe() fails.
733 * calling delete_uprobe() that is removing the uprobe from the rb_tree can 726 * See find_active_uprobe().
734 * race with these threads and might acquire the uprobes_treelock compared
735 * to some of the breakpoint hit threads. In such a case, the breakpoint
736 * hit threads will not find the uprobe. The current unregistering thread
737 * waits till all other threads have hit a breakpoint, to acquire the
738 * uprobes_treelock before the uprobe is removed from the rbtree.
739 */ 727 */
740static void delete_uprobe(struct uprobe *uprobe) 728static void delete_uprobe(struct uprobe *uprobe)
741{ 729{
742 unsigned long flags; 730 unsigned long flags;
743 731
744 synchronize_srcu(&uprobes_srcu);
745 spin_lock_irqsave(&uprobes_treelock, flags); 732 spin_lock_irqsave(&uprobes_treelock, flags);
746 rb_erase(&uprobe->rb_node, &uprobes_tree); 733 rb_erase(&uprobe->rb_node, &uprobes_tree);
747 spin_unlock_irqrestore(&uprobes_treelock, flags); 734 spin_unlock_irqrestore(&uprobes_treelock, flags);
@@ -750,139 +737,135 @@ static void delete_uprobe(struct uprobe *uprobe)
750 atomic_dec(&uprobe_events); 737 atomic_dec(&uprobe_events);
751} 738}
752 739
753static struct vma_info * 740struct map_info {
754__find_next_vma_info(struct address_space *mapping, struct list_head *head, 741 struct map_info *next;
755 struct vma_info *vi, loff_t offset, bool is_register) 742 struct mm_struct *mm;
743 unsigned long vaddr;
744};
745
746static inline struct map_info *free_map_info(struct map_info *info)
747{
748 struct map_info *next = info->next;
749 kfree(info);
750 return next;
751}
752
753static struct map_info *
754build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
756{ 755{
756 unsigned long pgoff = offset >> PAGE_SHIFT;
757 struct prio_tree_iter iter; 757 struct prio_tree_iter iter;
758 struct vm_area_struct *vma; 758 struct vm_area_struct *vma;
759 struct vma_info *tmpvi; 759 struct map_info *curr = NULL;
760 unsigned long pgoff; 760 struct map_info *prev = NULL;
761 int existing_vma; 761 struct map_info *info;
762 loff_t vaddr; 762 int more = 0;
763
764 pgoff = offset >> PAGE_SHIFT;
765 763
764 again:
765 mutex_lock(&mapping->i_mmap_mutex);
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (!valid_vma(vma, is_register)) 767 if (!valid_vma(vma, is_register))
768 continue; 768 continue;
769 769
770 existing_vma = 0; 770 if (!prev && !more) {
771 vaddr = vma_address(vma, offset); 771 /*
772 772 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
773 list_for_each_entry(tmpvi, head, probe_list) { 773 * reclaim. This is optimistic, no harm done if it fails.
774 if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { 774 */
775 existing_vma = 1; 775 prev = kmalloc(sizeof(struct map_info),
776 break; 776 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
777 } 777 if (prev)
778 prev->next = NULL;
778 } 779 }
779 780 if (!prev) {
780 /* 781 more++;
781 * Another vma needs a probe to be installed. However skip 782 continue;
782 * installing the probe if the vma is about to be unlinked.
783 */
784 if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
785 vi->mm = vma->vm_mm;
786 vi->vaddr = vaddr;
787 list_add(&vi->probe_list, head);
788
789 return vi;
790 } 783 }
791 }
792 784
793 return NULL; 785 if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
794} 786 continue;
795
796/*
797 * Iterate in the rmap prio tree and find a vma where a probe has not
798 * yet been inserted.
799 */
800static struct vma_info *
801find_next_vma_info(struct address_space *mapping, struct list_head *head,
802 loff_t offset, bool is_register)
803{
804 struct vma_info *vi, *retvi;
805 787
806 vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); 788 info = prev;
807 if (!vi) 789 prev = prev->next;
808 return ERR_PTR(-ENOMEM); 790 info->next = curr;
791 curr = info;
809 792
810 mutex_lock(&mapping->i_mmap_mutex); 793 info->mm = vma->vm_mm;
811 retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); 794 info->vaddr = vma_address(vma, offset);
795 }
812 mutex_unlock(&mapping->i_mmap_mutex); 796 mutex_unlock(&mapping->i_mmap_mutex);
813 797
814 if (!retvi) 798 if (!more)
815 kfree(vi); 799 goto out;
800
801 prev = curr;
802 while (curr) {
803 mmput(curr->mm);
804 curr = curr->next;
805 }
816 806
817 return retvi; 807 do {
808 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
809 if (!info) {
810 curr = ERR_PTR(-ENOMEM);
811 goto out;
812 }
813 info->next = prev;
814 prev = info;
815 } while (--more);
816
817 goto again;
818 out:
819 while (prev)
820 prev = free_map_info(prev);
821 return curr;
818} 822}
819 823
820static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 824static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
821{ 825{
822 struct list_head try_list; 826 struct map_info *info;
823 struct vm_area_struct *vma; 827 int err = 0;
824 struct address_space *mapping;
825 struct vma_info *vi, *tmpvi;
826 struct mm_struct *mm;
827 loff_t vaddr;
828 int ret;
829 828
830 mapping = uprobe->inode->i_mapping; 829 info = build_map_info(uprobe->inode->i_mapping,
831 INIT_LIST_HEAD(&try_list); 830 uprobe->offset, is_register);
831 if (IS_ERR(info))
832 return PTR_ERR(info);
832 833
833 ret = 0; 834 while (info) {
835 struct mm_struct *mm = info->mm;
836 struct vm_area_struct *vma;
834 837
835 for (;;) { 838 if (err)
836 vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); 839 goto free;
837 if (!vi)
838 break;
839 840
840 if (IS_ERR(vi)) { 841 down_write(&mm->mmap_sem);
841 ret = PTR_ERR(vi); 842 vma = find_vma(mm, (unsigned long)info->vaddr);
842 break; 843 if (!vma || !valid_vma(vma, is_register))
843 } 844 goto unlock;
844 845
845 mm = vi->mm;
846 down_read(&mm->mmap_sem);
847 vma = find_vma(mm, (unsigned long)vi->vaddr);
848 if (!vma || !valid_vma(vma, is_register)) {
849 list_del(&vi->probe_list);
850 kfree(vi);
851 up_read(&mm->mmap_sem);
852 mmput(mm);
853 continue;
854 }
855 vaddr = vma_address(vma, uprobe->offset);
856 if (vma->vm_file->f_mapping->host != uprobe->inode || 846 if (vma->vm_file->f_mapping->host != uprobe->inode ||
857 vaddr != vi->vaddr) { 847 vma_address(vma, uprobe->offset) != info->vaddr)
858 list_del(&vi->probe_list); 848 goto unlock;
859 kfree(vi);
860 up_read(&mm->mmap_sem);
861 mmput(mm);
862 continue;
863 }
864
865 if (is_register)
866 ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
867 else
868 remove_breakpoint(uprobe, mm, vi->vaddr);
869 849
870 up_read(&mm->mmap_sem);
871 mmput(mm);
872 if (is_register) { 850 if (is_register) {
873 if (ret && ret == -EEXIST) 851 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
874 ret = 0; 852 /*
875 if (ret) 853 * We can race against uprobe_mmap(), see the
876 break; 854 * comment near uprobe_hash().
855 */
856 if (err == -EEXIST)
857 err = 0;
858 } else {
859 remove_breakpoint(uprobe, mm, info->vaddr);
877 } 860 }
861 unlock:
862 up_write(&mm->mmap_sem);
863 free:
864 mmput(mm);
865 info = free_map_info(info);
878 } 866 }
879 867
880 list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { 868 return err;
881 list_del(&vi->probe_list);
882 kfree(vi);
883 }
884
885 return ret;
886} 869}
887 870
888static int __uprobe_register(struct uprobe *uprobe) 871static int __uprobe_register(struct uprobe *uprobe)
@@ -1048,7 +1031,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
1048int uprobe_mmap(struct vm_area_struct *vma) 1031int uprobe_mmap(struct vm_area_struct *vma)
1049{ 1032{
1050 struct list_head tmp_list; 1033 struct list_head tmp_list;
1051 struct uprobe *uprobe, *u; 1034 struct uprobe *uprobe;
1052 struct inode *inode; 1035 struct inode *inode;
1053 int ret, count; 1036 int ret, count;
1054 1037
@@ -1066,12 +1049,9 @@ int uprobe_mmap(struct vm_area_struct *vma)
1066 ret = 0; 1049 ret = 0;
1067 count = 0; 1050 count = 0;
1068 1051
1069 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1052 list_for_each_entry(uprobe, &tmp_list, pending_list) {
1070 loff_t vaddr;
1071
1072 list_del(&uprobe->pending_list);
1073 if (!ret) { 1053 if (!ret) {
1074 vaddr = vma_address(vma, uprobe->offset); 1054 loff_t vaddr = vma_address(vma, uprobe->offset);
1075 1055
1076 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { 1056 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1077 put_uprobe(uprobe); 1057 put_uprobe(uprobe);
@@ -1079,8 +1059,10 @@ int uprobe_mmap(struct vm_area_struct *vma)
1079 } 1059 }
1080 1060
1081 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1061 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1082 1062 /*
1083 /* Ignore double add: */ 1063 * We can race against uprobe_register(), see the
1064 * comment near uprobe_hash().
1065 */
1084 if (ret == -EEXIST) { 1066 if (ret == -EEXIST) {
1085 ret = 0; 1067 ret = 0;
1086 1068
@@ -1115,7 +1097,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
1115void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1097void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1116{ 1098{
1117 struct list_head tmp_list; 1099 struct list_head tmp_list;
1118 struct uprobe *uprobe, *u; 1100 struct uprobe *uprobe;
1119 struct inode *inode; 1101 struct inode *inode;
1120 1102
1121 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1103 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
@@ -1132,11 +1114,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1132 mutex_lock(uprobes_mmap_hash(inode)); 1114 mutex_lock(uprobes_mmap_hash(inode));
1133 build_probe_list(inode, &tmp_list); 1115 build_probe_list(inode, &tmp_list);
1134 1116
1135 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1117 list_for_each_entry(uprobe, &tmp_list, pending_list) {
1136 loff_t vaddr; 1118 loff_t vaddr = vma_address(vma, uprobe->offset);
1137
1138 list_del(&uprobe->pending_list);
1139 vaddr = vma_address(vma, uprobe->offset);
1140 1119
1141 if (vaddr >= start && vaddr < end) { 1120 if (vaddr >= start && vaddr < end) {
1142 /* 1121 /*
@@ -1378,9 +1357,6 @@ void uprobe_free_utask(struct task_struct *t)
1378{ 1357{
1379 struct uprobe_task *utask = t->utask; 1358 struct uprobe_task *utask = t->utask;
1380 1359
1381 if (t->uprobe_srcu_id != -1)
1382 srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
1383
1384 if (!utask) 1360 if (!utask)
1385 return; 1361 return;
1386 1362
@@ -1398,7 +1374,6 @@ void uprobe_free_utask(struct task_struct *t)
1398void uprobe_copy_process(struct task_struct *t) 1374void uprobe_copy_process(struct task_struct *t)
1399{ 1375{
1400 t->utask = NULL; 1376 t->utask = NULL;
1401 t->uprobe_srcu_id = -1;
1402} 1377}
1403 1378
1404/* 1379/*
@@ -1417,7 +1392,6 @@ static struct uprobe_task *add_utask(void)
1417 if (unlikely(!utask)) 1392 if (unlikely(!utask))
1418 return NULL; 1393 return NULL;
1419 1394
1420 utask->active_uprobe = NULL;
1421 current->utask = utask; 1395 current->utask = utask;
1422 return utask; 1396 return utask;
1423} 1397}
@@ -1479,41 +1453,64 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1479 return false; 1453 return false;
1480} 1454}
1481 1455
1456static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1457{
1458 struct mm_struct *mm = current->mm;
1459 struct uprobe *uprobe = NULL;
1460 struct vm_area_struct *vma;
1461
1462 down_read(&mm->mmap_sem);
1463 vma = find_vma(mm, bp_vaddr);
1464 if (vma && vma->vm_start <= bp_vaddr) {
1465 if (valid_vma(vma, false)) {
1466 struct inode *inode;
1467 loff_t offset;
1468
1469 inode = vma->vm_file->f_mapping->host;
1470 offset = bp_vaddr - vma->vm_start;
1471 offset += (vma->vm_pgoff << PAGE_SHIFT);
1472 uprobe = find_uprobe(inode, offset);
1473 }
1474
1475 if (!uprobe)
1476 *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
1477 } else {
1478 *is_swbp = -EFAULT;
1479 }
1480 up_read(&mm->mmap_sem);
1481
1482 return uprobe;
1483}
1484
1482/* 1485/*
1483 * Run handler and ask thread to singlestep. 1486 * Run handler and ask thread to singlestep.
1484 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1487 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1485 */ 1488 */
1486static void handle_swbp(struct pt_regs *regs) 1489static void handle_swbp(struct pt_regs *regs)
1487{ 1490{
1488 struct vm_area_struct *vma;
1489 struct uprobe_task *utask; 1491 struct uprobe_task *utask;
1490 struct uprobe *uprobe; 1492 struct uprobe *uprobe;
1491 struct mm_struct *mm;
1492 unsigned long bp_vaddr; 1493 unsigned long bp_vaddr;
1494 int uninitialized_var(is_swbp);
1493 1495
1494 uprobe = NULL;
1495 bp_vaddr = uprobe_get_swbp_addr(regs); 1496 bp_vaddr = uprobe_get_swbp_addr(regs);
1496 mm = current->mm; 1497 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1497 down_read(&mm->mmap_sem);
1498 vma = find_vma(mm, bp_vaddr);
1499
1500 if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
1501 struct inode *inode;
1502 loff_t offset;
1503
1504 inode = vma->vm_file->f_mapping->host;
1505 offset = bp_vaddr - vma->vm_start;
1506 offset += (vma->vm_pgoff << PAGE_SHIFT);
1507 uprobe = find_uprobe(inode, offset);
1508 }
1509
1510 srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
1511 current->uprobe_srcu_id = -1;
1512 up_read(&mm->mmap_sem);
1513 1498
1514 if (!uprobe) { 1499 if (!uprobe) {
1515 /* No matching uprobe; signal SIGTRAP. */ 1500 if (is_swbp > 0) {
1516 send_sig(SIGTRAP, current, 0); 1501 /* No matching uprobe; signal SIGTRAP. */
1502 send_sig(SIGTRAP, current, 0);
1503 } else {
1504 /*
1505 * Either we raced with uprobe_unregister() or we can't
1506 * access this memory. The latter is only possible if
1507 * another thread plays with our ->mm. In both cases
1508 * we can simply restart. If this vma was unmapped we
1509 * can pretend this insn was not executed yet and get
1510 * the (correct) SIGSEGV after restart.
1511 */
1512 instruction_pointer_set(regs, bp_vaddr);
1513 }
1517 return; 1514 return;
1518 } 1515 }
1519 1516
@@ -1620,7 +1617,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1620 utask->state = UTASK_BP_HIT; 1617 utask->state = UTASK_BP_HIT;
1621 1618
1622 set_thread_flag(TIF_UPROBE); 1619 set_thread_flag(TIF_UPROBE);
1623 current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
1624 1620
1625 return 1; 1621 return 1;
1626} 1622}
@@ -1655,7 +1651,6 @@ static int __init init_uprobes(void)
1655 mutex_init(&uprobes_mutex[i]); 1651 mutex_init(&uprobes_mutex[i]);
1656 mutex_init(&uprobes_mmap_mutex[i]); 1652 mutex_init(&uprobes_mmap_mutex[i]);
1657 } 1653 }
1658 init_srcu_struct(&uprobes_srcu);
1659 1654
1660 return register_die_notifier(&uprobe_exception_nb); 1655 return register_die_notifier(&uprobe_exception_nb);
1661} 1656}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f59cc334516..d17f6c4ddfa9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,14 +953,11 @@ void do_exit(long code)
953 exit_signals(tsk); /* sets PF_EXITING */ 953 exit_signals(tsk); /* sets PF_EXITING */
954 /* 954 /*
955 * tsk->flags are checked in the futex code to protect against 955 * tsk->flags are checked in the futex code to protect against
956 * an exiting task cleaning up the robust pi futexes, and in 956 * an exiting task cleaning up the robust pi futexes.
957 * task_work_add() to avoid the race with exit_task_work().
958 */ 957 */
959 smp_mb(); 958 smp_mb();
960 raw_spin_unlock_wait(&tsk->pi_lock); 959 raw_spin_unlock_wait(&tsk->pi_lock);
961 960
962 exit_task_work(tsk);
963
964 if (unlikely(in_atomic())) 961 if (unlikely(in_atomic()))
965 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 962 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
966 current->comm, task_pid_nr(current), 963 current->comm, task_pid_nr(current),
@@ -995,6 +992,7 @@ void do_exit(long code)
995 exit_shm(tsk); 992 exit_shm(tsk);
996 exit_files(tsk); 993 exit_files(tsk);
997 exit_fs(tsk); 994 exit_fs(tsk);
995 exit_task_work(tsk);
998 check_stack_usage(); 996 check_stack_usage();
999 exit_thread(); 997 exit_thread();
1000 998
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..ff1cad3b7bdc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
304 } 304 }
305 305
306 err = arch_dup_task_struct(tsk, orig); 306 err = arch_dup_task_struct(tsk, orig);
307 if (err)
308 goto out;
309 307
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
310 tsk->stack = ti; 312 tsk->stack = ti;
311
312 setup_thread_stack(tsk, orig); 313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317
313 clear_user_return_notifier(tsk); 318 clear_user_return_notifier(tsk);
314 clear_tsk_need_resched(tsk); 319 clear_tsk_need_resched(tsk);
315 stackend = end_of_stack(tsk); 320 stackend = end_of_stack(tsk);
@@ -1415,7 +1420,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1415 */ 1420 */
1416 p->group_leader = p; 1421 p->group_leader = p;
1417 INIT_LIST_HEAD(&p->thread_group); 1422 INIT_LIST_HEAD(&p->thread_group);
1418 INIT_HLIST_HEAD(&p->task_works); 1423 p->task_works = NULL;
1419 1424
1420 /* Now that the task is set up, run cgroup callbacks if 1425 /* Now that the task is set up, run cgroup callbacks if
1421 * necessary. We need to run them before the task is visible 1426 * necessary. We need to run them before the task is visible
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
657 return 0; 657 return 0;
658} 658}
659 659
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
661{
662 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
663 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
664
665 return ktime_get_update_offsets(offs_real, offs_boot);
666}
667
660/* 668/*
661 * Retrigger next event is called after clock was set 669 * Retrigger next event is called after clock was set
662 * 670 *
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
665static void retrigger_next_event(void *arg) 673static void retrigger_next_event(void *arg)
666{ 674{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 675 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669 676
670 if (!hrtimer_hres_active()) 677 if (!hrtimer_hres_active())
671 return; 678 return;
672 679
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock); 680 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset = 681 hrtimer_update_base(base);
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0); 682 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock); 683 raw_spin_unlock(&base->lock);
686} 684}
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
710 base->clock_base[i].resolution = KTIME_HIGH_RES; 708 base->clock_base[i].resolution = KTIME_HIGH_RES;
711 709
712 tick_setup_sched_timer(); 710 tick_setup_sched_timer();
713
714 /* "Retrigger" the interrupt to get things going */ 711 /* "Retrigger" the interrupt to get things going */
715 retrigger_next_event(NULL); 712 retrigger_next_event(NULL);
716 local_irq_restore(flags); 713 local_irq_restore(flags);
717 return 1; 714 return 1;
718} 715}
719 716
717/*
718 * Called from timekeeping code to reprogramm the hrtimer interrupt
719 * device. If called from the timer interrupt context we defer it to
720 * softirq context.
721 */
722void clock_was_set_delayed(void)
723{
724 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
725
726 cpu_base->clock_was_set = 1;
727 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
728}
729
720#else 730#else
721 731
722static inline int hrtimer_hres_active(void) { return 0; } 732static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1250 cpu_base->nr_events++; 1260 cpu_base->nr_events++;
1251 dev->next_event.tv64 = KTIME_MAX; 1261 dev->next_event.tv64 = KTIME_MAX;
1252 1262
1253 entry_time = now = ktime_get(); 1263 raw_spin_lock(&cpu_base->lock);
1264 entry_time = now = hrtimer_update_base(cpu_base);
1254retry: 1265retry:
1255 expires_next.tv64 = KTIME_MAX; 1266 expires_next.tv64 = KTIME_MAX;
1256
1257 raw_spin_lock(&cpu_base->lock);
1258 /* 1267 /*
1259 * We set expires_next to KTIME_MAX here with cpu_base->lock 1268 * We set expires_next to KTIME_MAX here with cpu_base->lock
1260 * held to prevent that a timer is enqueued in our queue via 1269 * held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
1330 * We need to prevent that we loop forever in the hrtimer 1339 * We need to prevent that we loop forever in the hrtimer
1331 * interrupt routine. We give it 3 attempts to avoid 1340 * interrupt routine. We give it 3 attempts to avoid
1332 * overreacting on some spurious event. 1341 * overreacting on some spurious event.
1342 *
1343 * Acquire base lock for updating the offsets and retrieving
1344 * the current time.
1333 */ 1345 */
1334 now = ktime_get(); 1346 raw_spin_lock(&cpu_base->lock);
1347 now = hrtimer_update_base(cpu_base);
1335 cpu_base->nr_retries++; 1348 cpu_base->nr_retries++;
1336 if (++retries < 3) 1349 if (++retries < 3)
1337 goto retry; 1350 goto retry;
@@ -1343,6 +1356,7 @@ retry:
1343 */ 1356 */
1344 cpu_base->nr_hangs++; 1357 cpu_base->nr_hangs++;
1345 cpu_base->hang_detected = 1; 1358 cpu_base->hang_detected = 1;
1359 raw_spin_unlock(&cpu_base->lock);
1346 delta = ktime_sub(now, entry_time); 1360 delta = ktime_sub(now, entry_time);
1347 if (delta.tv64 > cpu_base->max_hang_time.tv64) 1361 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1348 cpu_base->max_hang_time = delta; 1362 cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
1395 1409
1396static void run_hrtimer_softirq(struct softirq_action *h) 1410static void run_hrtimer_softirq(struct softirq_action *h)
1397{ 1411{
1412 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1413
1414 if (cpu_base->clock_was_set) {
1415 cpu_base->clock_was_set = 0;
1416 clock_was_set();
1417 }
1418
1398 hrtimer_peek_ahead_timers(); 1419 hrtimer_peek_ahead_timers();
1399} 1420}
1400 1421
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8c548232ba39..814c9ef6bba1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc)
781 wake_up(&desc->wait_for_threads); 781 wake_up(&desc->wait_for_threads);
782} 782}
783 783
784static void irq_thread_dtor(struct task_work *unused) 784static void irq_thread_dtor(struct callback_head *unused)
785{ 785{
786 struct task_struct *tsk = current; 786 struct task_struct *tsk = current;
787 struct irq_desc *desc; 787 struct irq_desc *desc;
@@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused)
813 */ 813 */
814static int irq_thread(void *data) 814static int irq_thread(void *data)
815{ 815{
816 struct task_work on_exit_work; 816 struct callback_head on_exit_work;
817 static const struct sched_param param = { 817 static const struct sched_param param = {
818 .sched_priority = MAX_USER_RT_PRIO/2, 818 .sched_priority = MAX_USER_RT_PRIO/2,
819 }; 819 };
@@ -830,7 +830,7 @@ static int irq_thread(void *data)
830 830
831 sched_setscheduler(current, SCHED_FIFO, &param); 831 sched_setscheduler(current, SCHED_FIFO, &param);
832 832
833 init_task_work(&on_exit_work, irq_thread_dtor, NULL); 833 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 834 task_work_add(current, &on_exit_work, false);
835 835
836 while (!irq_wait_for_interrupt(action)) { 836 while (!irq_wait_for_interrupt(action)) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
360 struct kthread_work, node); 360 struct kthread_work, node);
361 list_del_init(&work->node); 361 list_del_init(&work->node);
362 } 362 }
363 worker->current_work = work;
363 spin_unlock_irq(&worker->lock); 364 spin_unlock_irq(&worker->lock);
364 365
365 if (work) { 366 if (work) {
366 __set_current_state(TASK_RUNNING); 367 __set_current_state(TASK_RUNNING);
367 work->func(work); 368 work->func(work);
368 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
369 work->done_seq = work->queue_seq;
370 smp_mb(); /* mb worker-b1 paired with flush-b0 */
371 if (atomic_read(&work->flushing))
372 wake_up_all(&work->done);
373 } else if (!freezing(current)) 369 } else if (!freezing(current))
374 schedule(); 370 schedule();
375 371
@@ -378,6 +374,19 @@ repeat:
378} 374}
379EXPORT_SYMBOL_GPL(kthread_worker_fn); 375EXPORT_SYMBOL_GPL(kthread_worker_fn);
380 376
377/* insert @work before @pos in @worker */
378static void insert_kthread_work(struct kthread_worker *worker,
379 struct kthread_work *work,
380 struct list_head *pos)
381{
382 lockdep_assert_held(&worker->lock);
383
384 list_add_tail(&work->node, pos);
385 work->worker = worker;
386 if (likely(worker->task))
387 wake_up_process(worker->task);
388}
389
381/** 390/**
382 * queue_kthread_work - queue a kthread_work 391 * queue_kthread_work - queue a kthread_work
383 * @worker: target kthread_worker 392 * @worker: target kthread_worker
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
395 404
396 spin_lock_irqsave(&worker->lock, flags); 405 spin_lock_irqsave(&worker->lock, flags);
397 if (list_empty(&work->node)) { 406 if (list_empty(&work->node)) {
398 list_add_tail(&work->node, &worker->work_list); 407 insert_kthread_work(worker, work, &worker->work_list);
399 work->queue_seq++;
400 if (likely(worker->task))
401 wake_up_process(worker->task);
402 ret = true; 408 ret = true;
403 } 409 }
404 spin_unlock_irqrestore(&worker->lock, flags); 410 spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
406} 412}
407EXPORT_SYMBOL_GPL(queue_kthread_work); 413EXPORT_SYMBOL_GPL(queue_kthread_work);
408 414
415struct kthread_flush_work {
416 struct kthread_work work;
417 struct completion done;
418};
419
420static void kthread_flush_work_fn(struct kthread_work *work)
421{
422 struct kthread_flush_work *fwork =
423 container_of(work, struct kthread_flush_work, work);
424 complete(&fwork->done);
425}
426
409/** 427/**
410 * flush_kthread_work - flush a kthread_work 428 * flush_kthread_work - flush a kthread_work
411 * @work: work to flush 429 * @work: work to flush
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
414 */ 432 */
415void flush_kthread_work(struct kthread_work *work) 433void flush_kthread_work(struct kthread_work *work)
416{ 434{
417 int seq = work->queue_seq; 435 struct kthread_flush_work fwork = {
418 436 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
419 atomic_inc(&work->flushing); 437 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
438 };
439 struct kthread_worker *worker;
440 bool noop = false;
420 441
421 /* 442retry:
422 * mb flush-b0 paired with worker-b1, to make sure either 443 worker = work->worker;
423 * worker sees the above increment or we see done_seq update. 444 if (!worker)
424 */ 445 return;
425 smp_mb__after_atomic_inc();
426 446
427 /* A - B <= 0 tests whether B is in front of A regardless of overflow */ 447 spin_lock_irq(&worker->lock);
428 wait_event(work->done, seq - work->done_seq <= 0); 448 if (work->worker != worker) {
429 atomic_dec(&work->flushing); 449 spin_unlock_irq(&worker->lock);
450 goto retry;
451 }
430 452
431 /* 453 if (!list_empty(&work->node))
432 * rmb flush-b1 paired with worker-b0, to make sure our caller 454 insert_kthread_work(worker, &fwork.work, work->node.next);
433 * sees every change made by work->func(). 455 else if (worker->current_work == work)
434 */ 456 insert_kthread_work(worker, &fwork.work, worker->work_list.next);
435 smp_mb__after_atomic_dec(); 457 else
436} 458 noop = true;
437EXPORT_SYMBOL_GPL(flush_kthread_work);
438 459
439struct kthread_flush_work { 460 spin_unlock_irq(&worker->lock);
440 struct kthread_work work;
441 struct completion done;
442};
443 461
444static void kthread_flush_work_fn(struct kthread_work *work) 462 if (!noop)
445{ 463 wait_for_completion(&fwork.done);
446 struct kthread_flush_work *fwork =
447 container_of(work, struct kthread_flush_work, work);
448 complete(&fwork->done);
449} 464}
465EXPORT_SYMBOL_GPL(flush_kthread_work);
450 466
451/** 467/**
452 * flush_kthread_worker - flush all current works on a kthread_worker 468 * flush_kthread_worker - flush all current works on a kthread_worker
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8f9b4eb974e0..a70518c9d82f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -175,7 +175,7 @@ config PM_TEST_SUSPEND
175 You probably want to have your system's RTC driver statically 175 You probably want to have your system's RTC driver statically
176 linked, ensuring that it's available when this test runs. 176 linked, ensuring that it's available when this test runs.
177 177
178config CAN_PM_TRACE 178config PM_SLEEP_DEBUG
179 def_bool y 179 def_bool y
180 depends on PM_DEBUG && PM_SLEEP 180 depends on PM_DEBUG && PM_SLEEP
181 181
@@ -196,7 +196,7 @@ config PM_TRACE
196 196
197config PM_TRACE_RTC 197config PM_TRACE_RTC
198 bool "Suspend/resume event tracing" 198 bool "Suspend/resume event tracing"
199 depends on CAN_PM_TRACE 199 depends on PM_SLEEP_DEBUG
200 depends on X86 200 depends on X86
201 select PM_TRACE 201 select PM_TRACE
202 ---help--- 202 ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..b26f5f1e773e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
8 * 9 *
9 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
10 */ 11 */
@@ -27,7 +28,6 @@
27#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
29#include <linux/genhd.h> 30#include <linux/genhd.h>
30#include <scsi/scsi_scan.h>
31 31
32#include "power.h" 32#include "power.h"
33 33
@@ -46,6 +46,9 @@ enum {
46 HIBERNATION_PLATFORM, 46 HIBERNATION_PLATFORM,
47 HIBERNATION_SHUTDOWN, 47 HIBERNATION_SHUTDOWN,
48 HIBERNATION_REBOOT, 48 HIBERNATION_REBOOT,
49#ifdef CONFIG_SUSPEND
50 HIBERNATION_SUSPEND,
51#endif
49 /* keep last */ 52 /* keep last */
50 __HIBERNATION_AFTER_LAST 53 __HIBERNATION_AFTER_LAST
51}; 54};
@@ -354,6 +357,7 @@ int hibernation_snapshot(int platform_mode)
354 } 357 }
355 358
356 suspend_console(); 359 suspend_console();
360 ftrace_stop();
357 pm_restrict_gfp_mask(); 361 pm_restrict_gfp_mask();
358 362
359 error = dpm_suspend(PMSG_FREEZE); 363 error = dpm_suspend(PMSG_FREEZE);
@@ -379,6 +383,7 @@ int hibernation_snapshot(int platform_mode)
379 if (error || !in_suspend) 383 if (error || !in_suspend)
380 pm_restore_gfp_mask(); 384 pm_restore_gfp_mask();
381 385
386 ftrace_start();
382 resume_console(); 387 resume_console();
383 dpm_complete(msg); 388 dpm_complete(msg);
384 389
@@ -481,6 +486,7 @@ int hibernation_restore(int platform_mode)
481 486
482 pm_prepare_console(); 487 pm_prepare_console();
483 suspend_console(); 488 suspend_console();
489 ftrace_stop();
484 pm_restrict_gfp_mask(); 490 pm_restrict_gfp_mask();
485 error = dpm_suspend_start(PMSG_QUIESCE); 491 error = dpm_suspend_start(PMSG_QUIESCE);
486 if (!error) { 492 if (!error) {
@@ -488,6 +494,7 @@ int hibernation_restore(int platform_mode)
488 dpm_resume_end(PMSG_RECOVER); 494 dpm_resume_end(PMSG_RECOVER);
489 } 495 }
490 pm_restore_gfp_mask(); 496 pm_restore_gfp_mask();
497 ftrace_start();
491 resume_console(); 498 resume_console();
492 pm_restore_console(); 499 pm_restore_console();
493 return error; 500 return error;
@@ -514,6 +521,7 @@ int hibernation_platform_enter(void)
514 521
515 entering_platform_hibernation = true; 522 entering_platform_hibernation = true;
516 suspend_console(); 523 suspend_console();
524 ftrace_stop();
517 error = dpm_suspend_start(PMSG_HIBERNATE); 525 error = dpm_suspend_start(PMSG_HIBERNATE);
518 if (error) { 526 if (error) {
519 if (hibernation_ops->recover) 527 if (hibernation_ops->recover)
@@ -557,6 +565,7 @@ int hibernation_platform_enter(void)
557 Resume_devices: 565 Resume_devices:
558 entering_platform_hibernation = false; 566 entering_platform_hibernation = false;
559 dpm_resume_end(PMSG_RESTORE); 567 dpm_resume_end(PMSG_RESTORE);
568 ftrace_start();
560 resume_console(); 569 resume_console();
561 570
562 Close: 571 Close:
@@ -574,6 +583,10 @@ int hibernation_platform_enter(void)
574 */ 583 */
575static void power_down(void) 584static void power_down(void)
576{ 585{
586#ifdef CONFIG_SUSPEND
587 int error;
588#endif
589
577 switch (hibernation_mode) { 590 switch (hibernation_mode) {
578 case HIBERNATION_REBOOT: 591 case HIBERNATION_REBOOT:
579 kernel_restart(NULL); 592 kernel_restart(NULL);
@@ -583,6 +596,25 @@ static void power_down(void)
583 case HIBERNATION_SHUTDOWN: 596 case HIBERNATION_SHUTDOWN:
584 kernel_power_off(); 597 kernel_power_off();
585 break; 598 break;
599#ifdef CONFIG_SUSPEND
600 case HIBERNATION_SUSPEND:
601 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
602 if (error) {
603 if (hibernation_ops)
604 hibernation_mode = HIBERNATION_PLATFORM;
605 else
606 hibernation_mode = HIBERNATION_SHUTDOWN;
607 power_down();
608 }
609 /*
610 * Restore swap signature.
611 */
612 error = swsusp_unmark();
613 if (error)
614 printk(KERN_ERR "PM: Swap will be unusable! "
615 "Try swapon -a.\n");
616 return;
617#endif
586 } 618 }
587 kernel_halt(); 619 kernel_halt();
588 /* 620 /*
@@ -748,13 +780,6 @@ static int software_resume(void)
748 async_synchronize_full(); 780 async_synchronize_full();
749 } 781 }
750 782
751 /*
752 * We can't depend on SCSI devices being available after loading
753 * one of their modules until scsi_complete_async_scans() is
754 * called and the resume device usually is a SCSI one.
755 */
756 scsi_complete_async_scans();
757
758 swsusp_resume_device = name_to_dev_t(resume_file); 783 swsusp_resume_device = name_to_dev_t(resume_file);
759 if (!swsusp_resume_device) { 784 if (!swsusp_resume_device) {
760 error = -ENODEV; 785 error = -ENODEV;
@@ -827,6 +852,9 @@ static const char * const hibernation_modes[] = {
827 [HIBERNATION_PLATFORM] = "platform", 852 [HIBERNATION_PLATFORM] = "platform",
828 [HIBERNATION_SHUTDOWN] = "shutdown", 853 [HIBERNATION_SHUTDOWN] = "shutdown",
829 [HIBERNATION_REBOOT] = "reboot", 854 [HIBERNATION_REBOOT] = "reboot",
855#ifdef CONFIG_SUSPEND
856 [HIBERNATION_SUSPEND] = "suspend",
857#endif
830}; 858};
831 859
832/* 860/*
@@ -867,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
867 switch (i) { 895 switch (i) {
868 case HIBERNATION_SHUTDOWN: 896 case HIBERNATION_SHUTDOWN:
869 case HIBERNATION_REBOOT: 897 case HIBERNATION_REBOOT:
898#ifdef CONFIG_SUSPEND
899 case HIBERNATION_SUSPEND:
900#endif
870 break; 901 break;
871 case HIBERNATION_PLATFORM: 902 case HIBERNATION_PLATFORM:
872 if (hibernation_ops) 903 if (hibernation_ops)
@@ -907,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
907 switch (mode) { 938 switch (mode) {
908 case HIBERNATION_SHUTDOWN: 939 case HIBERNATION_SHUTDOWN:
909 case HIBERNATION_REBOOT: 940 case HIBERNATION_REBOOT:
941#ifdef CONFIG_SUSPEND
942 case HIBERNATION_SUSPEND:
943#endif
910 hibernation_mode = mode; 944 hibernation_mode = mode;
911 break; 945 break;
912 case HIBERNATION_PLATFORM: 946 case HIBERNATION_PLATFORM:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 428f8a034e96..f458238109cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init);
235 235
236#endif /* CONFIG_PM_SLEEP */ 236#endif /* CONFIG_PM_SLEEP */
237 237
238#ifdef CONFIG_PM_SLEEP_DEBUG
239/*
240 * pm_print_times: print time taken by devices to suspend and resume.
241 *
242 * show() returns whether printing of suspend and resume times is enabled.
243 * store() accepts 0 or 1. 0 disables printing and 1 enables it.
244 */
245bool pm_print_times_enabled;
246
247static ssize_t pm_print_times_show(struct kobject *kobj,
248 struct kobj_attribute *attr, char *buf)
249{
250 return sprintf(buf, "%d\n", pm_print_times_enabled);
251}
252
253static ssize_t pm_print_times_store(struct kobject *kobj,
254 struct kobj_attribute *attr,
255 const char *buf, size_t n)
256{
257 unsigned long val;
258
259 if (kstrtoul(buf, 10, &val))
260 return -EINVAL;
261
262 if (val > 1)
263 return -EINVAL;
264
265 pm_print_times_enabled = !!val;
266 return n;
267}
268
269power_attr(pm_print_times);
270
271static inline void pm_print_times_init(void)
272{
273 pm_print_times_enabled = !!initcall_debug;
274}
275#else /* !CONFIG_PP_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */
278
238struct kobject *power_kobj; 279struct kobject *power_kobj;
239 280
240/** 281/**
@@ -531,6 +572,9 @@ static struct attribute * g[] = {
531#ifdef CONFIG_PM_DEBUG 572#ifdef CONFIG_PM_DEBUG
532 &pm_test_attr.attr, 573 &pm_test_attr.attr,
533#endif 574#endif
575#ifdef CONFIG_PM_SLEEP_DEBUG
576 &pm_print_times_attr.attr,
577#endif
534#endif 578#endif
535 NULL, 579 NULL,
536}; 580};
@@ -566,6 +610,7 @@ static int __init pm_init(void)
566 error = sysfs_create_group(power_kobj, &attr_group); 610 error = sysfs_create_group(power_kobj, &attr_group);
567 if (error) 611 if (error)
568 return error; 612 return error;
613 pm_print_times_init();
569 return pm_autosleep_init(); 614 return pm_autosleep_init();
570} 615}
571 616
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b0bd4beaebfe..7d4b7ffb3c1d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -156,6 +156,9 @@ extern void swsusp_free(void);
156extern int swsusp_read(unsigned int *flags_p); 156extern int swsusp_read(unsigned int *flags_p);
157extern int swsusp_write(unsigned int flags); 157extern int swsusp_write(unsigned int flags);
158extern void swsusp_close(fmode_t); 158extern void swsusp_close(fmode_t);
159#ifdef CONFIG_SUSPEND
160extern int swsusp_unmark(void);
161#endif
159 162
160/* kernel/power/block_io.c */ 163/* kernel/power/block_io.c */
161extern struct block_device *hib_resume_bdev; 164extern struct block_device *hib_resume_bdev;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 396d262b8fd0..c8b7446b27df 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -24,6 +24,7 @@
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/suspend.h> 25#include <linux/suspend.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <linux/ftrace.h>
27#include <trace/events/power.h> 28#include <trace/events/power.h>
28 29
29#include "power.h" 30#include "power.h"
@@ -212,6 +213,7 @@ int suspend_devices_and_enter(suspend_state_t state)
212 goto Close; 213 goto Close;
213 } 214 }
214 suspend_console(); 215 suspend_console();
216 ftrace_stop();
215 suspend_test_start(); 217 suspend_test_start();
216 error = dpm_suspend_start(PMSG_SUSPEND); 218 error = dpm_suspend_start(PMSG_SUSPEND);
217 if (error) { 219 if (error) {
@@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state)
231 suspend_test_start(); 233 suspend_test_start();
232 dpm_resume_end(PMSG_RESUME); 234 dpm_resume_end(PMSG_RESUME);
233 suspend_test_finish("resume devices"); 235 suspend_test_finish("resume devices");
236 ftrace_start();
234 resume_console(); 237 resume_console();
235 Close: 238 Close:
236 if (suspend_ops->end) 239 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11e22c068e8b..3c9d764eb0d8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle,
448 struct timeval start; 448 struct timeval start;
449 struct timeval stop; 449 struct timeval stop;
450 450
451 printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", 451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 452 nr_to_write);
453 m = nr_to_write / 100; 453 m = nr_to_write / 10;
454 if (!m) 454 if (!m)
455 m = 1; 455 m = 1;
456 nr_pages = 0; 456 nr_pages = 0;
@@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle,
464 if (ret) 464 if (ret)
465 break; 465 break;
466 if (!(nr_pages % m)) 466 if (!(nr_pages % m))
467 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 467 printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
468 nr_pages / m * 10);
468 nr_pages++; 469 nr_pages++;
469 } 470 }
470 err2 = hib_wait_on_bio_chain(&bio); 471 err2 = hib_wait_on_bio_chain(&bio);
@@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle,
472 if (!ret) 473 if (!ret)
473 ret = err2; 474 ret = err2;
474 if (!ret) 475 if (!ret)
475 printk(KERN_CONT "\b\b\b\bdone\n"); 476 printk(KERN_INFO "PM: Image saving done.\n");
476 else
477 printk(KERN_CONT "\n");
478 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
479 return ret; 478 return ret;
480} 479}
@@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
668 667
669 printk(KERN_INFO 668 printk(KERN_INFO
670 "PM: Using %u thread(s) for compression.\n" 669 "PM: Using %u thread(s) for compression.\n"
671 "PM: Compressing and saving image data (%u pages) ... ", 670 "PM: Compressing and saving image data (%u pages)...\n",
672 nr_threads, nr_to_write); 671 nr_threads, nr_to_write);
673 m = nr_to_write / 100; 672 m = nr_to_write / 10;
674 if (!m) 673 if (!m)
675 m = 1; 674 m = 1;
676 nr_pages = 0; 675 nr_pages = 0;
@@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle,
690 data_of(*snapshot), PAGE_SIZE); 689 data_of(*snapshot), PAGE_SIZE);
691 690
692 if (!(nr_pages % m)) 691 if (!(nr_pages % m))
693 printk(KERN_CONT "\b\b\b\b%3d%%", 692 printk(KERN_INFO
694 nr_pages / m); 693 "PM: Image saving progress: "
694 "%3d%%\n",
695 nr_pages / m * 10);
695 nr_pages++; 696 nr_pages++;
696 } 697 }
697 if (!off) 698 if (!off)
@@ -761,11 +762,8 @@ out_finish:
761 do_gettimeofday(&stop); 762 do_gettimeofday(&stop);
762 if (!ret) 763 if (!ret)
763 ret = err2; 764 ret = err2;
764 if (!ret) { 765 if (!ret)
765 printk(KERN_CONT "\b\b\b\bdone\n"); 766 printk(KERN_INFO "PM: Image saving done.\n");
766 } else {
767 printk(KERN_CONT "\n");
768 }
769 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
770out_clean: 768out_clean:
771 if (crc) { 769 if (crc) {
@@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle,
973 int err2; 971 int err2;
974 unsigned nr_pages; 972 unsigned nr_pages;
975 973
976 printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", 974 printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
977 nr_to_read); 975 nr_to_read);
978 m = nr_to_read / 100; 976 m = nr_to_read / 10;
979 if (!m) 977 if (!m)
980 m = 1; 978 m = 1;
981 nr_pages = 0; 979 nr_pages = 0;
@@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle,
993 if (ret) 991 if (ret)
994 break; 992 break;
995 if (!(nr_pages % m)) 993 if (!(nr_pages % m))
996 printk("\b\b\b\b%3d%%", nr_pages / m); 994 printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
995 nr_pages / m * 10);
997 nr_pages++; 996 nr_pages++;
998 } 997 }
999 err2 = hib_wait_on_bio_chain(&bio); 998 err2 = hib_wait_on_bio_chain(&bio);
@@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle,
1001 if (!ret) 1000 if (!ret)
1002 ret = err2; 1001 ret = err2;
1003 if (!ret) { 1002 if (!ret) {
1004 printk("\b\b\b\bdone\n"); 1003 printk(KERN_INFO "PM: Image loading done.\n");
1005 snapshot_write_finalize(snapshot); 1004 snapshot_write_finalize(snapshot);
1006 if (!snapshot_image_loaded(snapshot)) 1005 if (!snapshot_image_loaded(snapshot))
1007 ret = -ENODATA; 1006 ret = -ENODATA;
1008 } else 1007 }
1009 printk("\n");
1010 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1011 return ret; 1009 return ret;
1012} 1010}
@@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
1185 1183
1186 printk(KERN_INFO 1184 printk(KERN_INFO
1187 "PM: Using %u thread(s) for decompression.\n" 1185 "PM: Using %u thread(s) for decompression.\n"
1188 "PM: Loading and decompressing image data (%u pages) ... ", 1186 "PM: Loading and decompressing image data (%u pages)...\n",
1189 nr_threads, nr_to_read); 1187 nr_threads, nr_to_read);
1190 m = nr_to_read / 100; 1188 m = nr_to_read / 10;
1191 if (!m) 1189 if (!m)
1192 m = 1; 1190 m = 1;
1193 nr_pages = 0; 1191 nr_pages = 0;
@@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle,
1319 data[thr].unc + off, PAGE_SIZE); 1317 data[thr].unc + off, PAGE_SIZE);
1320 1318
1321 if (!(nr_pages % m)) 1319 if (!(nr_pages % m))
1322 printk("\b\b\b\b%3d%%", nr_pages / m); 1320 printk(KERN_INFO
1321 "PM: Image loading progress: "
1322 "%3d%%\n",
1323 nr_pages / m * 10);
1323 nr_pages++; 1324 nr_pages++;
1324 1325
1325 ret = snapshot_write_next(snapshot); 1326 ret = snapshot_write_next(snapshot);
@@ -1344,7 +1345,7 @@ out_finish:
1344 } 1345 }
1345 do_gettimeofday(&stop); 1346 do_gettimeofday(&stop);
1346 if (!ret) { 1347 if (!ret) {
1347 printk("\b\b\b\bdone\n"); 1348 printk(KERN_INFO "PM: Image loading done.\n");
1348 snapshot_write_finalize(snapshot); 1349 snapshot_write_finalize(snapshot);
1349 if (!snapshot_image_loaded(snapshot)) 1350 if (!snapshot_image_loaded(snapshot))
1350 ret = -ENODATA; 1351 ret = -ENODATA;
@@ -1357,8 +1358,7 @@ out_finish:
1357 } 1358 }
1358 } 1359 }
1359 } 1360 }
1360 } else 1361 }
1361 printk("\n");
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1363out_clean: 1363out_clean:
1364 for (i = 0; i < ring_size; i++) 1364 for (i = 0; i < ring_size; i++)
@@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode)
1472 blkdev_put(hib_resume_bdev, mode); 1472 blkdev_put(hib_resume_bdev, mode);
1473} 1473}
1474 1474
1475/**
1476 * swsusp_unmark - Unmark swsusp signature in the resume device
1477 */
1478
1479#ifdef CONFIG_SUSPEND
1480int swsusp_unmark(void)
1481{
1482 int error;
1483
1484 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
1485 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
1486 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
1487 error = hib_bio_write_page(swsusp_resume_block,
1488 swsusp_header, NULL);
1489 } else {
1490 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
1491 error = -ENODEV;
1492 }
1493
1494 /*
1495 * We just returned from suspend, we don't need the image any more.
1496 */
1497 free_all_swap_pages(root_swap);
1498
1499 return error;
1500}
1501#endif
1502
1475static int swsusp_header_init(void) 1503static int swsusp_header_init(void)
1476{ 1504{
1477 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); 1505 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30 29
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
84 * appear. 83 * appear.
85 */ 84 */
86 wait_for_device_probe(); 85 wait_for_device_probe();
87 scsi_complete_async_scans();
88 86
89 data->swap = -1; 87 data->swap = -1;
90 data->mode = O_WRONLY; 88 data->mode = O_WRONLY;
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index c8fba3380076..8f50de394d22 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -9,6 +9,7 @@
9 * manipulate wakelocks on Android. 9 * manipulate wakelocks on Android.
10 */ 10 */
11 11
12#include <linux/capability.h>
12#include <linux/ctype.h> 13#include <linux/ctype.h>
13#include <linux/device.h> 14#include <linux/device.h>
14#include <linux/err.h> 15#include <linux/err.h>
@@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf)
188 size_t len; 189 size_t len;
189 int ret = 0; 190 int ret = 0;
190 191
192 if (!capable(CAP_BLOCK_SUSPEND))
193 return -EPERM;
194
191 while (*str && !isspace(*str)) 195 while (*str && !isspace(*str))
192 str++; 196 str++;
193 197
@@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf)
231 size_t len; 235 size_t len;
232 int ret = 0; 236 int ret = 0;
233 237
238 if (!capable(CAP_BLOCK_SUSPEND))
239 return -EPERM;
240
234 len = strlen(buf); 241 len = strlen(buf);
235 if (!len) 242 if (!len)
236 return -EINVAL; 243 return -EINVAL;
diff --git a/kernel/printk.c b/kernel/printk.c
index dba18211685e..ac4bc9e79465 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -194,8 +194,10 @@ static int console_may_schedule;
194 */ 194 */
195 195
196enum log_flags { 196enum log_flags {
197 LOG_DEFAULT = 0, 197 LOG_NOCONS = 1, /* already flushed, do not print to console */
198 LOG_NOCONS = 1, /* already flushed, do not print to console */ 198 LOG_NEWLINE = 2, /* text ended with a newline */
199 LOG_PREFIX = 4, /* text started with a prefix */
200 LOG_CONT = 8, /* text is a fragment of a continuation line */
199}; 201};
200 202
201struct log { 203struct log {
@@ -217,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);
217/* the next printk record to read by syslog(READ) or /proc/kmsg */ 219/* the next printk record to read by syslog(READ) or /proc/kmsg */
218static u64 syslog_seq; 220static u64 syslog_seq;
219static u32 syslog_idx; 221static u32 syslog_idx;
222static enum log_flags syslog_prev;
223static size_t syslog_partial;
220 224
221/* index and sequence number of the first record stored in the buffer */ 225/* index and sequence number of the first record stored in the buffer */
222static u64 log_first_seq; 226static u64 log_first_seq;
@@ -430,20 +434,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
430 ret = mutex_lock_interruptible(&user->lock); 434 ret = mutex_lock_interruptible(&user->lock);
431 if (ret) 435 if (ret)
432 return ret; 436 return ret;
433 raw_spin_lock(&logbuf_lock); 437 raw_spin_lock_irq(&logbuf_lock);
434 while (user->seq == log_next_seq) { 438 while (user->seq == log_next_seq) {
435 if (file->f_flags & O_NONBLOCK) { 439 if (file->f_flags & O_NONBLOCK) {
436 ret = -EAGAIN; 440 ret = -EAGAIN;
437 raw_spin_unlock(&logbuf_lock); 441 raw_spin_unlock_irq(&logbuf_lock);
438 goto out; 442 goto out;
439 } 443 }
440 444
441 raw_spin_unlock(&logbuf_lock); 445 raw_spin_unlock_irq(&logbuf_lock);
442 ret = wait_event_interruptible(log_wait, 446 ret = wait_event_interruptible(log_wait,
443 user->seq != log_next_seq); 447 user->seq != log_next_seq);
444 if (ret) 448 if (ret)
445 goto out; 449 goto out;
446 raw_spin_lock(&logbuf_lock); 450 raw_spin_lock_irq(&logbuf_lock);
447 } 451 }
448 452
449 if (user->seq < log_first_seq) { 453 if (user->seq < log_first_seq) {
@@ -451,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
451 user->idx = log_first_idx; 455 user->idx = log_first_idx;
452 user->seq = log_first_seq; 456 user->seq = log_first_seq;
453 ret = -EPIPE; 457 ret = -EPIPE;
454 raw_spin_unlock(&logbuf_lock); 458 raw_spin_unlock_irq(&logbuf_lock);
455 goto out; 459 goto out;
456 } 460 }
457 461
@@ -465,7 +469,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
465 for (i = 0; i < msg->text_len; i++) { 469 for (i = 0; i < msg->text_len; i++) {
466 unsigned char c = log_text(msg)[i]; 470 unsigned char c = log_text(msg)[i];
467 471
468 if (c < ' ' || c >= 128) 472 if (c < ' ' || c >= 127 || c == '\\')
469 len += sprintf(user->buf + len, "\\x%02x", c); 473 len += sprintf(user->buf + len, "\\x%02x", c);
470 else 474 else
471 user->buf[len++] = c; 475 user->buf[len++] = c;
@@ -489,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
489 continue; 493 continue;
490 } 494 }
491 495
492 if (c < ' ' || c >= 128) { 496 if (c < ' ' || c >= 127 || c == '\\') {
493 len += sprintf(user->buf + len, "\\x%02x", c); 497 len += sprintf(user->buf + len, "\\x%02x", c);
494 continue; 498 continue;
495 } 499 }
@@ -501,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
501 505
502 user->idx = log_next(user->idx); 506 user->idx = log_next(user->idx);
503 user->seq++; 507 user->seq++;
504 raw_spin_unlock(&logbuf_lock); 508 raw_spin_unlock_irq(&logbuf_lock);
505 509
506 if (len > count) { 510 if (len > count) {
507 ret = -EINVAL; 511 ret = -EINVAL;
@@ -528,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
528 if (offset) 532 if (offset)
529 return -ESPIPE; 533 return -ESPIPE;
530 534
531 raw_spin_lock(&logbuf_lock); 535 raw_spin_lock_irq(&logbuf_lock);
532 switch (whence) { 536 switch (whence) {
533 case SEEK_SET: 537 case SEEK_SET:
534 /* the first record */ 538 /* the first record */
@@ -552,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
552 default: 556 default:
553 ret = -EINVAL; 557 ret = -EINVAL;
554 } 558 }
555 raw_spin_unlock(&logbuf_lock); 559 raw_spin_unlock_irq(&logbuf_lock);
556 return ret; 560 return ret;
557} 561}
558 562
@@ -566,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
566 570
567 poll_wait(file, &log_wait, wait); 571 poll_wait(file, &log_wait, wait);
568 572
569 raw_spin_lock(&logbuf_lock); 573 raw_spin_lock_irq(&logbuf_lock);
570 if (user->seq < log_next_seq) { 574 if (user->seq < log_next_seq) {
571 /* return error when data has vanished underneath us */ 575 /* return error when data has vanished underneath us */
572 if (user->seq < log_first_seq) 576 if (user->seq < log_first_seq)
573 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 577 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
574 ret = POLLIN|POLLRDNORM; 578 ret = POLLIN|POLLRDNORM;
575 } 579 }
576 raw_spin_unlock(&logbuf_lock); 580 raw_spin_unlock_irq(&logbuf_lock);
577 581
578 return ret; 582 return ret;
579} 583}
@@ -597,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
597 601
598 mutex_init(&user->lock); 602 mutex_init(&user->lock);
599 603
600 raw_spin_lock(&logbuf_lock); 604 raw_spin_lock_irq(&logbuf_lock);
601 user->idx = log_first_idx; 605 user->idx = log_first_idx;
602 user->seq = log_first_seq; 606 user->seq = log_first_seq;
603 raw_spin_unlock(&logbuf_lock); 607 raw_spin_unlock_irq(&logbuf_lock);
604 608
605 file->private_data = user; 609 file->private_data = user;
606 return 0; 610 return 0;
@@ -818,15 +822,18 @@ static size_t print_time(u64 ts, char *buf)
818static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 822static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
819{ 823{
820 size_t len = 0; 824 size_t len = 0;
825 unsigned int prefix = (msg->facility << 3) | msg->level;
821 826
822 if (syslog) { 827 if (syslog) {
823 if (buf) { 828 if (buf) {
824 len += sprintf(buf, "<%u>", msg->level); 829 len += sprintf(buf, "<%u>", prefix);
825 } else { 830 } else {
826 len += 3; 831 len += 3;
827 if (msg->level > 9) 832 if (prefix > 999)
828 len++; 833 len += 3;
829 if (msg->level > 99) 834 else if (prefix > 99)
835 len += 2;
836 else if (prefix > 9)
830 len++; 837 len++;
831 } 838 }
832 } 839 }
@@ -835,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
835 return len; 842 return len;
836} 843}
837 844
838static size_t msg_print_text(const struct log *msg, bool syslog, 845static size_t msg_print_text(const struct log *msg, enum log_flags prev,
839 char *buf, size_t size) 846 bool syslog, char *buf, size_t size)
840{ 847{
841 const char *text = log_text(msg); 848 const char *text = log_text(msg);
842 size_t text_size = msg->text_len; 849 size_t text_size = msg->text_len;
850 bool prefix = true;
851 bool newline = true;
843 size_t len = 0; 852 size_t len = 0;
844 853
854 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
855 prefix = false;
856
857 if (msg->flags & LOG_CONT) {
858 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
859 prefix = false;
860
861 if (!(msg->flags & LOG_NEWLINE))
862 newline = false;
863 }
864
845 do { 865 do {
846 const char *next = memchr(text, '\n', text_size); 866 const char *next = memchr(text, '\n', text_size);
847 size_t text_len; 867 size_t text_len;
@@ -859,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
859 text_len + 1>= size - len) 879 text_len + 1>= size - len)
860 break; 880 break;
861 881
862 len += print_prefix(msg, syslog, buf + len); 882 if (prefix)
883 len += print_prefix(msg, syslog, buf + len);
863 memcpy(buf + len, text, text_len); 884 memcpy(buf + len, text, text_len);
864 len += text_len; 885 len += text_len;
865 buf[len++] = '\n'; 886 if (next || newline)
887 buf[len++] = '\n';
866 } else { 888 } else {
867 /* SYSLOG_ACTION_* buffer size only calculation */ 889 /* SYSLOG_ACTION_* buffer size only calculation */
868 len += print_prefix(msg, syslog, NULL); 890 if (prefix)
869 len += text_len + 1; 891 len += print_prefix(msg, syslog, NULL);
892 len += text_len;
893 if (next || newline)
894 len++;
870 } 895 }
871 896
897 prefix = true;
872 text = next; 898 text = next;
873 } while (text); 899 } while (text);
874 900
@@ -887,22 +913,35 @@ static int syslog_print(char __user *buf, int size)
887 913
888 while (size > 0) { 914 while (size > 0) {
889 size_t n; 915 size_t n;
916 size_t skip;
890 917
891 raw_spin_lock_irq(&logbuf_lock); 918 raw_spin_lock_irq(&logbuf_lock);
892 if (syslog_seq < log_first_seq) { 919 if (syslog_seq < log_first_seq) {
893 /* messages are gone, move to first one */ 920 /* messages are gone, move to first one */
894 syslog_seq = log_first_seq; 921 syslog_seq = log_first_seq;
895 syslog_idx = log_first_idx; 922 syslog_idx = log_first_idx;
923 syslog_prev = 0;
924 syslog_partial = 0;
896 } 925 }
897 if (syslog_seq == log_next_seq) { 926 if (syslog_seq == log_next_seq) {
898 raw_spin_unlock_irq(&logbuf_lock); 927 raw_spin_unlock_irq(&logbuf_lock);
899 break; 928 break;
900 } 929 }
930
931 skip = syslog_partial;
901 msg = log_from_idx(syslog_idx); 932 msg = log_from_idx(syslog_idx);
902 n = msg_print_text(msg, true, text, LOG_LINE_MAX); 933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
903 if (n <= size) { 934 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */
904 syslog_idx = log_next(syslog_idx); 936 syslog_idx = log_next(syslog_idx);
905 syslog_seq++; 937 syslog_seq++;
938 syslog_prev = msg->flags;
939 n -= syslog_partial;
940 syslog_partial = 0;
941 } else if (!len){
942 /* partial read(), remember position */
943 n = size;
944 syslog_partial += n;
906 } else 945 } else
907 n = 0; 946 n = 0;
908 raw_spin_unlock_irq(&logbuf_lock); 947 raw_spin_unlock_irq(&logbuf_lock);
@@ -910,17 +949,15 @@ static int syslog_print(char __user *buf, int size)
910 if (!n) 949 if (!n)
911 break; 950 break;
912 951
913 len += n; 952 if (copy_to_user(buf, text + skip, n)) {
914 size -= n;
915 buf += n;
916 n = copy_to_user(buf - n, text, n);
917
918 if (n) {
919 len -= n;
920 if (!len) 953 if (!len)
921 len = -EFAULT; 954 len = -EFAULT;
922 break; 955 break;
923 } 956 }
957
958 len += n;
959 size -= n;
960 buf += n;
924 } 961 }
925 962
926 kfree(text); 963 kfree(text);
@@ -941,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
941 u64 next_seq; 978 u64 next_seq;
942 u64 seq; 979 u64 seq;
943 u32 idx; 980 u32 idx;
981 enum log_flags prev;
944 982
945 if (clear_seq < log_first_seq) { 983 if (clear_seq < log_first_seq) {
946 /* messages are gone, move to first available one */ 984 /* messages are gone, move to first available one */
@@ -954,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
954 */ 992 */
955 seq = clear_seq; 993 seq = clear_seq;
956 idx = clear_idx; 994 idx = clear_idx;
995 prev = 0;
957 while (seq < log_next_seq) { 996 while (seq < log_next_seq) {
958 struct log *msg = log_from_idx(idx); 997 struct log *msg = log_from_idx(idx);
959 998
960 len += msg_print_text(msg, true, NULL, 0); 999 len += msg_print_text(msg, prev, true, NULL, 0);
961 idx = log_next(idx); 1000 idx = log_next(idx);
962 seq++; 1001 seq++;
963 } 1002 }
@@ -965,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
965 /* move first record forward until length fits into the buffer */ 1004 /* move first record forward until length fits into the buffer */
966 seq = clear_seq; 1005 seq = clear_seq;
967 idx = clear_idx; 1006 idx = clear_idx;
1007 prev = 0;
968 while (len > size && seq < log_next_seq) { 1008 while (len > size && seq < log_next_seq) {
969 struct log *msg = log_from_idx(idx); 1009 struct log *msg = log_from_idx(idx);
970 1010
971 len -= msg_print_text(msg, true, NULL, 0); 1011 len -= msg_print_text(msg, prev, true, NULL, 0);
972 idx = log_next(idx); 1012 idx = log_next(idx);
973 seq++; 1013 seq++;
974 } 1014 }
@@ -977,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
977 next_seq = log_next_seq; 1017 next_seq = log_next_seq;
978 1018
979 len = 0; 1019 len = 0;
1020 prev = 0;
980 while (len >= 0 && seq < next_seq) { 1021 while (len >= 0 && seq < next_seq) {
981 struct log *msg = log_from_idx(idx); 1022 struct log *msg = log_from_idx(idx);
982 int textlen; 1023 int textlen;
983 1024
984 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); 1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
985 if (textlen < 0) { 1026 if (textlen < 0) {
986 len = textlen; 1027 len = textlen;
987 break; 1028 break;
988 } 1029 }
989 idx = log_next(idx); 1030 idx = log_next(idx);
990 seq++; 1031 seq++;
1032 prev = msg->flags;
991 1033
992 raw_spin_unlock_irq(&logbuf_lock); 1034 raw_spin_unlock_irq(&logbuf_lock);
993 if (copy_to_user(buf + len, text, textlen)) 1035 if (copy_to_user(buf + len, text, textlen))
@@ -1000,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1000 /* messages are gone, move to next one */ 1042 /* messages are gone, move to next one */
1001 seq = log_first_seq; 1043 seq = log_first_seq;
1002 idx = log_first_idx; 1044 idx = log_first_idx;
1045 prev = 0;
1003 } 1046 }
1004 } 1047 }
1005 } 1048 }
@@ -1018,7 +1061,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1018{ 1061{
1019 bool clear = false; 1062 bool clear = false;
1020 static int saved_console_loglevel = -1; 1063 static int saved_console_loglevel = -1;
1021 static DEFINE_MUTEX(syslog_mutex);
1022 int error; 1064 int error;
1023 1065
1024 error = check_syslog_permissions(type, from_file); 1066 error = check_syslog_permissions(type, from_file);
@@ -1045,17 +1087,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1045 error = -EFAULT; 1087 error = -EFAULT;
1046 goto out; 1088 goto out;
1047 } 1089 }
1048 error = mutex_lock_interruptible(&syslog_mutex);
1049 if (error)
1050 goto out;
1051 error = wait_event_interruptible(log_wait, 1090 error = wait_event_interruptible(log_wait,
1052 syslog_seq != log_next_seq); 1091 syslog_seq != log_next_seq);
1053 if (error) { 1092 if (error)
1054 mutex_unlock(&syslog_mutex);
1055 goto out; 1093 goto out;
1056 }
1057 error = syslog_print(buf, len); 1094 error = syslog_print(buf, len);
1058 mutex_unlock(&syslog_mutex);
1059 break; 1095 break;
1060 /* Read/clear last kernel messages */ 1096 /* Read/clear last kernel messages */
1061 case SYSLOG_ACTION_READ_CLEAR: 1097 case SYSLOG_ACTION_READ_CLEAR:
@@ -1111,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1111 /* messages are gone, move to first one */ 1147 /* messages are gone, move to first one */
1112 syslog_seq = log_first_seq; 1148 syslog_seq = log_first_seq;
1113 syslog_idx = log_first_idx; 1149 syslog_idx = log_first_idx;
1150 syslog_prev = 0;
1151 syslog_partial = 0;
1114 } 1152 }
1115 if (from_file) { 1153 if (from_file) {
1116 /* 1154 /*
@@ -1120,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1120 */ 1158 */
1121 error = log_next_idx - syslog_idx; 1159 error = log_next_idx - syslog_idx;
1122 } else { 1160 } else {
1123 u64 seq; 1161 u64 seq = syslog_seq;
1124 u32 idx; 1162 u32 idx = syslog_idx;
1163 enum log_flags prev = syslog_prev;
1125 1164
1126 error = 0; 1165 error = 0;
1127 seq = syslog_seq;
1128 idx = syslog_idx;
1129 while (seq < log_next_seq) { 1166 while (seq < log_next_seq) {
1130 struct log *msg = log_from_idx(idx); 1167 struct log *msg = log_from_idx(idx);
1131 1168
1132 error += msg_print_text(msg, true, NULL, 0); 1169 error += msg_print_text(msg, prev, true, NULL, 0);
1133 idx = log_next(idx); 1170 idx = log_next(idx);
1134 seq++; 1171 seq++;
1172 prev = msg->flags;
1135 } 1173 }
1174 error -= syslog_partial;
1136 } 1175 }
1137 raw_spin_unlock_irq(&logbuf_lock); 1176 raw_spin_unlock_irq(&logbuf_lock);
1138 break; 1177 break;
@@ -1153,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1153 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1192 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1154} 1193}
1155 1194
1156#ifdef CONFIG_KGDB_KDB
1157/* kdb dmesg command needs access to the syslog buffer. do_syslog()
1158 * uses locks so it cannot be used during debugging. Just tell kdb
1159 * where the start and end of the physical and logical logs are. This
1160 * is equivalent to do_syslog(3).
1161 */
1162void kdb_syslog_data(char *syslog_data[4])
1163{
1164 syslog_data[0] = log_buf;
1165 syslog_data[1] = log_buf + log_buf_len;
1166 syslog_data[2] = log_buf + log_first_idx;
1167 syslog_data[3] = log_buf + log_next_idx;
1168}
1169#endif /* CONFIG_KGDB_KDB */
1170
1171static bool __read_mostly ignore_loglevel; 1195static bool __read_mostly ignore_loglevel;
1172 1196
1173static int __init ignore_loglevel_setup(char *str) 1197static int __init ignore_loglevel_setup(char *str)
@@ -1400,10 +1424,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1400 static char textbuf[LOG_LINE_MAX]; 1424 static char textbuf[LOG_LINE_MAX];
1401 char *text = textbuf; 1425 char *text = textbuf;
1402 size_t text_len; 1426 size_t text_len;
1427 enum log_flags lflags = 0;
1403 unsigned long flags; 1428 unsigned long flags;
1404 int this_cpu; 1429 int this_cpu;
1405 bool newline = false;
1406 bool prefix = false;
1407 int printed_len = 0; 1430 int printed_len = 0;
1408 1431
1409 boot_delay_msec(); 1432 boot_delay_msec();
@@ -1442,7 +1465,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1442 recursion_bug = 0; 1465 recursion_bug = 0;
1443 printed_len += strlen(recursion_msg); 1466 printed_len += strlen(recursion_msg);
1444 /* emit KERN_CRIT message */ 1467 /* emit KERN_CRIT message */
1445 log_store(0, 2, LOG_DEFAULT, 0, 1468 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1446 NULL, 0, recursion_msg, printed_len); 1469 NULL, 0, recursion_msg, printed_len);
1447 } 1470 }
1448 1471
@@ -1455,7 +1478,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1455 /* mark and strip a trailing newline */ 1478 /* mark and strip a trailing newline */
1456 if (text_len && text[text_len-1] == '\n') { 1479 if (text_len && text[text_len-1] == '\n') {
1457 text_len--; 1480 text_len--;
1458 newline = true; 1481 lflags |= LOG_NEWLINE;
1459 } 1482 }
1460 1483
1461 /* strip syslog prefix and extract log level or control flags */ 1484 /* strip syslog prefix and extract log level or control flags */
@@ -1465,7 +1488,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1465 if (level == -1) 1488 if (level == -1)
1466 level = text[1] - '0'; 1489 level = text[1] - '0';
1467 case 'd': /* KERN_DEFAULT */ 1490 case 'd': /* KERN_DEFAULT */
1468 prefix = true; 1491 lflags |= LOG_PREFIX;
1469 case 'c': /* KERN_CONT */ 1492 case 'c': /* KERN_CONT */
1470 text += 3; 1493 text += 3;
1471 text_len -= 3; 1494 text_len -= 3;
@@ -1475,22 +1498,20 @@ asmlinkage int vprintk_emit(int facility, int level,
1475 if (level == -1) 1498 if (level == -1)
1476 level = default_message_loglevel; 1499 level = default_message_loglevel;
1477 1500
1478 if (dict) { 1501 if (dict)
1479 prefix = true; 1502 lflags |= LOG_PREFIX|LOG_NEWLINE;
1480 newline = true;
1481 }
1482 1503
1483 if (!newline) { 1504 if (!(lflags & LOG_NEWLINE)) {
1484 /* 1505 /*
1485 * Flush the conflicting buffer. An earlier newline was missing, 1506 * Flush the conflicting buffer. An earlier newline was missing,
1486 * or another task also prints continuation lines. 1507 * or another task also prints continuation lines.
1487 */ 1508 */
1488 if (cont.len && (prefix || cont.owner != current)) 1509 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1489 cont_flush(); 1510 cont_flush();
1490 1511
1491 /* buffer line if possible, otherwise store it right away */ 1512 /* buffer line if possible, otherwise store it right away */
1492 if (!cont_add(facility, level, text, text_len)) 1513 if (!cont_add(facility, level, text, text_len))
1493 log_store(facility, level, LOG_DEFAULT, 0, 1514 log_store(facility, level, lflags | LOG_CONT, 0,
1494 dict, dictlen, text, text_len); 1515 dict, dictlen, text, text_len);
1495 } else { 1516 } else {
1496 bool stored = false; 1517 bool stored = false;
@@ -1502,13 +1523,13 @@ asmlinkage int vprintk_emit(int facility, int level,
1502 * flush it out and store this line separately. 1523 * flush it out and store this line separately.
1503 */ 1524 */
1504 if (cont.len && cont.owner == current) { 1525 if (cont.len && cont.owner == current) {
1505 if (!prefix) 1526 if (!(lflags & LOG_PREFIX))
1506 stored = cont_add(facility, level, text, text_len); 1527 stored = cont_add(facility, level, text, text_len);
1507 cont_flush(); 1528 cont_flush();
1508 } 1529 }
1509 1530
1510 if (!stored) 1531 if (!stored)
1511 log_store(facility, level, LOG_DEFAULT, 0, 1532 log_store(facility, level, lflags, 0,
1512 dict, dictlen, text, text_len); 1533 dict, dictlen, text, text_len);
1513 } 1534 }
1514 printed_len += text_len; 1535 printed_len += text_len;
@@ -1607,8 +1628,8 @@ static struct cont {
1607static struct log *log_from_idx(u32 idx) { return NULL; } 1628static struct log *log_from_idx(u32 idx) { return NULL; }
1608static u32 log_next(u32 idx) { return 0; } 1629static u32 log_next(u32 idx) { return 0; }
1609static void call_console_drivers(int level, const char *text, size_t len) {} 1630static void call_console_drivers(int level, const char *text, size_t len) {}
1610static size_t msg_print_text(const struct log *msg, bool syslog, 1631static size_t msg_print_text(const struct log *msg, enum log_flags prev,
1611 char *buf, size_t size) { return 0; } 1632 bool syslog, char *buf, size_t size) { return 0; }
1612static size_t cont_print_text(char *text, size_t size) { return 0; } 1633static size_t cont_print_text(char *text, size_t size) { return 0; }
1613 1634
1614#endif /* CONFIG_PRINTK */ 1635#endif /* CONFIG_PRINTK */
@@ -1884,6 +1905,7 @@ void wake_up_klogd(void)
1884/* the next printk record to write to the console */ 1905/* the next printk record to write to the console */
1885static u64 console_seq; 1906static u64 console_seq;
1886static u32 console_idx; 1907static u32 console_idx;
1908static enum log_flags console_prev;
1887 1909
1888/** 1910/**
1889 * console_unlock - unlock the console system 1911 * console_unlock - unlock the console system
@@ -1944,6 +1966,7 @@ again:
1944 /* messages are gone, move to first one */ 1966 /* messages are gone, move to first one */
1945 console_seq = log_first_seq; 1967 console_seq = log_first_seq;
1946 console_idx = log_first_idx; 1968 console_idx = log_first_idx;
1969 console_prev = 0;
1947 } 1970 }
1948skip: 1971skip:
1949 if (console_seq == log_next_seq) 1972 if (console_seq == log_next_seq)
@@ -1957,14 +1980,21 @@ skip:
1957 */ 1980 */
1958 console_idx = log_next(console_idx); 1981 console_idx = log_next(console_idx);
1959 console_seq++; 1982 console_seq++;
1983 /*
1984 * We will get here again when we register a new
1985 * CON_PRINTBUFFER console. Clear the flag so we
1986 * will properly dump everything later.
1987 */
1988 msg->flags &= ~LOG_NOCONS;
1960 goto skip; 1989 goto skip;
1961 } 1990 }
1962 1991
1963 level = msg->level; 1992 level = msg->level;
1964 len = msg_print_text(msg, false, text, sizeof(text)); 1993 len = msg_print_text(msg, console_prev, false,
1965 1994 text, sizeof(text));
1966 console_idx = log_next(console_idx); 1995 console_idx = log_next(console_idx);
1967 console_seq++; 1996 console_seq++;
1997 console_prev = msg->flags;
1968 raw_spin_unlock(&logbuf_lock); 1998 raw_spin_unlock(&logbuf_lock);
1969 1999
1970 stop_critical_timings(); /* don't trace print latency */ 2000 stop_critical_timings(); /* don't trace print latency */
@@ -2227,6 +2257,7 @@ void register_console(struct console *newcon)
2227 raw_spin_lock_irqsave(&logbuf_lock, flags); 2257 raw_spin_lock_irqsave(&logbuf_lock, flags);
2228 console_seq = syslog_seq; 2258 console_seq = syslog_seq;
2229 console_idx = syslog_idx; 2259 console_idx = syslog_idx;
2260 console_prev = syslog_prev;
2230 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2261 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2231 /* 2262 /*
2232 * We're about to replay the log buffer. Only do this to the 2263 * We're about to replay the log buffer. Only do this to the
@@ -2479,7 +2510,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2479} 2510}
2480 2511
2481/** 2512/**
2482 * kmsg_dump_get_line - retrieve one kmsg log line 2513 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2483 * @dumper: registered kmsg dumper 2514 * @dumper: registered kmsg dumper
2484 * @syslog: include the "<4>" prefixes 2515 * @syslog: include the "<4>" prefixes
2485 * @line: buffer to copy the line to 2516 * @line: buffer to copy the line to
@@ -2494,11 +2525,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2494 * 2525 *
2495 * A return value of FALSE indicates that there are no more records to 2526 * A return value of FALSE indicates that there are no more records to
2496 * read. 2527 * read.
2528 *
2529 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2497 */ 2530 */
2498bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, 2531bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2499 char *line, size_t size, size_t *len) 2532 char *line, size_t size, size_t *len)
2500{ 2533{
2501 unsigned long flags;
2502 struct log *msg; 2534 struct log *msg;
2503 size_t l = 0; 2535 size_t l = 0;
2504 bool ret = false; 2536 bool ret = false;
@@ -2506,7 +2538,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2506 if (!dumper->active) 2538 if (!dumper->active)
2507 goto out; 2539 goto out;
2508 2540
2509 raw_spin_lock_irqsave(&logbuf_lock, flags);
2510 if (dumper->cur_seq < log_first_seq) { 2541 if (dumper->cur_seq < log_first_seq) {
2511 /* messages are gone, move to first available one */ 2542 /* messages are gone, move to first available one */
2512 dumper->cur_seq = log_first_seq; 2543 dumper->cur_seq = log_first_seq;
@@ -2514,24 +2545,50 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2514 } 2545 }
2515 2546
2516 /* last entry */ 2547 /* last entry */
2517 if (dumper->cur_seq >= log_next_seq) { 2548 if (dumper->cur_seq >= log_next_seq)
2518 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2519 goto out; 2549 goto out;
2520 }
2521 2550
2522 msg = log_from_idx(dumper->cur_idx); 2551 msg = log_from_idx(dumper->cur_idx);
2523 l = msg_print_text(msg, syslog, 2552 l = msg_print_text(msg, 0, syslog, line, size);
2524 line, size);
2525 2553
2526 dumper->cur_idx = log_next(dumper->cur_idx); 2554 dumper->cur_idx = log_next(dumper->cur_idx);
2527 dumper->cur_seq++; 2555 dumper->cur_seq++;
2528 ret = true; 2556 ret = true;
2529 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2530out: 2557out:
2531 if (len) 2558 if (len)
2532 *len = l; 2559 *len = l;
2533 return ret; 2560 return ret;
2534} 2561}
2562
2563/**
2564 * kmsg_dump_get_line - retrieve one kmsg log line
2565 * @dumper: registered kmsg dumper
2566 * @syslog: include the "<4>" prefixes
2567 * @line: buffer to copy the line to
2568 * @size: maximum size of the buffer
2569 * @len: length of line placed into buffer
2570 *
2571 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2572 * record, and copy one record into the provided buffer.
2573 *
2574 * Consecutive calls will return the next available record moving
2575 * towards the end of the buffer with the youngest messages.
2576 *
2577 * A return value of FALSE indicates that there are no more records to
2578 * read.
2579 */
2580bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2581 char *line, size_t size, size_t *len)
2582{
2583 unsigned long flags;
2584 bool ret;
2585
2586 raw_spin_lock_irqsave(&logbuf_lock, flags);
2587 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2588 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2589
2590 return ret;
2591}
2535EXPORT_SYMBOL_GPL(kmsg_dump_get_line); 2592EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2536 2593
2537/** 2594/**
@@ -2561,6 +2618,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2561 u32 idx; 2618 u32 idx;
2562 u64 next_seq; 2619 u64 next_seq;
2563 u32 next_idx; 2620 u32 next_idx;
2621 enum log_flags prev;
2564 size_t l = 0; 2622 size_t l = 0;
2565 bool ret = false; 2623 bool ret = false;
2566 2624
@@ -2583,23 +2641,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2583 /* calculate length of entire buffer */ 2641 /* calculate length of entire buffer */
2584 seq = dumper->cur_seq; 2642 seq = dumper->cur_seq;
2585 idx = dumper->cur_idx; 2643 idx = dumper->cur_idx;
2644 prev = 0;
2586 while (seq < dumper->next_seq) { 2645 while (seq < dumper->next_seq) {
2587 struct log *msg = log_from_idx(idx); 2646 struct log *msg = log_from_idx(idx);
2588 2647
2589 l += msg_print_text(msg, true, NULL, 0); 2648 l += msg_print_text(msg, prev, true, NULL, 0);
2590 idx = log_next(idx); 2649 idx = log_next(idx);
2591 seq++; 2650 seq++;
2651 prev = msg->flags;
2592 } 2652 }
2593 2653
2594 /* move first record forward until length fits into the buffer */ 2654 /* move first record forward until length fits into the buffer */
2595 seq = dumper->cur_seq; 2655 seq = dumper->cur_seq;
2596 idx = dumper->cur_idx; 2656 idx = dumper->cur_idx;
2657 prev = 0;
2597 while (l > size && seq < dumper->next_seq) { 2658 while (l > size && seq < dumper->next_seq) {
2598 struct log *msg = log_from_idx(idx); 2659 struct log *msg = log_from_idx(idx);
2599 2660
2600 l -= msg_print_text(msg, true, NULL, 0); 2661 l -= msg_print_text(msg, prev, true, NULL, 0);
2601 idx = log_next(idx); 2662 idx = log_next(idx);
2602 seq++; 2663 seq++;
2664 prev = msg->flags;
2603 } 2665 }
2604 2666
2605 /* last message in next interation */ 2667 /* last message in next interation */
@@ -2607,14 +2669,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2607 next_idx = idx; 2669 next_idx = idx;
2608 2670
2609 l = 0; 2671 l = 0;
2672 prev = 0;
2610 while (seq < dumper->next_seq) { 2673 while (seq < dumper->next_seq) {
2611 struct log *msg = log_from_idx(idx); 2674 struct log *msg = log_from_idx(idx);
2612 2675
2613 l += msg_print_text(msg, syslog, 2676 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2614 buf + l, size - l);
2615
2616 idx = log_next(idx); 2677 idx = log_next(idx);
2617 seq++; 2678 seq++;
2679 prev = msg->flags;
2618 } 2680 }
2619 2681
2620 dumper->next_seq = next_seq; 2682 dumper->next_seq = next_seq;
@@ -2629,6 +2691,24 @@ out:
2629EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); 2691EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2630 2692
2631/** 2693/**
2694 * kmsg_dump_rewind_nolock - reset the interator (unlocked version)
2695 * @dumper: registered kmsg dumper
2696 *
2697 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2698 * kmsg_dump_get_buffer() can be called again and used multiple
2699 * times within the same dumper.dump() callback.
2700 *
2701 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2702 */
2703void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2704{
2705 dumper->cur_seq = clear_seq;
2706 dumper->cur_idx = clear_idx;
2707 dumper->next_seq = log_next_seq;
2708 dumper->next_idx = log_next_idx;
2709}
2710
2711/**
2632 * kmsg_dump_rewind - reset the interator 2712 * kmsg_dump_rewind - reset the interator
2633 * @dumper: registered kmsg dumper 2713 * @dumper: registered kmsg dumper
2634 * 2714 *
@@ -2641,10 +2721,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2641 unsigned long flags; 2721 unsigned long flags;
2642 2722
2643 raw_spin_lock_irqsave(&logbuf_lock, flags); 2723 raw_spin_lock_irqsave(&logbuf_lock, flags);
2644 dumper->cur_seq = clear_seq; 2724 kmsg_dump_rewind_nolock(dumper);
2645 dumper->cur_idx = clear_idx;
2646 dumper->next_seq = log_next_seq;
2647 dumper->next_idx = log_next_idx;
2648 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2725 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2649} 2726}
2650EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2727EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 95cba41ce1e9..4e6a61b15e86 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -54,6 +54,50 @@
54#ifdef CONFIG_PREEMPT_RCU 54#ifdef CONFIG_PREEMPT_RCU
55 55
56/* 56/*
57 * Preemptible RCU implementation for rcu_read_lock().
58 * Just increment ->rcu_read_lock_nesting, shared state will be updated
59 * if we block.
60 */
61void __rcu_read_lock(void)
62{
63 current->rcu_read_lock_nesting++;
64 barrier(); /* critical section after entry code. */
65}
66EXPORT_SYMBOL_GPL(__rcu_read_lock);
67
68/*
69 * Preemptible RCU implementation for rcu_read_unlock().
70 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
71 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
72 * invoke rcu_read_unlock_special() to clean up after a context switch
73 * in an RCU read-side critical section and other special cases.
74 */
75void __rcu_read_unlock(void)
76{
77 struct task_struct *t = current;
78
79 if (t->rcu_read_lock_nesting != 1) {
80 --t->rcu_read_lock_nesting;
81 } else {
82 barrier(); /* critical section before exit code. */
83 t->rcu_read_lock_nesting = INT_MIN;
84 barrier(); /* assign before ->rcu_read_unlock_special load */
85 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
86 rcu_read_unlock_special(t);
87 barrier(); /* ->rcu_read_unlock_special load before assign */
88 t->rcu_read_lock_nesting = 0;
89 }
90#ifdef CONFIG_PROVE_LOCKING
91 {
92 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
93
94 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
95 }
96#endif /* #ifdef CONFIG_PROVE_LOCKING */
97}
98EXPORT_SYMBOL_GPL(__rcu_read_unlock);
99
100/*
57 * Check for a task exiting while in a preemptible-RCU read-side 101 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings, 102 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep 103 * as debug_check_no_locks_held() already does this if lockdep
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 37a5444204d2..547b1fe5b052 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -172,7 +172,7 @@ void rcu_irq_enter(void)
172 local_irq_restore(flags); 172 local_irq_restore(flags);
173} 173}
174 174
175#ifdef CONFIG_PROVE_RCU 175#ifdef CONFIG_DEBUG_LOCK_ALLOC
176 176
177/* 177/*
178 * Test whether RCU thinks that the current CPU is idle. 178 * Test whether RCU thinks that the current CPU is idle.
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void)
183} 183}
184EXPORT_SYMBOL(rcu_is_cpu_idle); 184EXPORT_SYMBOL(rcu_is_cpu_idle);
185 185
186#endif /* #ifdef CONFIG_PROVE_RCU */ 186#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
187 187
188/* 188/*
189 * Test whether the current CPU was interrupted from idle. Nested 189 * Test whether the current CPU was interrupted from idle. Nested
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index fc31a2d65100..918fd1e8509c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
137static void rcu_report_exp_done(void); 136static void rcu_report_exp_done(void);
138 137
@@ -351,8 +350,9 @@ static int rcu_initiate_boost(void)
351 rcu_preempt_ctrlblk.boost_tasks = 350 rcu_preempt_ctrlblk.boost_tasks =
352 rcu_preempt_ctrlblk.gp_tasks; 351 rcu_preempt_ctrlblk.gp_tasks;
353 invoke_rcu_callbacks(); 352 invoke_rcu_callbacks();
354 } else 353 } else {
355 RCU_TRACE(rcu_initiate_boost_trace()); 354 RCU_TRACE(rcu_initiate_boost_trace());
355 }
356 return 1; 356 return 1;
357} 357}
358 358
@@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void)
527} 527}
528 528
529/* 529/*
530 * Tiny-preemptible RCU implementation for rcu_read_lock().
531 * Just increment ->rcu_read_lock_nesting, shared state will be updated
532 * if we block.
533 */
534void __rcu_read_lock(void)
535{
536 current->rcu_read_lock_nesting++;
537 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
538}
539EXPORT_SYMBOL_GPL(__rcu_read_lock);
540
541/*
542 * Handle special cases during rcu_read_unlock(), such as needing to 530 * Handle special cases during rcu_read_unlock(), such as needing to
543 * notify RCU core processing or task having blocked during the RCU 531 * notify RCU core processing or task having blocked during the RCU
544 * read-side critical section. 532 * read-side critical section.
545 */ 533 */
546static noinline void rcu_read_unlock_special(struct task_struct *t) 534void rcu_read_unlock_special(struct task_struct *t)
547{ 535{
548 int empty; 536 int empty;
549 int empty_exp; 537 int empty_exp;
@@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
627} 615}
628 616
629/* 617/*
630 * Tiny-preemptible RCU implementation for rcu_read_unlock().
631 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
632 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
633 * invoke rcu_read_unlock_special() to clean up after a context switch
634 * in an RCU read-side critical section and other special cases.
635 */
636void __rcu_read_unlock(void)
637{
638 struct task_struct *t = current;
639
640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
641 if (t->rcu_read_lock_nesting != 1)
642 --t->rcu_read_lock_nesting;
643 else {
644 t->rcu_read_lock_nesting = INT_MIN;
645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
651#ifdef CONFIG_PROVE_LOCKING
652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
657#endif /* #ifdef CONFIG_PROVE_LOCKING */
658}
659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
660
661/*
662 * Check for a quiescent state from the current CPU. When a task blocks, 618 * Check for a quiescent state from the current CPU. When a task blocks,
663 * the task is recorded in the rcu_preempt_ctrlblk structure, which is 619 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
664 * checked elsewhere. This is called from the scheduling-clock interrupt. 620 * checked elsewhere. This is called from the scheduling-clock interrupt.
@@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void)
823 rpcp->exp_tasks = NULL; 779 rpcp->exp_tasks = NULL;
824 780
825 /* Wait for tail of ->blkd_tasks list to drain. */ 781 /* Wait for tail of ->blkd_tasks list to drain. */
826 if (!rcu_preempted_readers_exp()) 782 if (!rcu_preempted_readers_exp()) {
827 local_irq_restore(flags); 783 local_irq_restore(flags);
828 else { 784 } else {
829 rcu_initiate_boost(); 785 rcu_initiate_boost();
830 local_irq_restore(flags); 786 local_irq_restore(flags);
831 wait_event(sync_rcu_preempt_exp_wq, 787 wait_event(sync_rcu_preempt_exp_wq,
@@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
846 */ 802 */
847int rcu_preempt_needs_cpu(void) 803int rcu_preempt_needs_cpu(void)
848{ 804{
849 if (!rcu_preempt_running_reader())
850 rcu_preempt_cpu_qs();
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 805 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 806}
853 807
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e66b34ab7555..25b15033c61f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,8 +49,7 @@
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
53 "Josh Triplett <josh@freedesktop.org>");
54 53
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 55static int nfakewriters = 4; /* # fake writer threads */
@@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */
206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 205DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
207 /* and boost task create/destroy. */ 206 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ 207static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
208static bool barrier_phase; /* Test phase. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ 209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ 210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); 211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
@@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p)
407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
408 rp->rtort_mbtest = 0; 408 rp->rtort_mbtest = 0;
409 rcu_torture_free(rp); 409 rcu_torture_free(rp);
410 } else 410 } else {
411 cur_ops->deferred_free(rp); 411 cur_ops->deferred_free(rp);
412 }
412} 413}
413 414
414static int rcu_no_completed(void) 415static int rcu_no_completed(void)
@@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void)
635 synchronize_srcu(&srcu_ctl); 636 synchronize_srcu(&srcu_ctl);
636} 637}
637 638
639static void srcu_torture_call(struct rcu_head *head,
640 void (*func)(struct rcu_head *head))
641{
642 call_srcu(&srcu_ctl, head, func);
643}
644
645static void srcu_torture_barrier(void)
646{
647 srcu_barrier(&srcu_ctl);
648}
649
638static int srcu_torture_stats(char *page) 650static int srcu_torture_stats(char *page)
639{ 651{
640 int cnt = 0; 652 int cnt = 0;
@@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = {
661 .completed = srcu_torture_completed, 673 .completed = srcu_torture_completed,
662 .deferred_free = srcu_torture_deferred_free, 674 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 675 .sync = srcu_torture_synchronize,
664 .call = NULL, 676 .call = srcu_torture_call,
665 .cb_barrier = NULL, 677 .cb_barrier = srcu_torture_barrier,
666 .stats = srcu_torture_stats, 678 .stats = srcu_torture_stats,
667 .name = "srcu" 679 .name = "srcu"
668}; 680};
@@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg)
1013 do { 1025 do {
1014 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 1026 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
1015 udelay(rcu_random(&rand) & 0x3ff); 1027 udelay(rcu_random(&rand) & 0x3ff);
1016 cur_ops->sync(); 1028 if (cur_ops->cb_barrier != NULL &&
1029 rcu_random(&rand) % (nfakewriters * 8) == 0)
1030 cur_ops->cb_barrier();
1031 else
1032 cur_ops->sync();
1017 rcu_stutter_wait("rcu_torture_fakewriter"); 1033 rcu_stutter_wait("rcu_torture_fakewriter");
1018 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1034 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1019 1035
@@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page)
1183 } 1199 }
1184 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1200 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1185 cnt += sprintf(&page[cnt], 1201 cnt += sprintf(&page[cnt],
1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1202 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1188 "rtbf: %ld rtb: %ld nt: %ld "
1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1191 rcu_torture_current, 1203 rcu_torture_current,
1192 rcu_torture_current_version, 1204 rcu_torture_current_version,
1193 list_empty(&rcu_torture_freelist), 1205 list_empty(&rcu_torture_freelist),
1194 atomic_read(&n_rcu_torture_alloc), 1206 atomic_read(&n_rcu_torture_alloc),
1195 atomic_read(&n_rcu_torture_alloc_fail), 1207 atomic_read(&n_rcu_torture_alloc_fail),
1196 atomic_read(&n_rcu_torture_free), 1208 atomic_read(&n_rcu_torture_free));
1209 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
1197 atomic_read(&n_rcu_torture_mberror), 1210 atomic_read(&n_rcu_torture_mberror),
1198 n_rcu_torture_boost_ktrerror, 1211 n_rcu_torture_boost_ktrerror,
1199 n_rcu_torture_boost_rterror, 1212 n_rcu_torture_boost_rterror);
1213 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
1200 n_rcu_torture_boost_failure, 1214 n_rcu_torture_boost_failure,
1201 n_rcu_torture_boosts, 1215 n_rcu_torture_boosts,
1202 n_rcu_torture_timers, 1216 n_rcu_torture_timers);
1217 cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ",
1203 n_online_successes, 1218 n_online_successes,
1204 n_online_attempts, 1219 n_online_attempts,
1205 n_offline_successes, 1220 n_offline_successes,
1206 n_offline_attempts, 1221 n_offline_attempts);
1222 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1207 n_barrier_successes, 1223 n_barrier_successes,
1208 n_barrier_attempts, 1224 n_barrier_attempts,
1209 n_rcu_torture_barrier_error); 1225 n_rcu_torture_barrier_error);
@@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg)
1445 delta = shutdown_time - jiffies_snap; 1461 delta = shutdown_time - jiffies_snap;
1446 if (verbose) 1462 if (verbose)
1447 printk(KERN_ALERT "%s" TORTURE_FLAG 1463 printk(KERN_ALERT "%s" TORTURE_FLAG
1448 "rcu_torture_shutdown task: %lu " 1464 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1449 "jiffies remaining\n",
1450 torture_type, delta); 1465 torture_type, delta);
1451 schedule_timeout_interruptible(delta); 1466 schedule_timeout_interruptible(delta);
1452 jiffies_snap = ACCESS_ONCE(jiffies); 1467 jiffies_snap = ACCESS_ONCE(jiffies);
@@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg)
1498 if (cpu_down(cpu) == 0) { 1513 if (cpu_down(cpu) == 0) {
1499 if (verbose) 1514 if (verbose)
1500 printk(KERN_ALERT "%s" TORTURE_FLAG 1515 printk(KERN_ALERT "%s" TORTURE_FLAG
1501 "rcu_torture_onoff task: " 1516 "rcu_torture_onoff task: offlined %d\n",
1502 "offlined %d\n",
1503 torture_type, cpu); 1517 torture_type, cpu);
1504 n_offline_successes++; 1518 n_offline_successes++;
1505 } 1519 }
@@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg)
1512 if (cpu_up(cpu) == 0) { 1526 if (cpu_up(cpu) == 0) {
1513 if (verbose) 1527 if (verbose)
1514 printk(KERN_ALERT "%s" TORTURE_FLAG 1528 printk(KERN_ALERT "%s" TORTURE_FLAG
1515 "rcu_torture_onoff task: " 1529 "rcu_torture_onoff task: onlined %d\n",
1516 "onlined %d\n",
1517 torture_type, cpu); 1530 torture_type, cpu);
1518 n_online_successes++; 1531 n_online_successes++;
1519 } 1532 }
@@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1631static int rcu_torture_barrier_cbs(void *arg) 1644static int rcu_torture_barrier_cbs(void *arg)
1632{ 1645{
1633 long myid = (long)arg; 1646 long myid = (long)arg;
1647 bool lastphase = 0;
1634 struct rcu_head rcu; 1648 struct rcu_head rcu;
1635 1649
1636 init_rcu_head_on_stack(&rcu); 1650 init_rcu_head_on_stack(&rcu);
@@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg)
1638 set_user_nice(current, 19); 1652 set_user_nice(current, 19);
1639 do { 1653 do {
1640 wait_event(barrier_cbs_wq[myid], 1654 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs || 1655 barrier_phase != lastphase ||
1642 kthread_should_stop() || 1656 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP); 1657 fullstop != FULLSTOP_DONTSTOP);
1658 lastphase = barrier_phase;
1659 smp_mb(); /* ensure barrier_phase load before ->call(). */
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1660 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break; 1661 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1662 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
@@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg)
1665 do { 1681 do {
1666 atomic_set(&barrier_cbs_invoked, 0); 1682 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs); 1683 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */ 1684 smp_mb(); /* Ensure barrier_phase after prior assignments. */
1685 barrier_phase = !barrier_phase;
1669 for (i = 0; i < n_barrier_cbs; i++) 1686 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]); 1687 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq, 1688 wait_event(barrier_wq,
@@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg)
1684 schedule_timeout_interruptible(HZ / 10); 1701 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1702 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1703 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); 1704 rcutorture_shutdown_absorb("rcu_torture_barrier");
1688 while (!kthread_should_stop()) 1705 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1); 1706 schedule_timeout_interruptible(1);
1690 return 0; 1707 return 0;
@@ -1908,8 +1925,8 @@ rcu_torture_init(void)
1908 static struct rcu_torture_ops *torture_ops[] = 1925 static struct rcu_torture_ops *torture_ops[] =
1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1926 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1927 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, 1928 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops, 1929 &srcu_raw_ops, &srcu_raw_sync_ops,
1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1930 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1914 1931
1915 mutex_lock(&fullstop_mutex); 1932 mutex_lock(&fullstop_mutex);
@@ -1931,8 +1948,7 @@ rcu_torture_init(void)
1931 return -EINVAL; 1948 return -EINVAL;
1932 } 1949 }
1933 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1950 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1934 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " 1951 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1935 "fqs_duration, fqs disabled.\n");
1936 fqs_duration = 0; 1952 fqs_duration = 0;
1937 } 1953 }
1938 if (cur_ops->init) 1954 if (cur_ops->init)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 38ecdda3f55f..f280e542e3e9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
60 60
61/* Data structures. */ 61/* Data structures. */
62 62
63static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 63static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
64 64
65#define RCU_STATE_INITIALIZER(structname) { \ 65#define RCU_STATE_INITIALIZER(sname, cr) { \
66 .level = { &structname##_state.node[0] }, \ 66 .level = { &sname##_state.node[0] }, \
67 .levelcnt = { \ 67 .call = cr, \
68 NUM_RCU_LVL_0, /* root of hierarchy. */ \
69 NUM_RCU_LVL_1, \
70 NUM_RCU_LVL_2, \
71 NUM_RCU_LVL_3, \
72 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
73 }, \
74 .fqs_state = RCU_GP_IDLE, \ 68 .fqs_state = RCU_GP_IDLE, \
75 .gpnum = -300, \ 69 .gpnum = -300, \
76 .completed = -300, \ 70 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 71 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \ 72 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \ 73 .orphan_donetail = &sname##_state.orphan_donelist, \
80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 74 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
81 .n_force_qs = 0, \ 75 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
82 .n_force_qs_ngp = 0, \ 76 .name = #sname, \
83 .name = #structname, \
84} 77}
85 78
86struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); 79struct rcu_state rcu_sched_state =
80 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
88 82
89struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); 83struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
90DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
91 85
92static struct rcu_state *rcu_state; 86static struct rcu_state *rcu_state;
87LIST_HEAD(rcu_struct_flavors);
88
89/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
90static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
91module_param(rcu_fanout_leaf, int, 0);
92int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
93static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
94 NUM_RCU_LVL_0,
95 NUM_RCU_LVL_1,
96 NUM_RCU_LVL_2,
97 NUM_RCU_LVL_3,
98 NUM_RCU_LVL_4,
99};
100int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
93 101
94/* 102/*
95 * The rcu_scheduler_active variable transitions from zero to one just 103 * The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
147unsigned long rcutorture_testseq; 155unsigned long rcutorture_testseq;
148unsigned long rcutorture_vernum; 156unsigned long rcutorture_vernum;
149 157
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
157/* 158/*
158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 159 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
159 * permit this function to be invoked without holding the root rcu_node 160 * permit this function to be invoked without holding the root rcu_node
@@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu)
201{ 202{
202 trace_rcu_utilization("Start context switch"); 203 trace_rcu_utilization("Start context switch");
203 rcu_sched_qs(cpu); 204 rcu_sched_qs(cpu);
205 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 206 trace_rcu_utilization("End context switch");
205} 207}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 208EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
357 struct task_struct *idle = idle_task(smp_processor_id()); 359 struct task_struct *idle = idle_task(smp_processor_id());
358 360
359 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 361 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
360 ftrace_dump(DUMP_ALL); 362 ftrace_dump(DUMP_ORIG);
361 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 363 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
362 current->pid, current->comm, 364 current->pid, current->comm,
363 idle->pid, idle->comm); /* must be idle task! */ 365 idle->pid, idle->comm); /* must be idle task! */
@@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
467 469
468 trace_rcu_dyntick("Error on exit: not idle task", 470 trace_rcu_dyntick("Error on exit: not idle task",
469 oldval, rdtp->dynticks_nesting); 471 oldval, rdtp->dynticks_nesting);
470 ftrace_dump(DUMP_ALL); 472 ftrace_dump(DUMP_ORIG);
471 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 473 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
472 current->pid, current->comm, 474 current->pid, current->comm,
473 idle->pid, idle->comm); /* must be idle task! */ 475 idle->pid, idle->comm); /* must be idle task! */
@@ -584,8 +586,6 @@ void rcu_nmi_exit(void)
584 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 586 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
585} 587}
586 588
587#ifdef CONFIG_PROVE_RCU
588
589/** 589/**
590 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 590 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
591 * 591 *
@@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void)
603} 603}
604EXPORT_SYMBOL(rcu_is_cpu_idle); 604EXPORT_SYMBOL(rcu_is_cpu_idle);
605 605
606#ifdef CONFIG_HOTPLUG_CPU 606#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
607 607
608/* 608/*
609 * Is the current CPU online? Disable preemption to avoid false positives 609 * Is the current CPU online? Disable preemption to avoid false positives
@@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
644} 644}
645EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); 645EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
646 646
647#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 647#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
648
649#endif /* #ifdef CONFIG_PROVE_RCU */
650 648
651/** 649/**
652 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 650 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
732 int cpu; 730 int cpu;
733 long delta; 731 long delta;
734 unsigned long flags; 732 unsigned long flags;
735 int ndetected; 733 int ndetected = 0;
736 struct rcu_node *rnp = rcu_get_root(rsp); 734 struct rcu_node *rnp = rcu_get_root(rsp);
737 735
738 /* Only let one CPU complain about others per time interval. */ 736 /* Only let one CPU complain about others per time interval. */
@@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
773 */ 771 */
774 rnp = rcu_get_root(rsp); 772 rnp = rcu_get_root(rsp);
775 raw_spin_lock_irqsave(&rnp->lock, flags); 773 raw_spin_lock_irqsave(&rnp->lock, flags);
776 ndetected = rcu_print_task_stall(rnp); 774 ndetected += rcu_print_task_stall(rnp);
777 raw_spin_unlock_irqrestore(&rnp->lock, flags); 775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
778 776
779 print_cpu_stall_info_end(); 777 print_cpu_stall_info_end();
@@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
859 */ 857 */
860void rcu_cpu_stall_reset(void) 858void rcu_cpu_stall_reset(void)
861{ 859{
862 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; 860 struct rcu_state *rsp;
863 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; 861
864 rcu_preempt_stall_reset(); 862 for_each_rcu_flavor(rsp)
863 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
865} 864}
866 865
867static struct notifier_block rcu_panic_block = { 866static struct notifier_block rcu_panic_block = {
@@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
893 if (rnp->qsmask & rdp->grpmask) { 892 if (rnp->qsmask & rdp->grpmask) {
894 rdp->qs_pending = 1; 893 rdp->qs_pending = 1;
895 rdp->passed_quiesce = 0; 894 rdp->passed_quiesce = 0;
896 } else 895 } else {
897 rdp->qs_pending = 0; 896 rdp->qs_pending = 0;
897 }
898 zero_cpu_stall_ticks(rdp); 898 zero_cpu_stall_ticks(rdp);
899 } 899 }
900} 900}
@@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
936} 936}
937 937
938/* 938/*
939 * Initialize the specified rcu_data structure's callback list to empty.
940 */
941static void init_callback_list(struct rcu_data *rdp)
942{
943 int i;
944
945 rdp->nxtlist = NULL;
946 for (i = 0; i < RCU_NEXT_SIZE; i++)
947 rdp->nxttail[i] = &rdp->nxtlist;
948}
949
950/*
939 * Advance this CPU's callbacks, but only if the current grace period 951 * Advance this CPU's callbacks, but only if the current grace period
940 * has ended. This may be called only from the CPU to whom the rdp 952 * has ended. This may be called only from the CPU to whom the rdp
941 * belongs. In addition, the corresponding leaf rcu_node structure's 953 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1327,8 +1339,6 @@ static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1339rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp) 1340 struct rcu_node *rnp, struct rcu_data *rdp)
1329{ 1341{
1330 int i;
1331
1332 /* 1342 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe 1343 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of 1344 * because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1339 rsp->qlen += rdp->qlen; 1349 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen; 1350 rdp->n_cbs_orphaned += rdp->qlen;
1341 rdp->qlen_lazy = 0; 1351 rdp->qlen_lazy = 0;
1342 rdp->qlen = 0; 1352 ACCESS_ONCE(rdp->qlen) = 0;
1343 } 1353 }
1344 1354
1345 /* 1355 /*
@@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1368 } 1378 }
1369 1379
1370 /* Finally, initialize the rcu_data structure's list to empty. */ 1380 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL; 1381 init_callback_list(rdp);
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374} 1382}
1375 1383
1376/* 1384/*
@@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1504 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1512 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1505 if (need_report & RCU_OFL_TASKS_EXP_GP) 1513 if (need_report & RCU_OFL_TASKS_EXP_GP)
1506 rcu_report_exp_rnp(rsp, rnp, true); 1514 rcu_report_exp_rnp(rsp, rnp, true);
1515 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1516 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1517 cpu, rdp->qlen, rdp->nxtlist);
1507} 1518}
1508 1519
1509#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1520#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1591,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1591 } 1602 }
1592 smp_mb(); /* List handling before counting for rcu_barrier(). */ 1603 smp_mb(); /* List handling before counting for rcu_barrier(). */
1593 rdp->qlen_lazy -= count_lazy; 1604 rdp->qlen_lazy -= count_lazy;
1594 rdp->qlen -= count; 1605 ACCESS_ONCE(rdp->qlen) -= count;
1595 rdp->n_cbs_invoked += count; 1606 rdp->n_cbs_invoked += count;
1596 1607
1597 /* Reinstate batch limit if we have worked down the excess. */ 1608 /* Reinstate batch limit if we have worked down the excess. */
@@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1604 rdp->n_force_qs_snap = rsp->n_force_qs; 1615 rdp->n_force_qs_snap = rsp->n_force_qs;
1605 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 1616 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1606 rdp->qlen_last_fqs_check = rdp->qlen; 1617 rdp->qlen_last_fqs_check = rdp->qlen;
1618 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
1607 1619
1608 local_irq_restore(flags); 1620 local_irq_restore(flags);
1609 1621
@@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1744 break; /* grace period idle or initializing, ignore. */ 1756 break; /* grace period idle or initializing, ignore. */
1745 1757
1746 case RCU_SAVE_DYNTICK: 1758 case RCU_SAVE_DYNTICK:
1747 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1748 break; /* So gcc recognizes the dead code. */
1749 1759
1750 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1751 1761
@@ -1787,9 +1797,10 @@ unlock_fqs_ret:
1787 * whom the rdp belongs. 1797 * whom the rdp belongs.
1788 */ 1798 */
1789static void 1799static void
1790__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1800__rcu_process_callbacks(struct rcu_state *rsp)
1791{ 1801{
1792 unsigned long flags; 1802 unsigned long flags;
1803 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1793 1804
1794 WARN_ON_ONCE(rdp->beenonline == 0); 1805 WARN_ON_ONCE(rdp->beenonline == 0);
1795 1806
@@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1825 */ 1836 */
1826static void rcu_process_callbacks(struct softirq_action *unused) 1837static void rcu_process_callbacks(struct softirq_action *unused)
1827{ 1838{
1839 struct rcu_state *rsp;
1840
1828 trace_rcu_utilization("Start RCU core"); 1841 trace_rcu_utilization("Start RCU core");
1829 __rcu_process_callbacks(&rcu_sched_state, 1842 for_each_rcu_flavor(rsp)
1830 &__get_cpu_var(rcu_sched_data)); 1843 __rcu_process_callbacks(rsp);
1831 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1832 rcu_preempt_process_callbacks();
1833 trace_rcu_utilization("End RCU core"); 1844 trace_rcu_utilization("End RCU core");
1834} 1845}
1835 1846
@@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void)
1856 raise_softirq(RCU_SOFTIRQ); 1867 raise_softirq(RCU_SOFTIRQ);
1857} 1868}
1858 1869
1870/*
1871 * Handle any core-RCU processing required by a call_rcu() invocation.
1872 */
1873static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
1874 struct rcu_head *head, unsigned long flags)
1875{
1876 /*
1877 * If called from an extended quiescent state, invoke the RCU
1878 * core in order to force a re-evaluation of RCU's idleness.
1879 */
1880 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
1881 invoke_rcu_core();
1882
1883 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
1884 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
1885 return;
1886
1887 /*
1888 * Force the grace period if too many callbacks or too long waiting.
1889 * Enforce hysteresis, and don't invoke force_quiescent_state()
1890 * if some other CPU has recently done so. Also, don't bother
1891 * invoking force_quiescent_state() if the newly enqueued callback
1892 * is the only one waiting for a grace period to complete.
1893 */
1894 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1895
1896 /* Are we ignoring a completed grace period? */
1897 rcu_process_gp_end(rsp, rdp);
1898 check_for_new_grace_period(rsp, rdp);
1899
1900 /* Start a new grace period if one not already started. */
1901 if (!rcu_gp_in_progress(rsp)) {
1902 unsigned long nestflag;
1903 struct rcu_node *rnp_root = rcu_get_root(rsp);
1904
1905 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1906 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1907 } else {
1908 /* Give the grace period a kick. */
1909 rdp->blimit = LONG_MAX;
1910 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1911 *rdp->nxttail[RCU_DONE_TAIL] != head)
1912 force_quiescent_state(rsp, 0);
1913 rdp->n_force_qs_snap = rsp->n_force_qs;
1914 rdp->qlen_last_fqs_check = rdp->qlen;
1915 }
1916 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1917 force_quiescent_state(rsp, 1);
1918}
1919
1859static void 1920static void
1860__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1921__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1861 struct rcu_state *rsp, bool lazy) 1922 struct rcu_state *rsp, bool lazy)
@@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1880 rdp = this_cpu_ptr(rsp->rda); 1941 rdp = this_cpu_ptr(rsp->rda);
1881 1942
1882 /* Add the callback to our list. */ 1943 /* Add the callback to our list. */
1883 rdp->qlen++; 1944 ACCESS_ONCE(rdp->qlen)++;
1884 if (lazy) 1945 if (lazy)
1885 rdp->qlen_lazy++; 1946 rdp->qlen_lazy++;
1886 else 1947 else
@@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1895 else 1956 else
1896 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 1957 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1897 1958
1898 /* If interrupts were disabled, don't dive into RCU core. */ 1959 /* Go handle any RCU core processing required. */
1899 if (irqs_disabled_flags(flags)) { 1960 __call_rcu_core(rsp, rdp, head, flags);
1900 local_irq_restore(flags);
1901 return;
1902 }
1903
1904 /*
1905 * Force the grace period if too many callbacks or too long waiting.
1906 * Enforce hysteresis, and don't invoke force_quiescent_state()
1907 * if some other CPU has recently done so. Also, don't bother
1908 * invoking force_quiescent_state() if the newly enqueued callback
1909 * is the only one waiting for a grace period to complete.
1910 */
1911 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1912
1913 /* Are we ignoring a completed grace period? */
1914 rcu_process_gp_end(rsp, rdp);
1915 check_for_new_grace_period(rsp, rdp);
1916
1917 /* Start a new grace period if one not already started. */
1918 if (!rcu_gp_in_progress(rsp)) {
1919 unsigned long nestflag;
1920 struct rcu_node *rnp_root = rcu_get_root(rsp);
1921
1922 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1923 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1924 } else {
1925 /* Give the grace period a kick. */
1926 rdp->blimit = LONG_MAX;
1927 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1928 *rdp->nxttail[RCU_DONE_TAIL] != head)
1929 force_quiescent_state(rsp, 0);
1930 rdp->n_force_qs_snap = rsp->n_force_qs;
1931 rdp->qlen_last_fqs_check = rdp->qlen;
1932 }
1933 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1934 force_quiescent_state(rsp, 1);
1935 local_irq_restore(flags); 1961 local_irq_restore(flags);
1936} 1962}
1937 1963
@@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1961 * occasionally incorrectly indicate that there are multiple CPUs online 1987 * occasionally incorrectly indicate that there are multiple CPUs online
1962 * when there was in fact only one the whole time, as this just adds 1988 * when there was in fact only one the whole time, as this just adds
1963 * some overhead: RCU still operates correctly. 1989 * some overhead: RCU still operates correctly.
1964 *
1965 * Of course, sampling num_online_cpus() with preemption enabled can
1966 * give erroneous results if there are concurrent CPU-hotplug operations.
1967 * For example, given a demonic sequence of preemptions in num_online_cpus()
1968 * and CPU-hotplug operations, there could be two or more CPUs online at
1969 * all times, but num_online_cpus() might well return one (or even zero).
1970 *
1971 * However, all such demonic sequences require at least one CPU-offline
1972 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1973 * is only a problem if there is an RCU read-side critical section executing
1974 * throughout. But RCU-sched and RCU-bh read-side critical sections
1975 * disable either preemption or bh, which prevents a CPU from going offline.
1976 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1977 * that there is only one CPU when in fact there was more than one throughout
1978 * is when there were no RCU readers in the system. If there are no
1979 * RCU readers, the grace period by definition can be of zero length,
1980 * regardless of the number of online CPUs.
1981 */ 1990 */
1982static inline int rcu_blocking_is_gp(void) 1991static inline int rcu_blocking_is_gp(void)
1983{ 1992{
1993 int ret;
1994
1984 might_sleep(); /* Check for RCU read-side critical section. */ 1995 might_sleep(); /* Check for RCU read-side critical section. */
1985 return num_online_cpus() <= 1; 1996 preempt_disable();
1997 ret = num_online_cpus() <= 1;
1998 preempt_enable();
1999 return ret;
1986} 2000}
1987 2001
1988/** 2002/**
@@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void)
2117 put_online_cpus(); 2131 put_online_cpus();
2118 2132
2119 /* No joy, try again later. Or just synchronize_sched(). */ 2133 /* No joy, try again later. Or just synchronize_sched(). */
2120 if (trycount++ < 10) 2134 if (trycount++ < 10) {
2121 udelay(trycount * num_online_cpus()); 2135 udelay(trycount * num_online_cpus());
2122 else { 2136 } else {
2123 synchronize_sched(); 2137 synchronize_sched();
2124 return; 2138 return;
2125 } 2139 }
@@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2240 */ 2254 */
2241static int rcu_pending(int cpu) 2255static int rcu_pending(int cpu)
2242{ 2256{
2243 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || 2257 struct rcu_state *rsp;
2244 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || 2258
2245 rcu_preempt_pending(cpu); 2259 for_each_rcu_flavor(rsp)
2260 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
2261 return 1;
2262 return 0;
2246} 2263}
2247 2264
2248/* 2265/*
@@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu)
2252 */ 2269 */
2253static int rcu_cpu_has_callbacks(int cpu) 2270static int rcu_cpu_has_callbacks(int cpu)
2254{ 2271{
2272 struct rcu_state *rsp;
2273
2255 /* RCU callbacks either ready or pending? */ 2274 /* RCU callbacks either ready or pending? */
2256 return per_cpu(rcu_sched_data, cpu).nxtlist || 2275 for_each_rcu_flavor(rsp)
2257 per_cpu(rcu_bh_data, cpu).nxtlist || 2276 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
2258 rcu_preempt_cpu_has_callbacks(cpu); 2277 return 1;
2278 return 0;
2279}
2280
2281/*
2282 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2283 * the compiler is expected to optimize this away.
2284 */
2285static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
2286 int cpu, unsigned long done)
2287{
2288 trace_rcu_barrier(rsp->name, s, cpu,
2289 atomic_read(&rsp->barrier_cpu_count), done);
2259} 2290}
2260 2291
2261/* 2292/*
2262 * RCU callback function for _rcu_barrier(). If we are last, wake 2293 * RCU callback function for _rcu_barrier(). If we are last, wake
2263 * up the task executing _rcu_barrier(). 2294 * up the task executing _rcu_barrier().
2264 */ 2295 */
2265static void rcu_barrier_callback(struct rcu_head *notused) 2296static void rcu_barrier_callback(struct rcu_head *rhp)
2266{ 2297{
2267 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2298 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
2268 complete(&rcu_barrier_completion); 2299 struct rcu_state *rsp = rdp->rsp;
2300
2301 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
2302 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
2303 complete(&rsp->barrier_completion);
2304 } else {
2305 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
2306 }
2269} 2307}
2270 2308
2271/* 2309/*
@@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
2273 */ 2311 */
2274static void rcu_barrier_func(void *type) 2312static void rcu_barrier_func(void *type)
2275{ 2313{
2276 int cpu = smp_processor_id(); 2314 struct rcu_state *rsp = type;
2277 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); 2315 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2278 void (*call_rcu_func)(struct rcu_head *head,
2279 void (*func)(struct rcu_head *head));
2280 2316
2281 atomic_inc(&rcu_barrier_cpu_count); 2317 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2282 call_rcu_func = type; 2318 atomic_inc(&rsp->barrier_cpu_count);
2283 call_rcu_func(head, rcu_barrier_callback); 2319 rsp->call(&rdp->barrier_head, rcu_barrier_callback);
2284} 2320}
2285 2321
2286/* 2322/*
2287 * Orchestrate the specified type of RCU barrier, waiting for all 2323 * Orchestrate the specified type of RCU barrier, waiting for all
2288 * RCU callbacks of the specified type to complete. 2324 * RCU callbacks of the specified type to complete.
2289 */ 2325 */
2290static void _rcu_barrier(struct rcu_state *rsp, 2326static void _rcu_barrier(struct rcu_state *rsp)
2291 void (*call_rcu_func)(struct rcu_head *head,
2292 void (*func)(struct rcu_head *head)))
2293{ 2327{
2294 int cpu; 2328 int cpu;
2295 unsigned long flags; 2329 unsigned long flags;
2296 struct rcu_data *rdp; 2330 struct rcu_data *rdp;
2297 struct rcu_head rh; 2331 struct rcu_data rd;
2332 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2333 unsigned long snap_done;
2298 2334
2299 init_rcu_head_on_stack(&rh); 2335 init_rcu_head_on_stack(&rd.barrier_head);
2336 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2300 2337
2301 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2338 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2302 mutex_lock(&rcu_barrier_mutex); 2339 mutex_lock(&rsp->barrier_mutex);
2340
2341 /*
2342 * Ensure that all prior references, including to ->n_barrier_done,
2343 * are ordered before the _rcu_barrier() machinery.
2344 */
2345 smp_mb(); /* See above block comment. */
2346
2347 /*
2348 * Recheck ->n_barrier_done to see if others did our work for us.
2349 * This means checking ->n_barrier_done for an even-to-odd-to-even
2350 * transition. The "if" expression below therefore rounds the old
2351 * value up to the next even number and adds two before comparing.
2352 */
2353 snap_done = ACCESS_ONCE(rsp->n_barrier_done);
2354 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2355 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
2356 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2357 smp_mb(); /* caller's subsequent code after above check. */
2358 mutex_unlock(&rsp->barrier_mutex);
2359 return;
2360 }
2303 2361
2304 smp_mb(); /* Prevent any prior operations from leaking in. */ 2362 /*
2363 * Increment ->n_barrier_done to avoid duplicate work. Use
2364 * ACCESS_ONCE() to prevent the compiler from speculating
2365 * the increment to precede the early-exit check.
2366 */
2367 ACCESS_ONCE(rsp->n_barrier_done)++;
2368 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
2369 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
2370 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
2305 2371
2306 /* 2372 /*
2307 * Initialize the count to one rather than to zero in order to 2373 * Initialize the count to one rather than to zero in order to
@@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
2320 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening 2386 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2321 * us -- but before CPU 1's orphaned callbacks are invoked!!! 2387 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2322 */ 2388 */
2323 init_completion(&rcu_barrier_completion); 2389 init_completion(&rsp->barrier_completion);
2324 atomic_set(&rcu_barrier_cpu_count, 1); 2390 atomic_set(&rsp->barrier_cpu_count, 1);
2325 raw_spin_lock_irqsave(&rsp->onofflock, flags); 2391 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2326 rsp->rcu_barrier_in_progress = current; 2392 rsp->rcu_barrier_in_progress = current;
2327 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2393 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
2337 preempt_disable(); 2403 preempt_disable();
2338 rdp = per_cpu_ptr(rsp->rda, cpu); 2404 rdp = per_cpu_ptr(rsp->rda, cpu);
2339 if (cpu_is_offline(cpu)) { 2405 if (cpu_is_offline(cpu)) {
2406 _rcu_barrier_trace(rsp, "Offline", cpu,
2407 rsp->n_barrier_done);
2340 preempt_enable(); 2408 preempt_enable();
2341 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) 2409 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2342 schedule_timeout_interruptible(1); 2410 schedule_timeout_interruptible(1);
2343 } else if (ACCESS_ONCE(rdp->qlen)) { 2411 } else if (ACCESS_ONCE(rdp->qlen)) {
2344 smp_call_function_single(cpu, rcu_barrier_func, 2412 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2345 (void *)call_rcu_func, 1); 2413 rsp->n_barrier_done);
2414 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2346 preempt_enable(); 2415 preempt_enable();
2347 } else { 2416 } else {
2417 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2418 rsp->n_barrier_done);
2348 preempt_enable(); 2419 preempt_enable();
2349 } 2420 }
2350 } 2421 }
@@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
2361 rcu_adopt_orphan_cbs(rsp); 2432 rcu_adopt_orphan_cbs(rsp);
2362 rsp->rcu_barrier_in_progress = NULL; 2433 rsp->rcu_barrier_in_progress = NULL;
2363 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2434 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2364 atomic_inc(&rcu_barrier_cpu_count); 2435 atomic_inc(&rsp->barrier_cpu_count);
2365 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ 2436 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2366 call_rcu_func(&rh, rcu_barrier_callback); 2437 rd.rsp = rsp;
2438 rsp->call(&rd.barrier_head, rcu_barrier_callback);
2367 2439
2368 /* 2440 /*
2369 * Now that we have an rcu_barrier_callback() callback on each 2441 * Now that we have an rcu_barrier_callback() callback on each
2370 * CPU, and thus each counted, remove the initial count. 2442 * CPU, and thus each counted, remove the initial count.
2371 */ 2443 */
2372 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2444 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
2373 complete(&rcu_barrier_completion); 2445 complete(&rsp->barrier_completion);
2446
2447 /* Increment ->n_barrier_done to prevent duplicate work. */
2448 smp_mb(); /* Keep increment after above mechanism. */
2449 ACCESS_ONCE(rsp->n_barrier_done)++;
2450 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
2451 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
2452 smp_mb(); /* Keep increment before caller's subsequent code. */
2374 2453
2375 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 2454 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2376 wait_for_completion(&rcu_barrier_completion); 2455 wait_for_completion(&rsp->barrier_completion);
2377 2456
2378 /* Other rcu_barrier() invocations can now safely proceed. */ 2457 /* Other rcu_barrier() invocations can now safely proceed. */
2379 mutex_unlock(&rcu_barrier_mutex); 2458 mutex_unlock(&rsp->barrier_mutex);
2380 2459
2381 destroy_rcu_head_on_stack(&rh); 2460 destroy_rcu_head_on_stack(&rd.barrier_head);
2382} 2461}
2383 2462
2384/** 2463/**
@@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
2386 */ 2465 */
2387void rcu_barrier_bh(void) 2466void rcu_barrier_bh(void)
2388{ 2467{
2389 _rcu_barrier(&rcu_bh_state, call_rcu_bh); 2468 _rcu_barrier(&rcu_bh_state);
2390} 2469}
2391EXPORT_SYMBOL_GPL(rcu_barrier_bh); 2470EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2392 2471
@@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2395 */ 2474 */
2396void rcu_barrier_sched(void) 2475void rcu_barrier_sched(void)
2397{ 2476{
2398 _rcu_barrier(&rcu_sched_state, call_rcu_sched); 2477 _rcu_barrier(&rcu_sched_state);
2399} 2478}
2400EXPORT_SYMBOL_GPL(rcu_barrier_sched); 2479EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2401 2480
@@ -2406,18 +2485,15 @@ static void __init
2406rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 2485rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2407{ 2486{
2408 unsigned long flags; 2487 unsigned long flags;
2409 int i;
2410 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2488 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2411 struct rcu_node *rnp = rcu_get_root(rsp); 2489 struct rcu_node *rnp = rcu_get_root(rsp);
2412 2490
2413 /* Set up local state, ensuring consistent view of global state. */ 2491 /* Set up local state, ensuring consistent view of global state. */
2414 raw_spin_lock_irqsave(&rnp->lock, flags); 2492 raw_spin_lock_irqsave(&rnp->lock, flags);
2415 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 2493 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2416 rdp->nxtlist = NULL; 2494 init_callback_list(rdp);
2417 for (i = 0; i < RCU_NEXT_SIZE; i++)
2418 rdp->nxttail[i] = &rdp->nxtlist;
2419 rdp->qlen_lazy = 0; 2495 rdp->qlen_lazy = 0;
2420 rdp->qlen = 0; 2496 ACCESS_ONCE(rdp->qlen) = 0;
2421 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2497 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2422 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2498 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2423 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2499 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2491 2567
2492static void __cpuinit rcu_prepare_cpu(int cpu) 2568static void __cpuinit rcu_prepare_cpu(int cpu)
2493{ 2569{
2494 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 2570 struct rcu_state *rsp;
2495 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 2571
2496 rcu_preempt_init_percpu_data(cpu); 2572 for_each_rcu_flavor(rsp)
2573 rcu_init_percpu_data(cpu, rsp,
2574 strcmp(rsp->name, "rcu_preempt") == 0);
2497} 2575}
2498 2576
2499/* 2577/*
@@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2505 long cpu = (long)hcpu; 2583 long cpu = (long)hcpu;
2506 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2584 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2507 struct rcu_node *rnp = rdp->mynode; 2585 struct rcu_node *rnp = rdp->mynode;
2586 struct rcu_state *rsp;
2508 2587
2509 trace_rcu_utilization("Start CPU hotplug"); 2588 trace_rcu_utilization("Start CPU hotplug");
2510 switch (action) { 2589 switch (action) {
@@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2529 * touch any data without introducing corruption. We send the 2608 * touch any data without introducing corruption. We send the
2530 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2609 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2531 */ 2610 */
2532 rcu_cleanup_dying_cpu(&rcu_bh_state); 2611 for_each_rcu_flavor(rsp)
2533 rcu_cleanup_dying_cpu(&rcu_sched_state); 2612 rcu_cleanup_dying_cpu(rsp);
2534 rcu_preempt_cleanup_dying_cpu();
2535 rcu_cleanup_after_idle(cpu); 2613 rcu_cleanup_after_idle(cpu);
2536 break; 2614 break;
2537 case CPU_DEAD: 2615 case CPU_DEAD:
2538 case CPU_DEAD_FROZEN: 2616 case CPU_DEAD_FROZEN:
2539 case CPU_UP_CANCELED: 2617 case CPU_UP_CANCELED:
2540 case CPU_UP_CANCELED_FROZEN: 2618 case CPU_UP_CANCELED_FROZEN:
2541 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); 2619 for_each_rcu_flavor(rsp)
2542 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); 2620 rcu_cleanup_dead_cpu(cpu, rsp);
2543 rcu_preempt_cleanup_dead_cpu(cpu);
2544 break; 2621 break;
2545 default: 2622 default:
2546 break; 2623 break;
@@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2573{ 2650{
2574 int i; 2651 int i;
2575 2652
2576 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2653 for (i = rcu_num_lvls - 1; i > 0; i--)
2577 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2654 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2578 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; 2655 rsp->levelspread[0] = rcu_fanout_leaf;
2579} 2656}
2580#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2657#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2581static void __init rcu_init_levelspread(struct rcu_state *rsp) 2658static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2585 int i; 2662 int i;
2586 2663
2587 cprv = NR_CPUS; 2664 cprv = NR_CPUS;
2588 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2665 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2589 ccur = rsp->levelcnt[i]; 2666 ccur = rsp->levelcnt[i];
2590 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2667 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
2591 cprv = ccur; 2668 cprv = ccur;
@@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2612 2689
2613 /* Initialize the level-tracking arrays. */ 2690 /* Initialize the level-tracking arrays. */
2614 2691
2615 for (i = 1; i < NUM_RCU_LVLS; i++) 2692 for (i = 0; i < rcu_num_lvls; i++)
2693 rsp->levelcnt[i] = num_rcu_lvl[i];
2694 for (i = 1; i < rcu_num_lvls; i++)
2616 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 2695 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
2617 rcu_init_levelspread(rsp); 2696 rcu_init_levelspread(rsp);
2618 2697
2619 /* Initialize the elements themselves, starting from the leaves. */ 2698 /* Initialize the elements themselves, starting from the leaves. */
2620 2699
2621 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2700 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2622 cpustride *= rsp->levelspread[i]; 2701 cpustride *= rsp->levelspread[i];
2623 rnp = rsp->level[i]; 2702 rnp = rsp->level[i];
2624 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 2703 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2648 } 2727 }
2649 2728
2650 rsp->rda = rda; 2729 rsp->rda = rda;
2651 rnp = rsp->level[NUM_RCU_LVLS - 1]; 2730 rnp = rsp->level[rcu_num_lvls - 1];
2652 for_each_possible_cpu(i) { 2731 for_each_possible_cpu(i) {
2653 while (i > rnp->grphi) 2732 while (i > rnp->grphi)
2654 rnp++; 2733 rnp++;
2655 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 2734 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
2656 rcu_boot_init_percpu_data(i, rsp); 2735 rcu_boot_init_percpu_data(i, rsp);
2657 } 2736 }
2737 list_add(&rsp->flavors, &rcu_struct_flavors);
2738}
2739
2740/*
2741 * Compute the rcu_node tree geometry from kernel parameters. This cannot
2742 * replace the definitions in rcutree.h because those are needed to size
2743 * the ->node array in the rcu_state structure.
2744 */
2745static void __init rcu_init_geometry(void)
2746{
2747 int i;
2748 int j;
2749 int n = nr_cpu_ids;
2750 int rcu_capacity[MAX_RCU_LVLS + 1];
2751
2752 /* If the compile-time values are accurate, just leave. */
2753 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
2754 return;
2755
2756 /*
2757 * Compute number of nodes that can be handled an rcu_node tree
2758 * with the given number of levels. Setting rcu_capacity[0] makes
2759 * some of the arithmetic easier.
2760 */
2761 rcu_capacity[0] = 1;
2762 rcu_capacity[1] = rcu_fanout_leaf;
2763 for (i = 2; i <= MAX_RCU_LVLS; i++)
2764 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
2765
2766 /*
2767 * The boot-time rcu_fanout_leaf parameter is only permitted
2768 * to increase the leaf-level fanout, not decrease it. Of course,
2769 * the leaf-level fanout cannot exceed the number of bits in
2770 * the rcu_node masks. Finally, the tree must be able to accommodate
2771 * the configured number of CPUs. Complain and fall back to the
2772 * compile-time values if these limits are exceeded.
2773 */
2774 if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
2775 rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
2776 n > rcu_capacity[MAX_RCU_LVLS]) {
2777 WARN_ON(1);
2778 return;
2779 }
2780
2781 /* Calculate the number of rcu_nodes at each level of the tree. */
2782 for (i = 1; i <= MAX_RCU_LVLS; i++)
2783 if (n <= rcu_capacity[i]) {
2784 for (j = 0; j <= i; j++)
2785 num_rcu_lvl[j] =
2786 DIV_ROUND_UP(n, rcu_capacity[i - j]);
2787 rcu_num_lvls = i;
2788 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
2789 num_rcu_lvl[j] = 0;
2790 break;
2791 }
2792
2793 /* Calculate the total number of rcu_node structures. */
2794 rcu_num_nodes = 0;
2795 for (i = 0; i <= MAX_RCU_LVLS; i++)
2796 rcu_num_nodes += num_rcu_lvl[i];
2797 rcu_num_nodes -= n;
2658} 2798}
2659 2799
2660void __init rcu_init(void) 2800void __init rcu_init(void)
@@ -2662,6 +2802,7 @@ void __init rcu_init(void)
2662 int cpu; 2802 int cpu;
2663 2803
2664 rcu_bootup_announce(); 2804 rcu_bootup_announce();
2805 rcu_init_geometry();
2665 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2806 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2666 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2807 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2667 __rcu_init_preempt(); 2808 __rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index ea056495783e..4d29169f2124 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -42,28 +42,28 @@
42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
43 43
44#if NR_CPUS <= RCU_FANOUT_1 44#if NR_CPUS <= RCU_FANOUT_1
45# define NUM_RCU_LVLS 1 45# define RCU_NUM_LVLS 1
46# define NUM_RCU_LVL_0 1 46# define NUM_RCU_LVL_0 1
47# define NUM_RCU_LVL_1 (NR_CPUS) 47# define NUM_RCU_LVL_1 (NR_CPUS)
48# define NUM_RCU_LVL_2 0 48# define NUM_RCU_LVL_2 0
49# define NUM_RCU_LVL_3 0 49# define NUM_RCU_LVL_3 0
50# define NUM_RCU_LVL_4 0 50# define NUM_RCU_LVL_4 0
51#elif NR_CPUS <= RCU_FANOUT_2 51#elif NR_CPUS <= RCU_FANOUT_2
52# define NUM_RCU_LVLS 2 52# define RCU_NUM_LVLS 2
53# define NUM_RCU_LVL_0 1 53# define NUM_RCU_LVL_0 1
54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
55# define NUM_RCU_LVL_2 (NR_CPUS) 55# define NUM_RCU_LVL_2 (NR_CPUS)
56# define NUM_RCU_LVL_3 0 56# define NUM_RCU_LVL_3 0
57# define NUM_RCU_LVL_4 0 57# define NUM_RCU_LVL_4 0
58#elif NR_CPUS <= RCU_FANOUT_3 58#elif NR_CPUS <= RCU_FANOUT_3
59# define NUM_RCU_LVLS 3 59# define RCU_NUM_LVLS 3
60# define NUM_RCU_LVL_0 1 60# define NUM_RCU_LVL_0 1
61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
63# define NUM_RCU_LVL_3 (NR_CPUS) 63# define NUM_RCU_LVL_3 (NR_CPUS)
64# define NUM_RCU_LVL_4 0 64# define NUM_RCU_LVL_4 0
65#elif NR_CPUS <= RCU_FANOUT_4 65#elif NR_CPUS <= RCU_FANOUT_4
66# define NUM_RCU_LVLS 4 66# define RCU_NUM_LVLS 4
67# define NUM_RCU_LVL_0 1 67# define NUM_RCU_LVL_0 1
68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,6 +76,9 @@
76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
78 78
79extern int rcu_num_lvls;
80extern int rcu_num_nodes;
81
79/* 82/*
80 * Dynticks per-CPU state. 83 * Dynticks per-CPU state.
81 */ 84 */
@@ -97,6 +100,7 @@ struct rcu_dynticks {
97 /* # times non-lazy CBs posted to CPU. */ 100 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap; 101 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
101}; 105};
102 106
@@ -206,7 +210,7 @@ struct rcu_node {
206 */ 210 */
207#define rcu_for_each_node_breadth_first(rsp, rnp) \ 211#define rcu_for_each_node_breadth_first(rsp, rnp) \
208 for ((rnp) = &(rsp)->node[0]; \ 212 for ((rnp) = &(rsp)->node[0]; \
209 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 213 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
210 214
211/* 215/*
212 * Do a breadth-first scan of the non-leaf rcu_node structures for the 216 * Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -215,7 +219,7 @@ struct rcu_node {
215 */ 219 */
216#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ 220#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
217 for ((rnp) = &(rsp)->node[0]; \ 221 for ((rnp) = &(rsp)->node[0]; \
218 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) 222 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
219 223
220/* 224/*
221 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state 225 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -224,8 +228,8 @@ struct rcu_node {
224 * It is still a leaf node, even if it is also the root node. 228 * It is still a leaf node, even if it is also the root node.
225 */ 229 */
226#define rcu_for_each_leaf_node(rsp, rnp) \ 230#define rcu_for_each_leaf_node(rsp, rnp) \
227 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 231 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
228 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 232 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
229 233
230/* Index values for nxttail array in struct rcu_data. */ 234/* Index values for nxttail array in struct rcu_data. */
231#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 235#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -311,6 +315,9 @@ struct rcu_data {
311 unsigned long n_rp_need_fqs; 315 unsigned long n_rp_need_fqs;
312 unsigned long n_rp_need_nothing; 316 unsigned long n_rp_need_nothing;
313 317
318 /* 6) _rcu_barrier() callback. */
319 struct rcu_head barrier_head;
320
314 int cpu; 321 int cpu;
315 struct rcu_state *rsp; 322 struct rcu_state *rsp;
316}; 323};
@@ -357,10 +364,12 @@ do { \
357 */ 364 */
358struct rcu_state { 365struct rcu_state {
359 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 366 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
360 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 367 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
361 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 368 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
362 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 369 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
363 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 370 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
371 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
372 void (*func)(struct rcu_head *head));
364 373
365 /* The following fields are guarded by the root rcu_node's lock. */ 374 /* The following fields are guarded by the root rcu_node's lock. */
366 375
@@ -392,6 +401,11 @@ struct rcu_state {
392 struct task_struct *rcu_barrier_in_progress; 401 struct task_struct *rcu_barrier_in_progress;
393 /* Task doing rcu_barrier(), */ 402 /* Task doing rcu_barrier(), */
394 /* or NULL if no barrier. */ 403 /* or NULL if no barrier. */
404 struct mutex barrier_mutex; /* Guards barrier fields. */
405 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
406 struct completion barrier_completion; /* Wake at barrier end. */
407 unsigned long n_barrier_done; /* ++ at start and end of */
408 /* _rcu_barrier(). */
395 raw_spinlock_t fqslock; /* Only one task forcing */ 409 raw_spinlock_t fqslock; /* Only one task forcing */
396 /* quiescent states. */ 410 /* quiescent states. */
397 unsigned long jiffies_force_qs; /* Time at which to invoke */ 411 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -409,8 +423,13 @@ struct rcu_state {
409 unsigned long gp_max; /* Maximum GP duration in */ 423 unsigned long gp_max; /* Maximum GP duration in */
410 /* jiffies. */ 424 /* jiffies. */
411 char *name; /* Name of structure. */ 425 char *name; /* Name of structure. */
426 struct list_head flavors; /* List of RCU flavors. */
412}; 427};
413 428
429extern struct list_head rcu_struct_flavors;
430#define for_each_rcu_flavor(rsp) \
431 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
432
414/* Return values for rcu_preempt_offline_tasks(). */ 433/* Return values for rcu_preempt_offline_tasks(). */
415 434
416#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ 435#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -444,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
444/* Forward declarations for rcutree_plugin.h */ 463/* Forward declarations for rcutree_plugin.h */
445static void rcu_bootup_announce(void); 464static void rcu_bootup_announce(void);
446long rcu_batches_completed(void); 465long rcu_batches_completed(void);
466static void rcu_preempt_note_context_switch(int cpu);
447static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 467static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
448#ifdef CONFIG_HOTPLUG_CPU 468#ifdef CONFIG_HOTPLUG_CPU
449static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 469static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -452,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu);
452#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 472#endif /* #ifdef CONFIG_HOTPLUG_CPU */
453static void rcu_print_detail_task_stall(struct rcu_state *rsp); 473static void rcu_print_detail_task_stall(struct rcu_state *rsp);
454static int rcu_print_task_stall(struct rcu_node *rnp); 474static int rcu_print_task_stall(struct rcu_node *rnp);
455static void rcu_preempt_stall_reset(void);
456static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 475static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
457#ifdef CONFIG_HOTPLUG_CPU 476#ifdef CONFIG_HOTPLUG_CPU
458static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 477static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
459 struct rcu_node *rnp, 478 struct rcu_node *rnp,
460 struct rcu_data *rdp); 479 struct rcu_data *rdp);
461#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 480#endif /* #ifdef CONFIG_HOTPLUG_CPU */
462static void rcu_preempt_cleanup_dead_cpu(int cpu);
463static void rcu_preempt_check_callbacks(int cpu); 481static void rcu_preempt_check_callbacks(int cpu);
464static void rcu_preempt_process_callbacks(void);
465void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 482void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
466#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 483#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
467static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 484static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
468 bool wake); 485 bool wake);
469#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 486#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
470static int rcu_preempt_pending(int cpu);
471static int rcu_preempt_cpu_has_callbacks(int cpu);
472static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
473static void rcu_preempt_cleanup_dying_cpu(void);
474static void __init __rcu_init_preempt(void); 487static void __init __rcu_init_preempt(void);
475static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 488static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
476static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 489static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 5271a020887e..7f3244c0df01 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
69#endif 69#endif
70#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
72#endif 72#endif
73 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
74 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
75 if (nr_cpu_ids != NR_CPUS)
76 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
73} 77}
74 78
75#ifdef CONFIG_TREE_PREEMPT_RCU 79#ifdef CONFIG_TREE_PREEMPT_RCU
76 80
77struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); 81struct rcu_state rcu_preempt_state =
82 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
78DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
79static struct rcu_state *rcu_state = &rcu_preempt_state; 84static struct rcu_state *rcu_state = &rcu_preempt_state;
80 85
81static void rcu_read_unlock_special(struct task_struct *t);
82static int rcu_preempted_readers_exp(struct rcu_node *rnp); 86static int rcu_preempted_readers_exp(struct rcu_node *rnp);
83 87
84/* 88/*
@@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu)
153 * 157 *
154 * Caller must disable preemption. 158 * Caller must disable preemption.
155 */ 159 */
156void rcu_preempt_note_context_switch(void) 160static void rcu_preempt_note_context_switch(int cpu)
157{ 161{
158 struct task_struct *t = current; 162 struct task_struct *t = current;
159 unsigned long flags; 163 unsigned long flags;
@@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 168 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 169
166 /* Possibly blocking in an RCU read-side critical section. */ 170 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = __this_cpu_ptr(rcu_preempt_state.rda); 171 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
168 rnp = rdp->mynode; 172 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 173 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 174 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void)
228 * means that we continue to block the current grace period. 232 * means that we continue to block the current grace period.
229 */ 233 */
230 local_irq_save(flags); 234 local_irq_save(flags);
231 rcu_preempt_qs(smp_processor_id()); 235 rcu_preempt_qs(cpu);
232 local_irq_restore(flags); 236 local_irq_restore(flags);
233} 237}
234 238
235/* 239/*
236 * Tree-preemptible RCU implementation for rcu_read_lock().
237 * Just increment ->rcu_read_lock_nesting, shared state will be updated
238 * if we block.
239 */
240void __rcu_read_lock(void)
241{
242 current->rcu_read_lock_nesting++;
243 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
244}
245EXPORT_SYMBOL_GPL(__rcu_read_lock);
246
247/*
248 * Check for preempted RCU readers blocking the current grace period 240 * Check for preempted RCU readers blocking the current grace period
249 * for the specified rcu_node structure. If the caller needs a reliable 241 * for the specified rcu_node structure. If the caller needs a reliable
250 * answer, it must hold the rcu_node's ->lock. 242 * answer, it must hold the rcu_node's ->lock.
@@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
310 * notify RCU core processing or task having blocked during the RCU 302 * notify RCU core processing or task having blocked during the RCU
311 * read-side critical section. 303 * read-side critical section.
312 */ 304 */
313static noinline void rcu_read_unlock_special(struct task_struct *t) 305void rcu_read_unlock_special(struct task_struct *t)
314{ 306{
315 int empty; 307 int empty;
316 int empty_exp; 308 int empty_exp;
@@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
398 rnp->grphi, 390 rnp->grphi,
399 !!rnp->gp_tasks); 391 !!rnp->gp_tasks);
400 rcu_report_unblock_qs_rnp(rnp, flags); 392 rcu_report_unblock_qs_rnp(rnp, flags);
401 } else 393 } else {
402 raw_spin_unlock_irqrestore(&rnp->lock, flags); 394 raw_spin_unlock_irqrestore(&rnp->lock, flags);
395 }
403 396
404#ifdef CONFIG_RCU_BOOST 397#ifdef CONFIG_RCU_BOOST
405 /* Unboost if we were boosted. */ 398 /* Unboost if we were boosted. */
@@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
418 } 411 }
419} 412}
420 413
421/*
422 * Tree-preemptible RCU implementation for rcu_read_unlock().
423 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
424 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
425 * invoke rcu_read_unlock_special() to clean up after a context switch
426 * in an RCU read-side critical section and other special cases.
427 */
428void __rcu_read_unlock(void)
429{
430 struct task_struct *t = current;
431
432 if (t->rcu_read_lock_nesting != 1)
433 --t->rcu_read_lock_nesting;
434 else {
435 barrier(); /* critical section before exit code. */
436 t->rcu_read_lock_nesting = INT_MIN;
437 barrier(); /* assign before ->rcu_read_unlock_special load */
438 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
439 rcu_read_unlock_special(t);
440 barrier(); /* ->rcu_read_unlock_special load before assign */
441 t->rcu_read_lock_nesting = 0;
442 }
443#ifdef CONFIG_PROVE_LOCKING
444 {
445 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
446
447 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
448 }
449#endif /* #ifdef CONFIG_PROVE_LOCKING */
450}
451EXPORT_SYMBOL_GPL(__rcu_read_unlock);
452
453#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 414#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
454 415
455/* 416/*
@@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
540} 501}
541 502
542/* 503/*
543 * Suppress preemptible RCU's CPU stall warnings by pushing the
544 * time of the next stall-warning message comfortably far into the
545 * future.
546 */
547static void rcu_preempt_stall_reset(void)
548{
549 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
550}
551
552/*
553 * Check that the list of blocked tasks for the newly completed grace 504 * Check that the list of blocked tasks for the newly completed grace
554 * period is in fact empty. It is a serious bug to complete a grace 505 * period is in fact empty. It is a serious bug to complete a grace
555 * period that still has RCU readers blocked! This function must be 506 * period that still has RCU readers blocked! This function must be
@@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 601#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651 602
652/* 603/*
653 * Do CPU-offline processing for preemptible RCU.
654 */
655static void rcu_preempt_cleanup_dead_cpu(int cpu)
656{
657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
658}
659
660/*
661 * Check for a quiescent state from the current CPU. When a task blocks, 604 * Check for a quiescent state from the current CPU. When a task blocks,
662 * the task is recorded in the corresponding CPU's rcu_node structure, 605 * the task is recorded in the corresponding CPU's rcu_node structure,
663 * which is checked elsewhere. 606 * which is checked elsewhere.
@@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu)
677 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 620 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
678} 621}
679 622
680/*
681 * Process callbacks for preemptible RCU.
682 */
683static void rcu_preempt_process_callbacks(void)
684{
685 __rcu_process_callbacks(&rcu_preempt_state,
686 &__get_cpu_var(rcu_preempt_data));
687}
688
689#ifdef CONFIG_RCU_BOOST 623#ifdef CONFIG_RCU_BOOST
690 624
691static void rcu_preempt_do_callbacks(void) 625static void rcu_preempt_do_callbacks(void)
@@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
824 int must_wait = 0; 758 int must_wait = 0;
825 759
826 raw_spin_lock_irqsave(&rnp->lock, flags); 760 raw_spin_lock_irqsave(&rnp->lock, flags);
827 if (list_empty(&rnp->blkd_tasks)) 761 if (list_empty(&rnp->blkd_tasks)) {
828 raw_spin_unlock_irqrestore(&rnp->lock, flags); 762 raw_spin_unlock_irqrestore(&rnp->lock, flags);
829 else { 763 } else {
830 rnp->exp_tasks = rnp->blkd_tasks.next; 764 rnp->exp_tasks = rnp->blkd_tasks.next;
831 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 765 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
832 must_wait = 1; 766 must_wait = 1;
@@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void)
870 * expedited grace period for us, just leave. 804 * expedited grace period for us, just leave.
871 */ 805 */
872 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 806 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
873 if (trycount++ < 10) 807 if (trycount++ < 10) {
874 udelay(trycount * num_online_cpus()); 808 udelay(trycount * num_online_cpus());
875 else { 809 } else {
876 synchronize_rcu(); 810 synchronize_rcu();
877 return; 811 return;
878 } 812 }
@@ -917,51 +851,16 @@ mb_ret:
917} 851}
918EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 852EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
919 853
920/*
921 * Check to see if there is any immediate preemptible-RCU-related work
922 * to be done.
923 */
924static int rcu_preempt_pending(int cpu)
925{
926 return __rcu_pending(&rcu_preempt_state,
927 &per_cpu(rcu_preempt_data, cpu));
928}
929
930/*
931 * Does preemptible RCU have callbacks on this CPU?
932 */
933static int rcu_preempt_cpu_has_callbacks(int cpu)
934{
935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
936}
937
938/** 854/**
939 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 855 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
940 */ 856 */
941void rcu_barrier(void) 857void rcu_barrier(void)
942{ 858{
943 _rcu_barrier(&rcu_preempt_state, call_rcu); 859 _rcu_barrier(&rcu_preempt_state);
944} 860}
945EXPORT_SYMBOL_GPL(rcu_barrier); 861EXPORT_SYMBOL_GPL(rcu_barrier);
946 862
947/* 863/*
948 * Initialize preemptible RCU's per-CPU data.
949 */
950static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
951{
952 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
953}
954
955/*
956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
958 */
959static void rcu_preempt_cleanup_dying_cpu(void)
960{
961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
962}
963
964/*
965 * Initialize preemptible RCU's state structures. 864 * Initialize preemptible RCU's state structures.
966 */ 865 */
967static void __init __rcu_init_preempt(void) 866static void __init __rcu_init_preempt(void)
@@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void)
1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 901EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1003 902
1004/* 903/*
904 * Because preemptible RCU does not exist, we never have to check for
905 * CPUs being in quiescent states.
906 */
907static void rcu_preempt_note_context_switch(int cpu)
908{
909}
910
911/*
1005 * Because preemptible RCU does not exist, there are never any preempted 912 * Because preemptible RCU does not exist, there are never any preempted
1006 * RCU readers. 913 * RCU readers.
1007 */ 914 */
@@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
1038} 945}
1039 946
1040/* 947/*
1041 * Because preemptible RCU does not exist, there is no need to suppress
1042 * its CPU stall warnings.
1043 */
1044static void rcu_preempt_stall_reset(void)
1045{
1046}
1047
1048/*
1049 * Because there is no preemptible RCU, there can be no readers blocked, 948 * Because there is no preemptible RCU, there can be no readers blocked,
1050 * so there is no need to check for blocked tasks. So check only for 949 * so there is no need to check for blocked tasks. So check only for
1051 * bogus qsmask values. 950 * bogus qsmask values.
@@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1073#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 972#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1074 973
1075/* 974/*
1076 * Because preemptible RCU does not exist, it never needs CPU-offline
1077 * processing.
1078 */
1079static void rcu_preempt_cleanup_dead_cpu(int cpu)
1080{
1081}
1082
1083/*
1084 * Because preemptible RCU does not exist, it never has any callbacks 975 * Because preemptible RCU does not exist, it never has any callbacks
1085 * to check. 976 * to check.
1086 */ 977 */
@@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1089} 980}
1090 981
1091/* 982/*
1092 * Because preemptible RCU does not exist, it never has any callbacks
1093 * to process.
1094 */
1095static void rcu_preempt_process_callbacks(void)
1096{
1097}
1098
1099/*
1100 * Queue an RCU callback for lazy invocation after a grace period. 983 * Queue an RCU callback for lazy invocation after a grace period.
1101 * This will likely be later named something like "call_rcu_lazy()", 984 * This will likely be later named something like "call_rcu_lazy()",
1102 * but this change will require some way of tagging the lazy RCU 985 * but this change will require some way of tagging the lazy RCU
@@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1137#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1020#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1138 1021
1139/* 1022/*
1140 * Because preemptible RCU does not exist, it never has any work to do.
1141 */
1142static int rcu_preempt_pending(int cpu)
1143{
1144 return 0;
1145}
1146
1147/*
1148 * Because preemptible RCU does not exist, it never has callbacks
1149 */
1150static int rcu_preempt_cpu_has_callbacks(int cpu)
1151{
1152 return 0;
1153}
1154
1155/*
1156 * Because preemptible RCU does not exist, rcu_barrier() is just 1023 * Because preemptible RCU does not exist, rcu_barrier() is just
1157 * another name for rcu_barrier_sched(). 1024 * another name for rcu_barrier_sched().
1158 */ 1025 */
@@ -1163,21 +1030,6 @@ void rcu_barrier(void)
1163EXPORT_SYMBOL_GPL(rcu_barrier); 1030EXPORT_SYMBOL_GPL(rcu_barrier);
1164 1031
1165/* 1032/*
1166 * Because preemptible RCU does not exist, there is no per-CPU
1167 * data to initialize.
1168 */
1169static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1170{
1171}
1172
1173/*
1174 * Because there is no preemptible RCU, there is no cleanup to do.
1175 */
1176static void rcu_preempt_cleanup_dying_cpu(void)
1177{
1178}
1179
1180/*
1181 * Because preemptible RCU does not exist, it need not be initialized. 1033 * Because preemptible RCU does not exist, it need not be initialized.
1182 */ 1034 */
1183static void __init __rcu_init_preempt(void) 1035static void __init __rcu_init_preempt(void)
@@ -1960,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void)
1960 */ 1812 */
1961#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1813#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1962#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1814#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1963#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1815#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1964#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1816#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1965 1817
1818extern int tick_nohz_enabled;
1819
1966/* 1820/*
1967 * Does the specified flavor of RCU have non-lazy callbacks pending on 1821 * Does the specified flavor of RCU have non-lazy callbacks pending on
1968 * the specified CPU? Both RCU flavor and CPU are specified by the 1822 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2039,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2039 return 1; 1893 return 1;
2040 } 1894 }
2041 /* Set up for the possibility that RCU will post a timer. */ 1895 /* Set up for the possibility that RCU will post a timer. */
2042 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 1896 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2043 *delta_jiffies = RCU_IDLE_GP_DELAY; 1897 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
2044 else 1898 RCU_IDLE_GP_DELAY) - jiffies;
2045 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; 1899 } else {
1900 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1901 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1902 }
2046 return 0; 1903 return 0;
2047} 1904}
2048 1905
@@ -2101,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu)
2101 1958
2102 del_timer(&rdtp->idle_gp_timer); 1959 del_timer(&rdtp->idle_gp_timer);
2103 trace_rcu_prep_idle("Cleanup after idle"); 1960 trace_rcu_prep_idle("Cleanup after idle");
1961 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
2104} 1962}
2105 1963
2106/* 1964/*
@@ -2126,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu)
2126{ 1984{
2127 struct timer_list *tp; 1985 struct timer_list *tp;
2128 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1986 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1987 int tne;
1988
1989 /* Handle nohz enablement switches conservatively. */
1990 tne = ACCESS_ONCE(tick_nohz_enabled);
1991 if (tne != rdtp->tick_nohz_enabled_snap) {
1992 if (rcu_cpu_has_callbacks(cpu))
1993 invoke_rcu_core(); /* force nohz to see update. */
1994 rdtp->tick_nohz_enabled_snap = tne;
1995 return;
1996 }
1997 if (!tne)
1998 return;
2129 1999
2130 /* 2000 /*
2131 * If this is an idle re-entry, for example, due to use of 2001 * If this is an idle re-entry, for example, due to use of
@@ -2179,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu)
2179 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 2049 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2180 trace_rcu_prep_idle("Dyntick with callbacks"); 2050 trace_rcu_prep_idle("Dyntick with callbacks");
2181 rdtp->idle_gp_timer_expires = 2051 rdtp->idle_gp_timer_expires =
2182 jiffies + RCU_IDLE_GP_DELAY; 2052 round_up(jiffies + RCU_IDLE_GP_DELAY,
2053 RCU_IDLE_GP_DELAY);
2183 } else { 2054 } else {
2184 rdtp->idle_gp_timer_expires = 2055 rdtp->idle_gp_timer_expires =
2185 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2056 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
2186 trace_rcu_prep_idle("Dyntick with lazy callbacks"); 2057 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2187 } 2058 }
2188 tp = &rdtp->idle_gp_timer; 2059 tp = &rdtp->idle_gp_timer;
@@ -2223,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu)
2223 if (rcu_cpu_has_callbacks(cpu)) { 2094 if (rcu_cpu_has_callbacks(cpu)) {
2224 trace_rcu_prep_idle("More callbacks"); 2095 trace_rcu_prep_idle("More callbacks");
2225 invoke_rcu_core(); 2096 invoke_rcu_core();
2226 } else 2097 } else {
2227 trace_rcu_prep_idle("Callbacks drained"); 2098 trace_rcu_prep_idle("Callbacks drained");
2099 }
2228} 2100}
2229 2101
2230/* 2102/*
@@ -2261,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2261 2133
2262static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2134static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2263{ 2135{
2136 *cp = '\0';
2264} 2137}
2265 2138
2266#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ 2139#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..abffb486e94e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,31 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused)
50{
51 struct rcu_state *rsp;
52
53 for_each_rcu_flavor(rsp)
54 seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
55 rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
56 atomic_read(&rsp->barrier_cpu_count),
57 rsp->n_barrier_done);
58 return 0;
59}
60
61static int rcubarrier_open(struct inode *inode, struct file *file)
62{
63 return single_open(file, show_rcubarrier, NULL);
64}
65
66static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE,
68 .open = rcubarrier_open,
69 .read = seq_read,
70 .llseek = seq_lseek,
71 .release = single_release,
72};
73
49#ifdef CONFIG_RCU_BOOST 74#ifdef CONFIG_RCU_BOOST
50 75
51static char convert_kthread_status(unsigned int kthread_status) 76static char convert_kthread_status(unsigned int kthread_status)
@@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 120 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
96} 121}
97 122
98#define PRINT_RCU_DATA(name, func, m) \
99 do { \
100 int _p_r_d_i; \
101 \
102 for_each_possible_cpu(_p_r_d_i) \
103 func(m, &per_cpu(name, _p_r_d_i)); \
104 } while (0)
105
106static int show_rcudata(struct seq_file *m, void *unused) 123static int show_rcudata(struct seq_file *m, void *unused)
107{ 124{
108#ifdef CONFIG_TREE_PREEMPT_RCU 125 int cpu;
109 seq_puts(m, "rcu_preempt:\n"); 126 struct rcu_state *rsp;
110 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); 127
111#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 128 for_each_rcu_flavor(rsp) {
112 seq_puts(m, "rcu_sched:\n"); 129 seq_printf(m, "%s:\n", rsp->name);
113 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); 130 for_each_possible_cpu(cpu)
114 seq_puts(m, "rcu_bh:\n"); 131 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
115 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 132 }
116 return 0; 133 return 0;
117} 134}
118 135
@@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
166 183
167static int show_rcudata_csv(struct seq_file *m, void *unused) 184static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 185{
186 int cpu;
187 struct rcu_state *rsp;
188
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 189 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 190 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 191 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
@@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
173 seq_puts(m, "\"kt\",\"ktl\""); 193 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 194#endif /* #ifdef CONFIG_RCU_BOOST */
175 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 195 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
176#ifdef CONFIG_TREE_PREEMPT_RCU 196 for_each_rcu_flavor(rsp) {
177 seq_puts(m, "\"rcu_preempt:\"\n"); 197 seq_printf(m, "\"%s:\"\n", rsp->name);
178 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 198 for_each_possible_cpu(cpu)
179#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 199 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu));
180 seq_puts(m, "\"rcu_sched:\"\n"); 200 }
181 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
182 seq_puts(m, "\"rcu_bh:\"\n");
183 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
184 return 0; 201 return 0;
185} 202}
186 203
@@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = {
201 218
202static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) 219static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
203{ 220{
204 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " 221 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
205 "j=%04x bt=%04x\n",
206 rnp->grplo, rnp->grphi, 222 rnp->grplo, rnp->grphi,
207 "T."[list_empty(&rnp->blkd_tasks)], 223 "T."[list_empty(&rnp->blkd_tasks)],
208 "N."[!rnp->gp_tasks], 224 "N."[!rnp->gp_tasks],
@@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
210 "B."[!rnp->boost_tasks], 226 "B."[!rnp->boost_tasks],
211 convert_kthread_status(rnp->boost_kthread_status), 227 convert_kthread_status(rnp->boost_kthread_status),
212 rnp->n_tasks_boosted, rnp->n_exp_boosts, 228 rnp->n_tasks_boosted, rnp->n_exp_boosts,
213 rnp->n_normal_boosts, 229 rnp->n_normal_boosts);
230 seq_printf(m, "j=%04x bt=%04x\n",
214 (int)(jiffies & 0xffff), 231 (int)(jiffies & 0xffff),
215 (int)(rnp->boost_time & 0xffff)); 232 (int)(rnp->boost_time & 0xffff));
216 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", 233 seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
217 " balk",
218 rnp->n_balk_blkd_tasks, 234 rnp->n_balk_blkd_tasks,
219 rnp->n_balk_exp_gp_tasks, 235 rnp->n_balk_exp_gp_tasks,
220 rnp->n_balk_boost_tasks, 236 rnp->n_balk_boost_tasks,
@@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
270 struct rcu_node *rnp; 286 struct rcu_node *rnp;
271 287
272 gpnum = rsp->gpnum; 288 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 289 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ",
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 290 rsp->name, rsp->completed, gpnum, rsp->fqs_state,
275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 291 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 292 (int)(jiffies & 0xffff));
293 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 294 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 295 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 296 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 297 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
282 if (rnp->level != level) { 298 if (rnp->level != level) {
283 seq_puts(m, "\n"); 299 seq_puts(m, "\n");
284 level = rnp->level; 300 level = rnp->level;
@@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
295 311
296static int show_rcuhier(struct seq_file *m, void *unused) 312static int show_rcuhier(struct seq_file *m, void *unused)
297{ 313{
298#ifdef CONFIG_TREE_PREEMPT_RCU 314 struct rcu_state *rsp;
299 seq_puts(m, "rcu_preempt:\n"); 315
300 print_one_rcu_state(m, &rcu_preempt_state); 316 for_each_rcu_flavor(rsp)
301#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 317 print_one_rcu_state(m, rsp);
302 seq_puts(m, "rcu_sched:\n");
303 print_one_rcu_state(m, &rcu_sched_state);
304 seq_puts(m, "rcu_bh:\n");
305 print_one_rcu_state(m, &rcu_bh_state);
306 return 0; 318 return 0;
307} 319}
308 320
@@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
343 355
344static int show_rcugp(struct seq_file *m, void *unused) 356static int show_rcugp(struct seq_file *m, void *unused)
345{ 357{
346#ifdef CONFIG_TREE_PREEMPT_RCU 358 struct rcu_state *rsp;
347 show_one_rcugp(m, &rcu_preempt_state); 359
348#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 360 for_each_rcu_flavor(rsp)
349 show_one_rcugp(m, &rcu_sched_state); 361 show_one_rcugp(m, rsp);
350 show_one_rcugp(m, &rcu_bh_state);
351 return 0; 362 return 0;
352} 363}
353 364
@@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = {
366 377
367static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 378static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
368{ 379{
369 seq_printf(m, "%3d%cnp=%ld " 380 seq_printf(m, "%3d%cnp=%ld ",
370 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
371 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
372 rdp->cpu, 381 rdp->cpu,
373 cpu_is_offline(rdp->cpu) ? '!' : ' ', 382 cpu_is_offline(rdp->cpu) ? '!' : ' ',
374 rdp->n_rcu_pending, 383 rdp->n_rcu_pending);
384 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
375 rdp->n_rp_qs_pending, 385 rdp->n_rp_qs_pending,
376 rdp->n_rp_report_qs, 386 rdp->n_rp_report_qs,
377 rdp->n_rp_cb_ready, 387 rdp->n_rp_cb_ready,
378 rdp->n_rp_cpu_needs_gp, 388 rdp->n_rp_cpu_needs_gp);
389 seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
379 rdp->n_rp_gp_completed, 390 rdp->n_rp_gp_completed,
380 rdp->n_rp_gp_started, 391 rdp->n_rp_gp_started,
381 rdp->n_rp_need_fqs, 392 rdp->n_rp_need_fqs,
382 rdp->n_rp_need_nothing); 393 rdp->n_rp_need_nothing);
383} 394}
384 395
385static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) 396static int show_rcu_pending(struct seq_file *m, void *unused)
386{ 397{
387 int cpu; 398 int cpu;
388 struct rcu_data *rdp; 399 struct rcu_data *rdp;
389 400 struct rcu_state *rsp;
390 for_each_possible_cpu(cpu) { 401
391 rdp = per_cpu_ptr(rsp->rda, cpu); 402 for_each_rcu_flavor(rsp) {
392 if (rdp->beenonline) 403 seq_printf(m, "%s:\n", rsp->name);
393 print_one_rcu_pending(m, rdp); 404 for_each_possible_cpu(cpu) {
405 rdp = per_cpu_ptr(rsp->rda, cpu);
406 if (rdp->beenonline)
407 print_one_rcu_pending(m, rdp);
408 }
394 } 409 }
395}
396
397static int show_rcu_pending(struct seq_file *m, void *unused)
398{
399#ifdef CONFIG_TREE_PREEMPT_RCU
400 seq_puts(m, "rcu_preempt:\n");
401 print_rcu_pendings(m, &rcu_preempt_state);
402#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
403 seq_puts(m, "rcu_sched:\n");
404 print_rcu_pendings(m, &rcu_sched_state);
405 seq_puts(m, "rcu_bh:\n");
406 print_rcu_pendings(m, &rcu_bh_state);
407 return 0; 410 return 0;
408} 411}
409 412
@@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void)
453 if (!rcudir) 456 if (!rcudir)
454 goto free_out; 457 goto free_out;
455 458
459 retval = debugfs_create_file("rcubarrier", 0444, rcudir,
460 NULL, &rcubarrier_fops);
461 if (!retval)
462 goto free_out;
463
456 retval = debugfs_create_file("rcudata", 0444, rcudir, 464 retval = debugfs_create_file("rcudata", 0444, rcudir,
457 NULL, &rcudata_fops); 465 NULL, &rcudata_fops);
458 if (!retval) 466 if (!retval)
diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8ee76d5..dc8b47764443 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -722,14 +722,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
722 722
723 write_lock(&resource_lock); 723 write_lock(&resource_lock);
724 724
725 if (!parent)
726 goto skip;
727
725 if ((start < parent->start) || (end > parent->end)) 728 if ((start < parent->start) || (end > parent->end))
726 goto out; 729 goto out;
727 730
728 for (tmp = res->child; tmp; tmp = tmp->sibling) {
729 if ((tmp->start < start) || (tmp->end > end))
730 goto out;
731 }
732
733 if (res->sibling && (res->sibling->start <= end)) 731 if (res->sibling && (res->sibling->start <= end))
734 goto out; 732 goto out;
735 733
@@ -741,6 +739,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
741 goto out; 739 goto out;
742 } 740 }
743 741
742skip:
743 for (tmp = res->child; tmp; tmp = tmp->sibling)
744 if ((tmp->start < start) || (tmp->end > end))
745 goto out;
746
744 res->start = start; 747 res->start = start;
745 res->end = end; 748 res->end = end;
746 result = 0; 749 result = 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d4..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2081#endif 2081#endif
2082 2082
2083 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2085 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2086 2085
2087 barrier(); 2086 barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
2161} 2160}
2162 2161
2163 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns in the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
2198 * again, being late doesn't loose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2208 */
2209
2164/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2165static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2166static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2167unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2168EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2169 2230
2170static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2171{ 2232{
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2182 return delta; 2243 return delta;
2183} 2244}
2184 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2185static unsigned long 2249static unsigned long
2186calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2187{ 2251{
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2193 2257
2194#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2195/* 2259/*
2196 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
2287 * accumlating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2197 * 2298 *
2198 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2199 */ 2300 */
2200static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2201 2303
2202void calc_load_account_idle(struct rq *this_rq) 2304static inline int calc_load_write_idx(void)
2203{ 2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2204 long delta; 2332 long delta;
2205 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2206 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2207 if (delta) 2339 if (delta) {
2208 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2209} 2343}
2210 2344
2211static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2212{ 2346{
2213 long delta = 0; 2347 struct rq *this_rq = this_rq();
2348
2349 /*
2350 * If we're still before the sample window, we're done.
2351 */
2352 if (time_before(jiffies, this_rq->calc_load_update))
2353 return;
2214 2354
2215 /* 2355 /*
2216 * Its got a race, we don't care... 2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2217 */ 2359 */
2218 if (atomic_long_read(&calc_load_tasks_idle)) 2360 this_rq->calc_load_update = calc_load_update;
2219 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2220 2372
2221 return delta; 2373 return delta;
2222} 2374}
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
2302{ 2454{
2303 long delta, active, n; 2455 long delta, active, n;
2304 2456
2305 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2306 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2307 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2308 * missed the tick driven calc_load_account_active() update 2460 */
2309 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2310 */ 2462 n = 1 + (delta / LOAD_FREQ);
2311 delta = calc_load_fold_idle();
2312 if (delta)
2313 atomic_long_add(delta, &calc_load_tasks);
2314
2315 /*
2316 * It could be the one fold was all it took, we done!
2317 */
2318 if (time_before(jiffies, calc_load_update + 10))
2319 return;
2320
2321 /*
2322 * Catch-up, fold however many we are behind still
2323 */
2324 delta = jiffies - calc_load_update - 10;
2325 n = 1 + (delta / LOAD_FREQ);
2326 2463
2327 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2328 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2329 2466
2330 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2331 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2332 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2333 2470
2334 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2335} 2472 }
2336#else
2337void calc_load_account_idle(struct rq *this_rq)
2338{
2339}
2340 2473
2341static inline long calc_load_fold_idle(void) 2474 /*
2342{ 2475 * Flip the idle index...
2343 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2344} 2483}
2484#else /* !CONFIG_NO_HZ */
2345 2485
2346static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2347{ 2487static inline void calc_global_nohz(void) { }
2348}
2349#endif
2350 2488
2351/** 2489#endif /* CONFIG_NO_HZ */
2352 * get_avenrun - get the load average array
2353 * @loads: pointer to dest load array
2354 * @offset: offset to add
2355 * @shift: shift count to shift the result left
2356 *
2357 * These values are estimates at best, so no need for locking.
2358 */
2359void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2360{
2361 loads[0] = (avenrun[0] + offset) << shift;
2362 loads[1] = (avenrun[1] + offset) << shift;
2363 loads[2] = (avenrun[2] + offset) << shift;
2364}
2365 2490
2366/* 2491/*
2367 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2369 */ 2494 */
2370void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2371{ 2496{
2372 long active; 2497 long active, delta;
2373 2498
2374 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2375 return; 2500 return;
2376 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2377 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2378 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2379 2511
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2384 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2385 2517
2386 /* 2518 /*
2387 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2388 * folding in the nohz state and ageing the entire idle period.
2389 *
2390 * This avoids loosing a sample when we go idle between
2391 * calc_load_account_active() (10 ticks ago) and now and thus
2392 * under-accounting.
2393 */ 2520 */
2394 calc_global_nohz(); 2521 calc_global_nohz();
2395} 2522}
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2406 return; 2533 return;
2407 2534
2408 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2409 delta += calc_load_fold_idle();
2410 if (delta) 2536 if (delta)
2411 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2412 2538
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2414} 2540}
2415 2541
2416/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2417 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2418 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2419 * 2549 *
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea7f33d..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)
942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
943} 943}
944 944
945void calc_load_account_idle(struct rq *this_rq);
946
947#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
948 946
949/* 947/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 677102789cf2..be4f856d52f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1971void ptrace_notify(int exit_code) 1971void ptrace_notify(int exit_code)
1972{ 1972{
1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); 1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1974 if (unlikely(current->task_works)) {
1975 if (test_and_clear_ti_thread_flag(current_thread_info(),
1976 TIF_NOTIFY_RESUME)) {
1977 smp_mb__after_clear_bit();
1978 task_work_run();
1979 }
1980 }
1974 1981
1975 spin_lock_irq(&current->sighand->siglock); 1982 spin_lock_irq(&current->sighand->siglock);
1976 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); 1983 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2191 struct signal_struct *signal = current->signal; 2198 struct signal_struct *signal = current->signal;
2192 int signr; 2199 int signr;
2193 2200
2201 if (unlikely(current->task_works)) {
2202 if (test_and_clear_ti_thread_flag(current_thread_info(),
2203 TIF_NOTIFY_RESUME)) {
2204 smp_mb__after_clear_bit();
2205 task_work_run();
2206 }
2207 }
2208
2194 if (unlikely(uprobe_deny_signal())) 2209 if (unlikely(uprobe_deny_signal()))
2195 return 0; 2210 return 0;
2196 2211
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae5b24875e..29dd40a9f2f4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
581 return 0; 581 return 0;
582} 582}
583EXPORT_SYMBOL(smp_call_function); 583EXPORT_SYMBOL(smp_call_function);
584
585void ipi_call_lock(void)
586{
587 raw_spin_lock(&call_function.lock);
588}
589
590void ipi_call_unlock(void)
591{
592 raw_spin_unlock(&call_function.lock);
593}
594
595void ipi_call_lock_irq(void)
596{
597 raw_spin_lock_irq(&call_function.lock);
598}
599
600void ipi_call_unlock_irq(void)
601{
602 raw_spin_unlock_irq(&call_function.lock);
603}
604#endif /* USE_GENERIC_SMP_HELPERS */ 584#endif /* USE_GENERIC_SMP_HELPERS */
605 585
606/* Setup configured maximum number of CPUs to activate */ 586/* Setup configured maximum number of CPUs to activate */
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb8472..6ef9433e1c70 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -3,8 +3,6 @@
3 3
4struct task_struct; 4struct task_struct;
5 5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 6#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu); 7struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void); 8void idle_thread_set_boot_cpu(void);
diff --git a/kernel/sys.c b/kernel/sys.c
index e0c8ffc50d7f..2d39a84cd857 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1790{
1791 struct vm_area_struct *vma;
1792 struct file *exe_file; 1791 struct file *exe_file;
1793 struct dentry *dentry; 1792 struct dentry *dentry;
1794 int err; 1793 int err;
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1816 down_write(&mm->mmap_sem); 1815 down_write(&mm->mmap_sem);
1817 1816
1818 /* 1817 /*
1819 * Forbid mm->exe_file change if there are mapped other files. 1818 * Forbid mm->exe_file change if old file still mapped.
1820 */ 1819 */
1821 err = -EBUSY; 1820 err = -EBUSY;
1822 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1821 if (mm->exe_file) {
1823 if (vma->vm_file && !path_equal(&vma->vm_file->f_path, 1822 struct vm_area_struct *vma;
1824 &exe_file->f_path)) 1823
1825 goto exit_unlock; 1824 for (vma = mm->mmap; vma; vma = vma->vm_next)
1825 if (vma->vm_file &&
1826 path_equal(&vma->vm_file->f_path,
1827 &mm->exe_file->f_path))
1828 goto exit_unlock;
1826 } 1829 }
1827 1830
1828 /* 1831 /*
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1835 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1838 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1836 goto exit_unlock; 1839 goto exit_unlock;
1837 1840
1841 err = 0;
1838 set_mm_exe_file(mm, exe_file); 1842 set_mm_exe_file(mm, exe_file);
1839exit_unlock: 1843exit_unlock:
1840 up_write(&mm->mmap_sem); 1844 up_write(&mm->mmap_sem);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 82d1c794066d..91d4e1742a0c 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -3,82 +3,78 @@
3#include <linux/tracehook.h> 3#include <linux/tracehook.h>
4 4
5int 5int
6task_work_add(struct task_struct *task, struct task_work *twork, bool notify) 6task_work_add(struct task_struct *task, struct callback_head *twork, bool notify)
7{ 7{
8 struct callback_head *last, *first;
8 unsigned long flags; 9 unsigned long flags;
9 int err = -ESRCH;
10 10
11#ifndef TIF_NOTIFY_RESUME
12 if (notify)
13 return -ENOTSUPP;
14#endif
15 /* 11 /*
16 * We must not insert the new work if the task has already passed 12 * Not inserting the new work if the task has already passed
17 * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() 13 * exit_task_work() is the responisbility of callers.
18 * and check PF_EXITING under pi_lock.
19 */ 14 */
20 raw_spin_lock_irqsave(&task->pi_lock, flags); 15 raw_spin_lock_irqsave(&task->pi_lock, flags);
21 if (likely(!(task->flags & PF_EXITING))) { 16 last = task->task_works;
22 hlist_add_head(&twork->hlist, &task->task_works); 17 first = last ? last->next : twork;
23 err = 0; 18 twork->next = first;
24 } 19 if (last)
20 last->next = twork;
21 task->task_works = twork;
25 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 22 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
26 23
27 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ 24 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
28 if (likely(!err) && notify) 25 if (notify)
29 set_notify_resume(task); 26 set_notify_resume(task);
30 return err; 27 return 0;
31} 28}
32 29
33struct task_work * 30struct callback_head *
34task_work_cancel(struct task_struct *task, task_work_func_t func) 31task_work_cancel(struct task_struct *task, task_work_func_t func)
35{ 32{
36 unsigned long flags; 33 unsigned long flags;
37 struct task_work *twork; 34 struct callback_head *last, *res = NULL;
38 struct hlist_node *pos;
39 35
40 raw_spin_lock_irqsave(&task->pi_lock, flags); 36 raw_spin_lock_irqsave(&task->pi_lock, flags);
41 hlist_for_each_entry(twork, pos, &task->task_works, hlist) { 37 last = task->task_works;
42 if (twork->func == func) { 38 if (last) {
43 hlist_del(&twork->hlist); 39 struct callback_head *q = last, *p = q->next;
44 goto found; 40 while (1) {
41 if (p->func == func) {
42 q->next = p->next;
43 if (p == last)
44 task->task_works = q == p ? NULL : q;
45 res = p;
46 break;
47 }
48 if (p == last)
49 break;
50 q = p;
51 p = q->next;
45 } 52 }
46 } 53 }
47 twork = NULL;
48 found:
49 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 54 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
50 55 return res;
51 return twork;
52} 56}
53 57
54void task_work_run(void) 58void task_work_run(void)
55{ 59{
56 struct task_struct *task = current; 60 struct task_struct *task = current;
57 struct hlist_head task_works; 61 struct callback_head *p, *q;
58 struct hlist_node *pos;
59 62
60 raw_spin_lock_irq(&task->pi_lock); 63 while (1) {
61 hlist_move_list(&task->task_works, &task_works); 64 raw_spin_lock_irq(&task->pi_lock);
62 raw_spin_unlock_irq(&task->pi_lock); 65 p = task->task_works;
66 task->task_works = NULL;
67 raw_spin_unlock_irq(&task->pi_lock);
63 68
64 if (unlikely(hlist_empty(&task_works))) 69 if (unlikely(!p))
65 return; 70 return;
66 /*
67 * We use hlist to save the space in task_struct, but we want fifo.
68 * Find the last entry, the list should be short, then process them
69 * in reverse order.
70 */
71 for (pos = task_works.first; pos->next; pos = pos->next)
72 ;
73 71
74 for (;;) { 72 q = p->next; /* head */
75 struct hlist_node **pprev = pos->pprev; 73 p->next = NULL; /* cut it */
76 struct task_work *twork = container_of(pos, struct task_work, 74 while (q) {
77 hlist); 75 p = q->next;
78 twork->func(twork); 76 q->func(q);
79 77 q = p;
80 if (pprev == &task_works.first) 78 }
81 break;
82 pos = container_of(pprev, struct hlist_node, next);
83 } 79 }
84} 80}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7bb..b7fbadc5c973 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
409 time_state = TIME_DEL; 409 time_state = TIME_DEL;
410 break; 410 break;
411 case TIME_INS: 411 case TIME_INS:
412 if (secs % 86400 == 0) { 412 if (!(time_status & STA_INS))
413 time_state = TIME_OK;
414 else if (secs % 86400 == 0) {
413 leap = -1; 415 leap = -1;
414 time_state = TIME_OOP; 416 time_state = TIME_OOP;
415 time_tai++; 417 time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
418 } 420 }
419 break; 421 break;
420 case TIME_DEL: 422 case TIME_DEL:
421 if ((secs + 1) % 86400 == 0) { 423 if (!(time_status & STA_DEL))
424 time_state = TIME_OK;
425 else if ((secs + 1) % 86400 == 0) {
422 leap = 1; 426 leap = 1;
423 time_tai--; 427 time_tai--;
424 time_state = TIME_WAIT; 428 time_state = TIME_WAIT;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 869997833928..024540f97f74 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void)
105/* 105/*
106 * NO HZ enabled ? 106 * NO HZ enabled ?
107 */ 107 */
108static int tick_nohz_enabled __read_mostly = 1; 108int tick_nohz_enabled __read_mostly = 1;
109 109
110/* 110/*
111 * Enable / Disable tickless mode 111 * Enable / Disable tickless mode
@@ -271,50 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
271} 271}
272EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 272EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
273 273
274static void tick_nohz_stop_sched_tick(struct tick_sched *ts) 274static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
275 ktime_t now, int cpu)
275{ 276{
276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 277 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
278 ktime_t last_update, expires, ret = { .tv64 = 0 };
277 unsigned long rcu_delta_jiffies; 279 unsigned long rcu_delta_jiffies;
278 ktime_t last_update, expires, now;
279 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 280 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
280 u64 time_delta; 281 u64 time_delta;
281 int cpu;
282
283 cpu = smp_processor_id();
284 ts = &per_cpu(tick_cpu_sched, cpu);
285
286 now = tick_nohz_start_idle(cpu, ts);
287
288 /*
289 * If this cpu is offline and it is the one which updates
290 * jiffies, then give up the assignment and let it be taken by
291 * the cpu which runs the tick timer next. If we don't drop
292 * this here the jiffies might be stale and do_timer() never
293 * invoked.
294 */
295 if (unlikely(!cpu_online(cpu))) {
296 if (cpu == tick_do_timer_cpu)
297 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
298 }
299
300 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
301 return;
302 282
303 if (need_resched())
304 return;
305
306 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
307 static int ratelimit;
308
309 if (ratelimit < 10) {
310 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
311 (unsigned int) local_softirq_pending());
312 ratelimit++;
313 }
314 return;
315 }
316
317 ts->idle_calls++;
318 /* Read jiffies and the time when jiffies were updated last */ 283 /* Read jiffies and the time when jiffies were updated last */
319 do { 284 do {
320 seq = read_seqbegin(&xtime_lock); 285 seq = read_seqbegin(&xtime_lock);
@@ -397,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
397 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 362 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
398 goto out; 363 goto out;
399 364
365 ret = expires;
366
400 /* 367 /*
401 * nohz_stop_sched_tick can be called several times before 368 * nohz_stop_sched_tick can be called several times before
402 * the nohz_restart_sched_tick is called. This happens when 369 * the nohz_restart_sched_tick is called. This happens when
@@ -406,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
406 */ 373 */
407 if (!ts->tick_stopped) { 374 if (!ts->tick_stopped) {
408 select_nohz_load_balancer(1); 375 select_nohz_load_balancer(1);
376 calc_load_enter_idle();
409 377
410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
411 ts->tick_stopped = 1; 379 ts->tick_stopped = 1;
412 ts->idle_jiffies = last_jiffies;
413 } 380 }
414 381
415 ts->idle_sleeps++;
416
417 /* Mark expires */
418 ts->idle_expires = expires;
419
420 /* 382 /*
421 * If the expiration time == KTIME_MAX, then 383 * If the expiration time == KTIME_MAX, then
422 * in this case we simply stop the tick timer. 384 * in this case we simply stop the tick timer.
@@ -447,6 +409,65 @@ out:
447 ts->next_jiffies = next_jiffies; 409 ts->next_jiffies = next_jiffies;
448 ts->last_jiffies = last_jiffies; 410 ts->last_jiffies = last_jiffies;
449 ts->sleep_length = ktime_sub(dev->next_event, now); 411 ts->sleep_length = ktime_sub(dev->next_event, now);
412
413 return ret;
414}
415
416static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
417{
418 /*
419 * If this cpu is offline and it is the one which updates
420 * jiffies, then give up the assignment and let it be taken by
421 * the cpu which runs the tick timer next. If we don't drop
422 * this here the jiffies might be stale and do_timer() never
423 * invoked.
424 */
425 if (unlikely(!cpu_online(cpu))) {
426 if (cpu == tick_do_timer_cpu)
427 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
428 }
429
430 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
431 return false;
432
433 if (need_resched())
434 return false;
435
436 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
437 static int ratelimit;
438
439 if (ratelimit < 10) {
440 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
441 (unsigned int) local_softirq_pending());
442 ratelimit++;
443 }
444 return false;
445 }
446
447 return true;
448}
449
450static void __tick_nohz_idle_enter(struct tick_sched *ts)
451{
452 ktime_t now, expires;
453 int cpu = smp_processor_id();
454
455 now = tick_nohz_start_idle(cpu, ts);
456
457 if (can_stop_idle_tick(cpu, ts)) {
458 int was_stopped = ts->tick_stopped;
459
460 ts->idle_calls++;
461
462 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
463 if (expires.tv64 > 0LL) {
464 ts->idle_sleeps++;
465 ts->idle_expires = expires;
466 }
467
468 if (!was_stopped && ts->tick_stopped)
469 ts->idle_jiffies = ts->last_jiffies;
470 }
450} 471}
451 472
452/** 473/**
@@ -484,7 +505,7 @@ void tick_nohz_idle_enter(void)
484 * update of the idle time accounting in tick_nohz_start_idle(). 505 * update of the idle time accounting in tick_nohz_start_idle().
485 */ 506 */
486 ts->inidle = 1; 507 ts->inidle = 1;
487 tick_nohz_stop_sched_tick(ts); 508 __tick_nohz_idle_enter(ts);
488 509
489 local_irq_enable(); 510 local_irq_enable();
490} 511}
@@ -504,7 +525,7 @@ void tick_nohz_irq_exit(void)
504 if (!ts->inidle) 525 if (!ts->inidle)
505 return; 526 return;
506 527
507 tick_nohz_stop_sched_tick(ts); 528 __tick_nohz_idle_enter(ts);
508} 529}
509 530
510/** 531/**
@@ -522,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void)
522static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 543static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
523{ 544{
524 hrtimer_cancel(&ts->sched_timer); 545 hrtimer_cancel(&ts->sched_timer);
525 hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); 546 hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
526 547
527 while (1) { 548 while (1) {
528 /* Forward the time to expire in the future */ 549 /* Forward the time to expire in the future */
@@ -545,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
545 } 566 }
546} 567}
547 568
569static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
570{
571 /* Update jiffies first */
572 select_nohz_load_balancer(0);
573 tick_do_update_jiffies64(now);
574 update_cpu_load_nohz();
575
576 touch_softlockup_watchdog();
577 /*
578 * Cancel the scheduled timer and restore the tick
579 */
580 ts->tick_stopped = 0;
581 ts->idle_exittime = now;
582
583 tick_nohz_restart(ts, now);
584}
585
586static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
587{
588#ifndef CONFIG_VIRT_CPU_ACCOUNTING
589 unsigned long ticks;
590 /*
591 * We stopped the tick in idle. Update process times would miss the
592 * time we slept as update_process_times does only a 1 tick
593 * accounting. Enforce that this is accounted to idle !
594 */
595 ticks = jiffies - ts->idle_jiffies;
596 /*
597 * We might be one off. Do not randomly account a huge number of ticks!
598 */
599 if (ticks && ticks < LONG_MAX)
600 account_idle_ticks(ticks);
601#endif
602}
603
548/** 604/**
549 * tick_nohz_idle_exit - restart the idle tick from the idle task 605 * tick_nohz_idle_exit - restart the idle tick from the idle task
550 * 606 *
@@ -556,9 +612,6 @@ void tick_nohz_idle_exit(void)
556{ 612{
557 int cpu = smp_processor_id(); 613 int cpu = smp_processor_id();
558 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 614 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
559#ifndef CONFIG_VIRT_CPU_ACCOUNTING
560 unsigned long ticks;
561#endif
562 ktime_t now; 615 ktime_t now;
563 616
564 local_irq_disable(); 617 local_irq_disable();
@@ -573,39 +626,11 @@ void tick_nohz_idle_exit(void)
573 if (ts->idle_active) 626 if (ts->idle_active)
574 tick_nohz_stop_idle(cpu, now); 627 tick_nohz_stop_idle(cpu, now);
575 628
576 if (!ts->tick_stopped) { 629 if (ts->tick_stopped) {
577 local_irq_enable(); 630 tick_nohz_restart_sched_tick(ts, now);
578 return; 631 tick_nohz_account_idle_ticks(ts);
579 } 632 }
580 633
581 /* Update jiffies first */
582 select_nohz_load_balancer(0);
583 tick_do_update_jiffies64(now);
584 update_cpu_load_nohz();
585
586#ifndef CONFIG_VIRT_CPU_ACCOUNTING
587 /*
588 * We stopped the tick in idle. Update process times would miss the
589 * time we slept as update_process_times does only a 1 tick
590 * accounting. Enforce that this is accounted to idle !
591 */
592 ticks = jiffies - ts->idle_jiffies;
593 /*
594 * We might be one off. Do not randomly account a huge number of ticks!
595 */
596 if (ticks && ticks < LONG_MAX)
597 account_idle_ticks(ticks);
598#endif
599
600 touch_softlockup_watchdog();
601 /*
602 * Cancel the scheduled timer and restore the tick
603 */
604 ts->tick_stopped = 0;
605 ts->idle_exittime = now;
606
607 tick_nohz_restart(ts, now);
608
609 local_irq_enable(); 634 local_irq_enable();
610} 635}
611 636
@@ -809,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
809 */ 834 */
810 if (ts->tick_stopped) { 835 if (ts->tick_stopped) {
811 touch_softlockup_watchdog(); 836 touch_softlockup_watchdog();
812 ts->idle_jiffies++; 837 if (idle_cpu(cpu))
838 ts->idle_jiffies++;
813 } 839 }
814 update_process_times(user_mode(regs)); 840 update_process_times(user_mode(regs));
815 profile_tick(CPU_PROFILING); 841 profile_tick(CPU_PROFILING);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6f46a00a1e8a..f045cc50832d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -24,32 +24,32 @@
24/* Structure holding internal timekeeping values. */ 24/* Structure holding internal timekeeping values. */
25struct timekeeper { 25struct timekeeper {
26 /* Current clocksource used for timekeeping. */ 26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock; 27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */ 28 /* NTP adjusted clock multiplier */
29 u32 mult; 29 u32 mult;
30 /* The shift value of the current clocksource. */ 30 /* The shift value of the current clocksource. */
31 int shift; 31 u32 shift;
32
33 /* Number of clock cycles in one NTP interval. */ 32 /* Number of clock cycles in one NTP interval. */
34 cycle_t cycle_interval; 33 cycle_t cycle_interval;
35 /* Number of clock shifted nano seconds in one NTP interval. */ 34 /* Number of clock shifted nano seconds in one NTP interval. */
36 u64 xtime_interval; 35 u64 xtime_interval;
37 /* shifted nano seconds left over when rounding cycle_interval */ 36 /* shifted nano seconds left over when rounding cycle_interval */
38 s64 xtime_remainder; 37 s64 xtime_remainder;
39 /* Raw nano seconds accumulated per NTP interval. */ 38 /* Raw nano seconds accumulated per NTP interval. */
40 u32 raw_interval; 39 u32 raw_interval;
40
41 /* Current CLOCK_REALTIME time in seconds */
42 u64 xtime_sec;
43 /* Clock shifted nano seconds */
44 u64 xtime_nsec;
41 45
42 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
43 u64 xtime_nsec;
44 /* Difference between accumulated time and NTP time in ntp 46 /* Difference between accumulated time and NTP time in ntp
45 * shifted nano seconds. */ 47 * shifted nano seconds. */
46 s64 ntp_error; 48 s64 ntp_error;
47 /* Shift conversion between clock shifted nano seconds and 49 /* Shift conversion between clock shifted nano seconds and
48 * ntp shifted nano seconds. */ 50 * ntp shifted nano seconds. */
49 int ntp_error_shift; 51 u32 ntp_error_shift;
50 52
51 /* The current time */
52 struct timespec xtime;
53 /* 53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected 54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged 55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
@@ -64,14 +64,17 @@ struct timekeeper {
64 * - wall_to_monotonic is no longer the boot time, getboottime must be 64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead. 65 * used instead.
66 */ 66 */
67 struct timespec wall_to_monotonic; 67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */ 68 /* time spent in suspend */
69 struct timespec total_sleep_time; 69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ 70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time; 71 struct timespec raw_time;
72 72 /* Offset clock monotonic -> clock realtime */
73 ktime_t offs_real;
74 /* Offset clock monotonic -> clock boottime */
75 ktime_t offs_boot;
73 /* Seqlock for all timekeeper values */ 76 /* Seqlock for all timekeeper values */
74 seqlock_t lock; 77 seqlock_t lock;
75}; 78};
76 79
77static struct timekeeper timekeeper; 80static struct timekeeper timekeeper;
@@ -82,11 +85,37 @@ static struct timekeeper timekeeper;
82 */ 85 */
83__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 86__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
84 87
85
86/* flag for if timekeeping is suspended */ 88/* flag for if timekeeping is suspended */
87int __read_mostly timekeeping_suspended; 89int __read_mostly timekeeping_suspended;
88 90
91static inline void tk_normalize_xtime(struct timekeeper *tk)
92{
93 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
94 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
95 tk->xtime_sec++;
96 }
97}
98
99static struct timespec tk_xtime(struct timekeeper *tk)
100{
101 struct timespec ts;
102
103 ts.tv_sec = tk->xtime_sec;
104 ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
105 return ts;
106}
107
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{
110 tk->xtime_sec = ts->tv_sec;
111 tk->xtime_nsec = ts->tv_nsec << tk->shift;
112}
89 113
114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
115{
116 tk->xtime_sec += ts->tv_sec;
117 tk->xtime_nsec += ts->tv_nsec << tk->shift;
118}
90 119
91/** 120/**
92 * timekeeper_setup_internals - Set up internals to use clocksource clock. 121 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -98,12 +127,14 @@ int __read_mostly timekeeping_suspended;
98 * 127 *
99 * Unless you're the timekeeping code, you should not be using this! 128 * Unless you're the timekeeping code, you should not be using this!
100 */ 129 */
101static void timekeeper_setup_internals(struct clocksource *clock) 130static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
102{ 131{
103 cycle_t interval; 132 cycle_t interval;
104 u64 tmp, ntpinterval; 133 u64 tmp, ntpinterval;
134 struct clocksource *old_clock;
105 135
106 timekeeper.clock = clock; 136 old_clock = tk->clock;
137 tk->clock = clock;
107 clock->cycle_last = clock->read(clock); 138 clock->cycle_last = clock->read(clock);
108 139
109 /* Do the ns -> cycle conversion first, using original mult */ 140 /* Do the ns -> cycle conversion first, using original mult */
@@ -116,71 +147,96 @@ static void timekeeper_setup_internals(struct clocksource *clock)
116 tmp = 1; 147 tmp = 1;
117 148
118 interval = (cycle_t) tmp; 149 interval = (cycle_t) tmp;
119 timekeeper.cycle_interval = interval; 150 tk->cycle_interval = interval;
120 151
121 /* Go back from cycles -> shifted ns */ 152 /* Go back from cycles -> shifted ns */
122 timekeeper.xtime_interval = (u64) interval * clock->mult; 153 tk->xtime_interval = (u64) interval * clock->mult;
123 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; 154 tk->xtime_remainder = ntpinterval - tk->xtime_interval;
124 timekeeper.raw_interval = 155 tk->raw_interval =
125 ((u64) interval * clock->mult) >> clock->shift; 156 ((u64) interval * clock->mult) >> clock->shift;
126 157
127 timekeeper.xtime_nsec = 0; 158 /* if changing clocks, convert xtime_nsec shift units */
128 timekeeper.shift = clock->shift; 159 if (old_clock) {
160 int shift_change = clock->shift - old_clock->shift;
161 if (shift_change < 0)
162 tk->xtime_nsec >>= -shift_change;
163 else
164 tk->xtime_nsec <<= shift_change;
165 }
166 tk->shift = clock->shift;
129 167
130 timekeeper.ntp_error = 0; 168 tk->ntp_error = 0;
131 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 169 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
132 170
133 /* 171 /*
134 * The timekeeper keeps its own mult values for the currently 172 * The timekeeper keeps its own mult values for the currently
135 * active clocksource. These value will be adjusted via NTP 173 * active clocksource. These value will be adjusted via NTP
136 * to counteract clock drifting. 174 * to counteract clock drifting.
137 */ 175 */
138 timekeeper.mult = clock->mult; 176 tk->mult = clock->mult;
139} 177}
140 178
141/* Timekeeper helper functions. */ 179/* Timekeeper helper functions. */
142static inline s64 timekeeping_get_ns(void) 180static inline s64 timekeeping_get_ns(struct timekeeper *tk)
143{ 181{
144 cycle_t cycle_now, cycle_delta; 182 cycle_t cycle_now, cycle_delta;
145 struct clocksource *clock; 183 struct clocksource *clock;
184 s64 nsec;
146 185
147 /* read clocksource: */ 186 /* read clocksource: */
148 clock = timekeeper.clock; 187 clock = tk->clock;
149 cycle_now = clock->read(clock); 188 cycle_now = clock->read(clock);
150 189
151 /* calculate the delta since the last update_wall_time: */ 190 /* calculate the delta since the last update_wall_time: */
152 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 191 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
153 192
154 /* return delta convert to nanoseconds using ntp adjusted mult. */ 193 nsec = cycle_delta * tk->mult + tk->xtime_nsec;
155 return clocksource_cyc2ns(cycle_delta, timekeeper.mult, 194 nsec >>= tk->shift;
156 timekeeper.shift); 195
196 /* If arch requires, add in gettimeoffset() */
197 return nsec + arch_gettimeoffset();
157} 198}
158 199
159static inline s64 timekeeping_get_ns_raw(void) 200static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
160{ 201{
161 cycle_t cycle_now, cycle_delta; 202 cycle_t cycle_now, cycle_delta;
162 struct clocksource *clock; 203 struct clocksource *clock;
204 s64 nsec;
163 205
164 /* read clocksource: */ 206 /* read clocksource: */
165 clock = timekeeper.clock; 207 clock = tk->clock;
166 cycle_now = clock->read(clock); 208 cycle_now = clock->read(clock);
167 209
168 /* calculate the delta since the last update_wall_time: */ 210 /* calculate the delta since the last update_wall_time: */
169 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 211 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
170 212
171 /* return delta convert to nanoseconds. */ 213 /* convert delta to nanoseconds. */
172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 214 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
215
216 /* If arch requires, add in gettimeoffset() */
217 return nsec + arch_gettimeoffset();
218}
219
220static void update_rt_offset(struct timekeeper *tk)
221{
222 struct timespec tmp, *wtm = &tk->wall_to_monotonic;
223
224 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
225 tk->offs_real = timespec_to_ktime(tmp);
173} 226}
174 227
175/* must hold write on timekeeper.lock */ 228/* must hold write on timekeeper.lock */
176static void timekeeping_update(bool clearntp) 229static void timekeeping_update(struct timekeeper *tk, bool clearntp)
177{ 230{
231 struct timespec xt;
232
178 if (clearntp) { 233 if (clearntp) {
179 timekeeper.ntp_error = 0; 234 tk->ntp_error = 0;
180 ntp_clear(); 235 ntp_clear();
181 } 236 }
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, 237 update_rt_offset(tk);
183 timekeeper.clock, timekeeper.mult); 238 xt = tk_xtime(tk);
239 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
184} 240}
185 241
186 242
@@ -191,27 +247,26 @@ static void timekeeping_update(bool clearntp)
191 * update_wall_time(). This is useful before significant clock changes, 247 * update_wall_time(). This is useful before significant clock changes,
192 * as it avoids having to deal with this time offset explicitly. 248 * as it avoids having to deal with this time offset explicitly.
193 */ 249 */
194static void timekeeping_forward_now(void) 250static void timekeeping_forward_now(struct timekeeper *tk)
195{ 251{
196 cycle_t cycle_now, cycle_delta; 252 cycle_t cycle_now, cycle_delta;
197 struct clocksource *clock; 253 struct clocksource *clock;
198 s64 nsec; 254 s64 nsec;
199 255
200 clock = timekeeper.clock; 256 clock = tk->clock;
201 cycle_now = clock->read(clock); 257 cycle_now = clock->read(clock);
202 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 258 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
203 clock->cycle_last = cycle_now; 259 clock->cycle_last = cycle_now;
204 260
205 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, 261 tk->xtime_nsec += cycle_delta * tk->mult;
206 timekeeper.shift);
207 262
208 /* If arch requires, add in gettimeoffset() */ 263 /* If arch requires, add in gettimeoffset() */
209 nsec += arch_gettimeoffset(); 264 tk->xtime_nsec += arch_gettimeoffset() << tk->shift;
210 265
211 timespec_add_ns(&timekeeper.xtime, nsec); 266 tk_normalize_xtime(tk);
212 267
213 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 268 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
214 timespec_add_ns(&timekeeper.raw_time, nsec); 269 timespec_add_ns(&tk->raw_time, nsec);
215} 270}
216 271
217/** 272/**
@@ -223,18 +278,15 @@ static void timekeeping_forward_now(void)
223void getnstimeofday(struct timespec *ts) 278void getnstimeofday(struct timespec *ts)
224{ 279{
225 unsigned long seq; 280 unsigned long seq;
226 s64 nsecs; 281 s64 nsecs = 0;
227 282
228 WARN_ON(timekeeping_suspended); 283 WARN_ON(timekeeping_suspended);
229 284
230 do { 285 do {
231 seq = read_seqbegin(&timekeeper.lock); 286 seq = read_seqbegin(&timekeeper.lock);
232 287
233 *ts = timekeeper.xtime; 288 ts->tv_sec = timekeeper.xtime_sec;
234 nsecs = timekeeping_get_ns(); 289 ts->tv_nsec = timekeeping_get_ns(&timekeeper);
235
236 /* If arch requires, add in gettimeoffset() */
237 nsecs += arch_gettimeoffset();
238 290
239 } while (read_seqretry(&timekeeper.lock, seq)); 291 } while (read_seqretry(&timekeeper.lock, seq));
240 292
@@ -251,13 +303,10 @@ ktime_t ktime_get(void)
251 303
252 do { 304 do {
253 seq = read_seqbegin(&timekeeper.lock); 305 seq = read_seqbegin(&timekeeper.lock);
254 secs = timekeeper.xtime.tv_sec + 306 secs = timekeeper.xtime_sec +
255 timekeeper.wall_to_monotonic.tv_sec; 307 timekeeper.wall_to_monotonic.tv_sec;
256 nsecs = timekeeper.xtime.tv_nsec + 308 nsecs = timekeeping_get_ns(&timekeeper) +
257 timekeeper.wall_to_monotonic.tv_nsec; 309 timekeeper.wall_to_monotonic.tv_nsec;
258 nsecs += timekeeping_get_ns();
259 /* If arch requires, add in gettimeoffset() */
260 nsecs += arch_gettimeoffset();
261 310
262 } while (read_seqretry(&timekeeper.lock, seq)); 311 } while (read_seqretry(&timekeeper.lock, seq));
263 /* 312 /*
@@ -280,22 +329,19 @@ void ktime_get_ts(struct timespec *ts)
280{ 329{
281 struct timespec tomono; 330 struct timespec tomono;
282 unsigned int seq; 331 unsigned int seq;
283 s64 nsecs;
284 332
285 WARN_ON(timekeeping_suspended); 333 WARN_ON(timekeeping_suspended);
286 334
287 do { 335 do {
288 seq = read_seqbegin(&timekeeper.lock); 336 seq = read_seqbegin(&timekeeper.lock);
289 *ts = timekeeper.xtime; 337 ts->tv_sec = timekeeper.xtime_sec;
338 ts->tv_nsec = timekeeping_get_ns(&timekeeper);
290 tomono = timekeeper.wall_to_monotonic; 339 tomono = timekeeper.wall_to_monotonic;
291 nsecs = timekeeping_get_ns();
292 /* If arch requires, add in gettimeoffset() */
293 nsecs += arch_gettimeoffset();
294 340
295 } while (read_seqretry(&timekeeper.lock, seq)); 341 } while (read_seqretry(&timekeeper.lock, seq));
296 342
297 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 343 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
298 ts->tv_nsec + tomono.tv_nsec + nsecs); 344 ts->tv_nsec + tomono.tv_nsec);
299} 345}
300EXPORT_SYMBOL_GPL(ktime_get_ts); 346EXPORT_SYMBOL_GPL(ktime_get_ts);
301 347
@@ -318,20 +364,14 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
318 WARN_ON_ONCE(timekeeping_suspended); 364 WARN_ON_ONCE(timekeeping_suspended);
319 365
320 do { 366 do {
321 u32 arch_offset;
322
323 seq = read_seqbegin(&timekeeper.lock); 367 seq = read_seqbegin(&timekeeper.lock);
324 368
325 *ts_raw = timekeeper.raw_time; 369 *ts_raw = timekeeper.raw_time;
326 *ts_real = timekeeper.xtime; 370 ts_real->tv_sec = timekeeper.xtime_sec;
327 371 ts_real->tv_nsec = 0;
328 nsecs_raw = timekeeping_get_ns_raw();
329 nsecs_real = timekeeping_get_ns();
330 372
331 /* If arch requires, add in gettimeoffset() */ 373 nsecs_raw = timekeeping_get_ns_raw(&timekeeper);
332 arch_offset = arch_gettimeoffset(); 374 nsecs_real = timekeeping_get_ns(&timekeeper);
333 nsecs_raw += arch_offset;
334 nsecs_real += arch_offset;
335 375
336 } while (read_seqretry(&timekeeper.lock, seq)); 376 } while (read_seqretry(&timekeeper.lock, seq));
337 377
@@ -366,7 +406,7 @@ EXPORT_SYMBOL(do_gettimeofday);
366 */ 406 */
367int do_settimeofday(const struct timespec *tv) 407int do_settimeofday(const struct timespec *tv)
368{ 408{
369 struct timespec ts_delta; 409 struct timespec ts_delta, xt;
370 unsigned long flags; 410 unsigned long flags;
371 411
372 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 412 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
@@ -374,15 +414,18 @@ int do_settimeofday(const struct timespec *tv)
374 414
375 write_seqlock_irqsave(&timekeeper.lock, flags); 415 write_seqlock_irqsave(&timekeeper.lock, flags);
376 416
377 timekeeping_forward_now(); 417 timekeeping_forward_now(&timekeeper);
418
419 xt = tk_xtime(&timekeeper);
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
378 422
379 ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
380 ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
381 timekeeper.wall_to_monotonic = 423 timekeeper.wall_to_monotonic =
382 timespec_sub(timekeeper.wall_to_monotonic, ts_delta); 424 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
383 425
384 timekeeper.xtime = *tv; 426 tk_set_xtime(&timekeeper, tv);
385 timekeeping_update(true); 427
428 timekeeping_update(&timekeeper, true);
386 429
387 write_sequnlock_irqrestore(&timekeeper.lock, flags); 430 write_sequnlock_irqrestore(&timekeeper.lock, flags);
388 431
@@ -409,13 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts)
409 452
410 write_seqlock_irqsave(&timekeeper.lock, flags); 453 write_seqlock_irqsave(&timekeeper.lock, flags);
411 454
412 timekeeping_forward_now(); 455 timekeeping_forward_now(&timekeeper);
413 456
414 timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); 457
458 tk_xtime_add(&timekeeper, ts);
415 timekeeper.wall_to_monotonic = 459 timekeeper.wall_to_monotonic =
416 timespec_sub(timekeeper.wall_to_monotonic, *ts); 460 timespec_sub(timekeeper.wall_to_monotonic, *ts);
417 461
418 timekeeping_update(true); 462 timekeeping_update(&timekeeper, true);
419 463
420 write_sequnlock_irqrestore(&timekeeper.lock, flags); 464 write_sequnlock_irqrestore(&timekeeper.lock, flags);
421 465
@@ -440,14 +484,14 @@ static int change_clocksource(void *data)
440 484
441 write_seqlock_irqsave(&timekeeper.lock, flags); 485 write_seqlock_irqsave(&timekeeper.lock, flags);
442 486
443 timekeeping_forward_now(); 487 timekeeping_forward_now(&timekeeper);
444 if (!new->enable || new->enable(new) == 0) { 488 if (!new->enable || new->enable(new) == 0) {
445 old = timekeeper.clock; 489 old = timekeeper.clock;
446 timekeeper_setup_internals(new); 490 tk_setup_internals(&timekeeper, new);
447 if (old->disable) 491 if (old->disable)
448 old->disable(old); 492 old->disable(old);
449 } 493 }
450 timekeeping_update(true); 494 timekeeping_update(&timekeeper, true);
451 495
452 write_sequnlock_irqrestore(&timekeeper.lock, flags); 496 write_sequnlock_irqrestore(&timekeeper.lock, flags);
453 497
@@ -497,7 +541,7 @@ void getrawmonotonic(struct timespec *ts)
497 541
498 do { 542 do {
499 seq = read_seqbegin(&timekeeper.lock); 543 seq = read_seqbegin(&timekeeper.lock);
500 nsecs = timekeeping_get_ns_raw(); 544 nsecs = timekeeping_get_ns_raw(&timekeeper);
501 *ts = timekeeper.raw_time; 545 *ts = timekeeper.raw_time;
502 546
503 } while (read_seqretry(&timekeeper.lock, seq)); 547 } while (read_seqretry(&timekeeper.lock, seq));
@@ -532,6 +576,7 @@ u64 timekeeping_max_deferment(void)
532{ 576{
533 unsigned long seq; 577 unsigned long seq;
534 u64 ret; 578 u64 ret;
579
535 do { 580 do {
536 seq = read_seqbegin(&timekeeper.lock); 581 seq = read_seqbegin(&timekeeper.lock);
537 582
@@ -592,18 +637,17 @@ void __init timekeeping_init(void)
592 clock = clocksource_default_clock(); 637 clock = clocksource_default_clock();
593 if (clock->enable) 638 if (clock->enable)
594 clock->enable(clock); 639 clock->enable(clock);
595 timekeeper_setup_internals(clock); 640 tk_setup_internals(&timekeeper, clock);
596 641
597 timekeeper.xtime.tv_sec = now.tv_sec; 642 tk_set_xtime(&timekeeper, &now);
598 timekeeper.xtime.tv_nsec = now.tv_nsec;
599 timekeeper.raw_time.tv_sec = 0; 643 timekeeper.raw_time.tv_sec = 0;
600 timekeeper.raw_time.tv_nsec = 0; 644 timekeeper.raw_time.tv_nsec = 0;
601 if (boot.tv_sec == 0 && boot.tv_nsec == 0) { 645 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
602 boot.tv_sec = timekeeper.xtime.tv_sec; 646 boot = tk_xtime(&timekeeper);
603 boot.tv_nsec = timekeeper.xtime.tv_nsec; 647
604 }
605 set_normalized_timespec(&timekeeper.wall_to_monotonic, 648 set_normalized_timespec(&timekeeper.wall_to_monotonic,
606 -boot.tv_sec, -boot.tv_nsec); 649 -boot.tv_sec, -boot.tv_nsec);
650 update_rt_offset(&timekeeper);
607 timekeeper.total_sleep_time.tv_sec = 0; 651 timekeeper.total_sleep_time.tv_sec = 0;
608 timekeeper.total_sleep_time.tv_nsec = 0; 652 timekeeper.total_sleep_time.tv_nsec = 0;
609 write_sequnlock_irqrestore(&timekeeper.lock, flags); 653 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +656,12 @@ void __init timekeeping_init(void)
612/* time in seconds when suspend began */ 656/* time in seconds when suspend began */
613static struct timespec timekeeping_suspend_time; 657static struct timespec timekeeping_suspend_time;
614 658
659static void update_sleep_time(struct timespec t)
660{
661 timekeeper.total_sleep_time = t;
662 timekeeper.offs_boot = timespec_to_ktime(t);
663}
664
615/** 665/**
616 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 666 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
617 * @delta: pointer to a timespec delta value 667 * @delta: pointer to a timespec delta value
@@ -619,7 +669,8 @@ static struct timespec timekeeping_suspend_time;
619 * Takes a timespec offset measuring a suspend interval and properly 669 * Takes a timespec offset measuring a suspend interval and properly
620 * adds the sleep offset to the timekeeping variables. 670 * adds the sleep offset to the timekeeping variables.
621 */ 671 */
622static void __timekeeping_inject_sleeptime(struct timespec *delta) 672static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
673 struct timespec *delta)
623{ 674{
624 if (!timespec_valid(delta)) { 675 if (!timespec_valid(delta)) {
625 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 676 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
@@ -627,11 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
627 return; 678 return;
628 } 679 }
629 680
630 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); 681 tk_xtime_add(tk, delta);
631 timekeeper.wall_to_monotonic = 682 tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta);
632 timespec_sub(timekeeper.wall_to_monotonic, *delta); 683 update_sleep_time(timespec_add(tk->total_sleep_time, *delta));
633 timekeeper.total_sleep_time = timespec_add(
634 timekeeper.total_sleep_time, *delta);
635} 684}
636 685
637 686
@@ -657,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
657 706
658 write_seqlock_irqsave(&timekeeper.lock, flags); 707 write_seqlock_irqsave(&timekeeper.lock, flags);
659 708
660 timekeeping_forward_now(); 709 timekeeping_forward_now(&timekeeper);
661 710
662 __timekeeping_inject_sleeptime(delta); 711 __timekeeping_inject_sleeptime(&timekeeper, delta);
663 712
664 timekeeping_update(true); 713 timekeeping_update(&timekeeper, true);
665 714
666 write_sequnlock_irqrestore(&timekeeper.lock, flags); 715 write_sequnlock_irqrestore(&timekeeper.lock, flags);
667 716
@@ -690,12 +739,13 @@ static void timekeeping_resume(void)
690 739
691 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 740 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
692 ts = timespec_sub(ts, timekeeping_suspend_time); 741 ts = timespec_sub(ts, timekeeping_suspend_time);
693 __timekeeping_inject_sleeptime(&ts); 742 __timekeeping_inject_sleeptime(&timekeeper, &ts);
694 } 743 }
695 /* re-base the last cycle value */ 744 /* re-base the last cycle value */
696 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 745 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
697 timekeeper.ntp_error = 0; 746 timekeeper.ntp_error = 0;
698 timekeeping_suspended = 0; 747 timekeeping_suspended = 0;
748 timekeeping_update(&timekeeper, false);
699 write_sequnlock_irqrestore(&timekeeper.lock, flags); 749 write_sequnlock_irqrestore(&timekeeper.lock, flags);
700 750
701 touch_softlockup_watchdog(); 751 touch_softlockup_watchdog();
@@ -715,7 +765,7 @@ static int timekeeping_suspend(void)
715 read_persistent_clock(&timekeeping_suspend_time); 765 read_persistent_clock(&timekeeping_suspend_time);
716 766
717 write_seqlock_irqsave(&timekeeper.lock, flags); 767 write_seqlock_irqsave(&timekeeper.lock, flags);
718 timekeeping_forward_now(); 768 timekeeping_forward_now(&timekeeper);
719 timekeeping_suspended = 1; 769 timekeeping_suspended = 1;
720 770
721 /* 771 /*
@@ -724,7 +774,7 @@ static int timekeeping_suspend(void)
724 * try to compensate so the difference in system time 774 * try to compensate so the difference in system time
725 * and persistent_clock time stays close to constant. 775 * and persistent_clock time stays close to constant.
726 */ 776 */
727 delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); 777 delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time);
728 delta_delta = timespec_sub(delta, old_delta); 778 delta_delta = timespec_sub(delta, old_delta);
729 if (abs(delta_delta.tv_sec) >= 2) { 779 if (abs(delta_delta.tv_sec) >= 2) {
730 /* 780 /*
@@ -763,7 +813,8 @@ device_initcall(timekeeping_init_ops);
763 * If the error is already larger, we look ahead even further 813 * If the error is already larger, we look ahead even further
764 * to compensate for late or lost adjustments. 814 * to compensate for late or lost adjustments.
765 */ 815 */
766static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, 816static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
817 s64 error, s64 *interval,
767 s64 *offset) 818 s64 *offset)
768{ 819{
769 s64 tick_error, i; 820 s64 tick_error, i;
@@ -779,7 +830,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
779 * here. This is tuned so that an error of about 1 msec is adjusted 830 * here. This is tuned so that an error of about 1 msec is adjusted
780 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 831 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
781 */ 832 */
782 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 833 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
783 error2 = abs(error2); 834 error2 = abs(error2);
784 for (look_ahead = 0; error2 > 0; look_ahead++) 835 for (look_ahead = 0; error2 > 0; look_ahead++)
785 error2 >>= 2; 836 error2 >>= 2;
@@ -788,8 +839,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
788 * Now calculate the error in (1 << look_ahead) ticks, but first 839 * Now calculate the error in (1 << look_ahead) ticks, but first
789 * remove the single look ahead already included in the error. 840 * remove the single look ahead already included in the error.
790 */ 841 */
791 tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); 842 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
792 tick_error -= timekeeper.xtime_interval >> 1; 843 tick_error -= tk->xtime_interval >> 1;
793 error = ((error - tick_error) >> look_ahead) + tick_error; 844 error = ((error - tick_error) >> look_ahead) + tick_error;
794 845
795 /* Finally calculate the adjustment shift value. */ 846 /* Finally calculate the adjustment shift value. */
@@ -814,9 +865,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
814 * this is optimized for the most common adjustments of -1,0,1, 865 * this is optimized for the most common adjustments of -1,0,1,
815 * for other values we can do a bit more work. 866 * for other values we can do a bit more work.
816 */ 867 */
817static void timekeeping_adjust(s64 offset) 868static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
818{ 869{
819 s64 error, interval = timekeeper.cycle_interval; 870 s64 error, interval = tk->cycle_interval;
820 int adj; 871 int adj;
821 872
822 /* 873 /*
@@ -832,7 +883,7 @@ static void timekeeping_adjust(s64 offset)
832 * 883 *
833 * Note: It does not "save" on aggravation when reading the code. 884 * Note: It does not "save" on aggravation when reading the code.
834 */ 885 */
835 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 886 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
836 if (error > interval) { 887 if (error > interval) {
837 /* 888 /*
838 * We now divide error by 4(via shift), which checks if 889 * We now divide error by 4(via shift), which checks if
@@ -854,7 +905,8 @@ static void timekeeping_adjust(s64 offset)
854 if (likely(error <= interval)) 905 if (likely(error <= interval))
855 adj = 1; 906 adj = 1;
856 else 907 else
857 adj = timekeeping_bigadjust(error, &interval, &offset); 908 adj = timekeeping_bigadjust(tk, error, &interval,
909 &offset);
858 } else if (error < -interval) { 910 } else if (error < -interval) {
859 /* See comment above, this is just switched for the negative */ 911 /* See comment above, this is just switched for the negative */
860 error >>= 2; 912 error >>= 2;
@@ -863,18 +915,17 @@ static void timekeeping_adjust(s64 offset)
863 interval = -interval; 915 interval = -interval;
864 offset = -offset; 916 offset = -offset;
865 } else 917 } else
866 adj = timekeeping_bigadjust(error, &interval, &offset); 918 adj = timekeeping_bigadjust(tk, error, &interval,
867 } else /* No adjustment needed */ 919 &offset);
920 } else
868 return; 921 return;
869 922
870 if (unlikely(timekeeper.clock->maxadj && 923 if (unlikely(tk->clock->maxadj &&
871 (timekeeper.mult + adj > 924 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
872 timekeeper.clock->mult + timekeeper.clock->maxadj))) {
873 printk_once(KERN_WARNING 925 printk_once(KERN_WARNING
874 "Adjusting %s more than 11%% (%ld vs %ld)\n", 926 "Adjusting %s more than 11%% (%ld vs %ld)\n",
875 timekeeper.clock->name, (long)timekeeper.mult + adj, 927 tk->clock->name, (long)tk->mult + adj,
876 (long)timekeeper.clock->mult + 928 (long)tk->clock->mult + tk->clock->maxadj);
877 timekeeper.clock->maxadj);
878 } 929 }
879 /* 930 /*
880 * So the following can be confusing. 931 * So the following can be confusing.
@@ -925,11 +976,60 @@ static void timekeeping_adjust(s64 offset)
925 * 976 *
926 * XXX - TODO: Doc ntp_error calculation. 977 * XXX - TODO: Doc ntp_error calculation.
927 */ 978 */
928 timekeeper.mult += adj; 979 tk->mult += adj;
929 timekeeper.xtime_interval += interval; 980 tk->xtime_interval += interval;
930 timekeeper.xtime_nsec -= offset; 981 tk->xtime_nsec -= offset;
931 timekeeper.ntp_error -= (interval - offset) << 982 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
932 timekeeper.ntp_error_shift; 983
984 /*
985 * It may be possible that when we entered this function, xtime_nsec
986 * was very small. Further, if we're slightly speeding the clocksource
987 * in the code above, its possible the required corrective factor to
988 * xtime_nsec could cause it to underflow.
989 *
990 * Now, since we already accumulated the second, cannot simply roll
991 * the accumulated second back, since the NTP subsystem has been
992 * notified via second_overflow. So instead we push xtime_nsec forward
993 * by the amount we underflowed, and add that amount into the error.
994 *
995 * We'll correct this error next time through this function, when
996 * xtime_nsec is not as small.
997 */
998 if (unlikely((s64)tk->xtime_nsec < 0)) {
999 s64 neg = -(s64)tk->xtime_nsec;
1000 tk->xtime_nsec = 0;
1001 tk->ntp_error += neg << tk->ntp_error_shift;
1002 }
1003
1004}
1005
1006
1007/**
1008 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1009 *
1010 * Helper function that accumulates a the nsecs greater then a second
1011 * from the xtime_nsec field to the xtime_secs field.
1012 * It also calls into the NTP code to handle leapsecond processing.
1013 *
1014 */
1015static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1016{
1017 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1018
1019 while (tk->xtime_nsec >= nsecps) {
1020 int leap;
1021
1022 tk->xtime_nsec -= nsecps;
1023 tk->xtime_sec++;
1024
1025 /* Figure out if its a leap sec and apply if needed */
1026 leap = second_overflow(tk->xtime_sec);
1027 tk->xtime_sec += leap;
1028 tk->wall_to_monotonic.tv_sec -= leap;
1029 if (leap)
1030 clock_was_set_delayed();
1031
1032 }
933} 1033}
934 1034
935 1035
@@ -942,44 +1042,36 @@ static void timekeeping_adjust(s64 offset)
942 * 1042 *
943 * Returns the unconsumed cycles. 1043 * Returns the unconsumed cycles.
944 */ 1044 */
945static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 1045static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1046 u32 shift)
946{ 1047{
947 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
948 u64 raw_nsecs; 1048 u64 raw_nsecs;
949 1049
950 /* If the offset is smaller than a shifted interval, do nothing */ 1050 /* If the offset is smaller then a shifted interval, do nothing */
951 if (offset < timekeeper.cycle_interval<<shift) 1051 if (offset < tk->cycle_interval<<shift)
952 return offset; 1052 return offset;
953 1053
954 /* Accumulate one shifted interval */ 1054 /* Accumulate one shifted interval */
955 offset -= timekeeper.cycle_interval << shift; 1055 offset -= tk->cycle_interval << shift;
956 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; 1056 tk->clock->cycle_last += tk->cycle_interval << shift;
957 1057
958 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 1058 tk->xtime_nsec += tk->xtime_interval << shift;
959 while (timekeeper.xtime_nsec >= nsecps) { 1059 accumulate_nsecs_to_secs(tk);
960 int leap;
961 timekeeper.xtime_nsec -= nsecps;
962 timekeeper.xtime.tv_sec++;
963 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap;
965 timekeeper.wall_to_monotonic.tv_sec -= leap;
966 }
967 1060
968 /* Accumulate raw time */ 1061 /* Accumulate raw time */
969 raw_nsecs = timekeeper.raw_interval << shift; 1062 raw_nsecs = tk->raw_interval << shift;
970 raw_nsecs += timekeeper.raw_time.tv_nsec; 1063 raw_nsecs += tk->raw_time.tv_nsec;
971 if (raw_nsecs >= NSEC_PER_SEC) { 1064 if (raw_nsecs >= NSEC_PER_SEC) {
972 u64 raw_secs = raw_nsecs; 1065 u64 raw_secs = raw_nsecs;
973 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 1066 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
974 timekeeper.raw_time.tv_sec += raw_secs; 1067 tk->raw_time.tv_sec += raw_secs;
975 } 1068 }
976 timekeeper.raw_time.tv_nsec = raw_nsecs; 1069 tk->raw_time.tv_nsec = raw_nsecs;
977 1070
978 /* Accumulate error between NTP and clock interval */ 1071 /* Accumulate error between NTP and clock interval */
979 timekeeper.ntp_error += ntp_tick_length() << shift; 1072 tk->ntp_error += ntp_tick_length() << shift;
980 timekeeper.ntp_error -= 1073 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
981 (timekeeper.xtime_interval + timekeeper.xtime_remainder) << 1074 (tk->ntp_error_shift + shift);
982 (timekeeper.ntp_error_shift + shift);
983 1075
984 return offset; 1076 return offset;
985} 1077}
@@ -995,6 +1087,7 @@ static void update_wall_time(void)
995 cycle_t offset; 1087 cycle_t offset;
996 int shift = 0, maxshift; 1088 int shift = 0, maxshift;
997 unsigned long flags; 1089 unsigned long flags;
1090 s64 remainder;
998 1091
999 write_seqlock_irqsave(&timekeeper.lock, flags); 1092 write_seqlock_irqsave(&timekeeper.lock, flags);
1000 1093
@@ -1009,8 +1102,6 @@ static void update_wall_time(void)
1009#else 1102#else
1010 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1103 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1011#endif 1104#endif
1012 timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
1013 timekeeper.shift;
1014 1105
1015 /* 1106 /*
1016 * With NO_HZ we may have to accumulate many cycle_intervals 1107 * With NO_HZ we may have to accumulate many cycle_intervals
@@ -1026,62 +1117,36 @@ static void update_wall_time(void)
1026 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1117 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1027 shift = min(shift, maxshift); 1118 shift = min(shift, maxshift);
1028 while (offset >= timekeeper.cycle_interval) { 1119 while (offset >= timekeeper.cycle_interval) {
1029 offset = logarithmic_accumulation(offset, shift); 1120 offset = logarithmic_accumulation(&timekeeper, offset, shift);
1030 if(offset < timekeeper.cycle_interval<<shift) 1121 if(offset < timekeeper.cycle_interval<<shift)
1031 shift--; 1122 shift--;
1032 } 1123 }
1033 1124
1034 /* correct the clock when NTP error is too big */ 1125 /* correct the clock when NTP error is too big */
1035 timekeeping_adjust(offset); 1126 timekeeping_adjust(&timekeeper, offset);
1036
1037 /*
1038 * Since in the loop above, we accumulate any amount of time
1039 * in xtime_nsec over a second into xtime.tv_sec, its possible for
1040 * xtime_nsec to be fairly small after the loop. Further, if we're
1041 * slightly speeding the clocksource up in timekeeping_adjust(),
1042 * its possible the required corrective factor to xtime_nsec could
1043 * cause it to underflow.
1044 *
1045 * Now, we cannot simply roll the accumulated second back, since
1046 * the NTP subsystem has been notified via second_overflow. So
1047 * instead we push xtime_nsec forward by the amount we underflowed,
1048 * and add that amount into the error.
1049 *
1050 * We'll correct this error next time through this function, when
1051 * xtime_nsec is not as small.
1052 */
1053 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
1054 s64 neg = -(s64)timekeeper.xtime_nsec;
1055 timekeeper.xtime_nsec = 0;
1056 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
1057 }
1058 1127
1059 1128
1060 /* 1129 /*
1061 * Store full nanoseconds into xtime after rounding it up and 1130 * Store only full nanoseconds into xtime_nsec after rounding
1062 * add the remainder to the error difference. 1131 * it up and add the remainder to the error difference.
1063 */ 1132 * XXX - This is necessary to avoid small 1ns inconsistnecies caused
1064 timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> 1133 * by truncating the remainder in vsyscalls. However, it causes
1065 timekeeper.shift) + 1; 1134 * additional work to be done in timekeeping_adjust(). Once
1066 timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << 1135 * the vsyscall implementations are converted to use xtime_nsec
1067 timekeeper.shift; 1136 * (shifted nanoseconds), this can be killed.
1068 timekeeper.ntp_error += timekeeper.xtime_nsec << 1137 */
1069 timekeeper.ntp_error_shift; 1138 remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
1139 timekeeper.xtime_nsec -= remainder;
1140 timekeeper.xtime_nsec += 1 << timekeeper.shift;
1141 timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
1070 1142
1071 /* 1143 /*
1072 * Finally, make sure that after the rounding 1144 * Finally, make sure that after the rounding
1073 * xtime.tv_nsec isn't larger than NSEC_PER_SEC 1145 * xtime_nsec isn't larger than NSEC_PER_SEC
1074 */ 1146 */
1075 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { 1147 accumulate_nsecs_to_secs(&timekeeper);
1076 int leap;
1077 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
1078 timekeeper.xtime.tv_sec++;
1079 leap = second_overflow(timekeeper.xtime.tv_sec);
1080 timekeeper.xtime.tv_sec += leap;
1081 timekeeper.wall_to_monotonic.tv_sec -= leap;
1082 }
1083 1148
1084 timekeeping_update(false); 1149 timekeeping_update(&timekeeper, false);
1085 1150
1086out: 1151out:
1087 write_sequnlock_irqrestore(&timekeeper.lock, flags); 1152 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -1126,21 +1191,20 @@ void get_monotonic_boottime(struct timespec *ts)
1126{ 1191{
1127 struct timespec tomono, sleep; 1192 struct timespec tomono, sleep;
1128 unsigned int seq; 1193 unsigned int seq;
1129 s64 nsecs;
1130 1194
1131 WARN_ON(timekeeping_suspended); 1195 WARN_ON(timekeeping_suspended);
1132 1196
1133 do { 1197 do {
1134 seq = read_seqbegin(&timekeeper.lock); 1198 seq = read_seqbegin(&timekeeper.lock);
1135 *ts = timekeeper.xtime; 1199 ts->tv_sec = timekeeper.xtime_sec;
1200 ts->tv_nsec = timekeeping_get_ns(&timekeeper);
1136 tomono = timekeeper.wall_to_monotonic; 1201 tomono = timekeeper.wall_to_monotonic;
1137 sleep = timekeeper.total_sleep_time; 1202 sleep = timekeeper.total_sleep_time;
1138 nsecs = timekeeping_get_ns();
1139 1203
1140 } while (read_seqretry(&timekeeper.lock, seq)); 1204 } while (read_seqretry(&timekeeper.lock, seq));
1141 1205
1142 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1206 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1143 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); 1207 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
1144} 1208}
1145EXPORT_SYMBOL_GPL(get_monotonic_boottime); 1209EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1146 1210
@@ -1173,13 +1237,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1173 1237
1174unsigned long get_seconds(void) 1238unsigned long get_seconds(void)
1175{ 1239{
1176 return timekeeper.xtime.tv_sec; 1240 return timekeeper.xtime_sec;
1177} 1241}
1178EXPORT_SYMBOL(get_seconds); 1242EXPORT_SYMBOL(get_seconds);
1179 1243
1180struct timespec __current_kernel_time(void) 1244struct timespec __current_kernel_time(void)
1181{ 1245{
1182 return timekeeper.xtime; 1246 return tk_xtime(&timekeeper);
1183} 1247}
1184 1248
1185struct timespec current_kernel_time(void) 1249struct timespec current_kernel_time(void)
@@ -1190,7 +1254,7 @@ struct timespec current_kernel_time(void)
1190 do { 1254 do {
1191 seq = read_seqbegin(&timekeeper.lock); 1255 seq = read_seqbegin(&timekeeper.lock);
1192 1256
1193 now = timekeeper.xtime; 1257 now = tk_xtime(&timekeeper);
1194 } while (read_seqretry(&timekeeper.lock, seq)); 1258 } while (read_seqretry(&timekeeper.lock, seq));
1195 1259
1196 return now; 1260 return now;
@@ -1205,7 +1269,7 @@ struct timespec get_monotonic_coarse(void)
1205 do { 1269 do {
1206 seq = read_seqbegin(&timekeeper.lock); 1270 seq = read_seqbegin(&timekeeper.lock);
1207 1271
1208 now = timekeeper.xtime; 1272 now = tk_xtime(&timekeeper);
1209 mono = timekeeper.wall_to_monotonic; 1273 mono = timekeeper.wall_to_monotonic;
1210 } while (read_seqretry(&timekeeper.lock, seq)); 1274 } while (read_seqretry(&timekeeper.lock, seq));
1211 1275
@@ -1240,12 +1304,43 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1240 1304
1241 do { 1305 do {
1242 seq = read_seqbegin(&timekeeper.lock); 1306 seq = read_seqbegin(&timekeeper.lock);
1243 *xtim = timekeeper.xtime; 1307 *xtim = tk_xtime(&timekeeper);
1244 *wtom = timekeeper.wall_to_monotonic; 1308 *wtom = timekeeper.wall_to_monotonic;
1245 *sleep = timekeeper.total_sleep_time; 1309 *sleep = timekeeper.total_sleep_time;
1246 } while (read_seqretry(&timekeeper.lock, seq)); 1310 } while (read_seqretry(&timekeeper.lock, seq));
1247} 1311}
1248 1312
1313#ifdef CONFIG_HIGH_RES_TIMERS
1314/**
1315 * ktime_get_update_offsets - hrtimer helper
1316 * @offs_real: pointer to storage for monotonic -> realtime offset
1317 * @offs_boot: pointer to storage for monotonic -> boottime offset
1318 *
1319 * Returns current monotonic time and updates the offsets
1320 * Called from hrtimer_interupt() or retrigger_next_event()
1321 */
1322ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1323{
1324 ktime_t now;
1325 unsigned int seq;
1326 u64 secs, nsecs;
1327
1328 do {
1329 seq = read_seqbegin(&timekeeper.lock);
1330
1331 secs = timekeeper.xtime_sec;
1332 nsecs = timekeeping_get_ns(&timekeeper);
1333
1334 *offs_real = timekeeper.offs_real;
1335 *offs_boot = timekeeper.offs_boot;
1336 } while (read_seqretry(&timekeeper.lock, seq));
1337
1338 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1339 now = ktime_sub(now, *offs_real);
1340 return now;
1341}
1342#endif
1343
1249/** 1344/**
1250 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1345 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1251 */ 1346 */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3258455549f4..af5a7e9f164b 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 { 167 {
168 struct tick_sched *ts = tick_get_tick_sched(cpu); 168 struct tick_sched *ts = tick_get_tick_sched(cpu);
169 P(nohz_mode); 169 P(nohz_mode);
170 P_ns(idle_tick); 170 P_ns(last_tick);
171 P(tick_stopped); 171 P(tick_stopped);
172 P(idle_jiffies); 172 P(idle_jiffies);
173 P(idle_calls); 173 P(idle_calls);
@@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
259 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 260 int cpu;
261 261
262 SEQ_printf(m, "Timer List Version: v0.6\n"); 262 SEQ_printf(m, "Timer List Version: v0.7\n");
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 265
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ec7e7e0db43..a61c09374eba 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -77,6 +77,7 @@ struct tvec_base {
77 struct timer_list *running_timer; 77 struct timer_list *running_timer;
78 unsigned long timer_jiffies; 78 unsigned long timer_jiffies;
79 unsigned long next_timer; 79 unsigned long next_timer;
80 unsigned long active_timers;
80 struct tvec_root tv1; 81 struct tvec_root tv1;
81 struct tvec tv2; 82 struct tvec tv2;
82 struct tvec tv3; 83 struct tvec tv3;
@@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
330} 331}
331EXPORT_SYMBOL_GPL(set_timer_slack); 332EXPORT_SYMBOL_GPL(set_timer_slack);
332 333
333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 334static void
335__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
334{ 336{
335 unsigned long expires = timer->expires; 337 unsigned long expires = timer->expires;
336 unsigned long idx = expires - base->timer_jiffies; 338 unsigned long idx = expires - base->timer_jiffies;
@@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
372 list_add_tail(&timer->entry, vec); 374 list_add_tail(&timer->entry, vec);
373} 375}
374 376
377static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
378{
379 __internal_add_timer(base, timer);
380 /*
381 * Update base->active_timers and base->next_timer
382 */
383 if (!tbase_get_deferrable(timer->base)) {
384 if (time_before(timer->expires, base->next_timer))
385 base->next_timer = timer->expires;
386 base->active_timers++;
387 }
388}
389
375#ifdef CONFIG_TIMER_STATS 390#ifdef CONFIG_TIMER_STATS
376void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) 391void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
377{ 392{
@@ -654,8 +669,7 @@ void init_timer_deferrable_key(struct timer_list *timer,
654} 669}
655EXPORT_SYMBOL(init_timer_deferrable_key); 670EXPORT_SYMBOL(init_timer_deferrable_key);
656 671
657static inline void detach_timer(struct timer_list *timer, 672static inline void detach_timer(struct timer_list *timer, bool clear_pending)
658 int clear_pending)
659{ 673{
660 struct list_head *entry = &timer->entry; 674 struct list_head *entry = &timer->entry;
661 675
@@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer,
667 entry->prev = LIST_POISON2; 681 entry->prev = LIST_POISON2;
668} 682}
669 683
684static inline void
685detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
686{
687 detach_timer(timer, true);
688 if (!tbase_get_deferrable(timer->base))
689 timer->base->active_timers--;
690}
691
692static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
693 bool clear_pending)
694{
695 if (!timer_pending(timer))
696 return 0;
697
698 detach_timer(timer, clear_pending);
699 if (!tbase_get_deferrable(timer->base)) {
700 timer->base->active_timers--;
701 if (timer->expires == base->next_timer)
702 base->next_timer = base->timer_jiffies;
703 }
704 return 1;
705}
706
670/* 707/*
671 * We are using hashed locking: holding per_cpu(tvec_bases).lock 708 * We are using hashed locking: holding per_cpu(tvec_bases).lock
672 * means that all timers which are tied to this base via timer->base are 709 * means that all timers which are tied to this base via timer->base are
@@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
712 749
713 base = lock_timer_base(timer, &flags); 750 base = lock_timer_base(timer, &flags);
714 751
715 if (timer_pending(timer)) { 752 ret = detach_if_pending(timer, base, false);
716 detach_timer(timer, 0); 753 if (!ret && pending_only)
717 if (timer->expires == base->next_timer && 754 goto out_unlock;
718 !tbase_get_deferrable(timer->base))
719 base->next_timer = base->timer_jiffies;
720 ret = 1;
721 } else {
722 if (pending_only)
723 goto out_unlock;
724 }
725 755
726 debug_activate(timer, expires); 756 debug_activate(timer, expires);
727 757
@@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
752 } 782 }
753 783
754 timer->expires = expires; 784 timer->expires = expires;
755 if (time_before(timer->expires, base->next_timer) &&
756 !tbase_get_deferrable(timer->base))
757 base->next_timer = timer->expires;
758 internal_add_timer(base, timer); 785 internal_add_timer(base, timer);
759 786
760out_unlock: 787out_unlock:
@@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
920 spin_lock_irqsave(&base->lock, flags); 947 spin_lock_irqsave(&base->lock, flags);
921 timer_set_base(timer, base); 948 timer_set_base(timer, base);
922 debug_activate(timer, timer->expires); 949 debug_activate(timer, timer->expires);
923 if (time_before(timer->expires, base->next_timer) &&
924 !tbase_get_deferrable(timer->base))
925 base->next_timer = timer->expires;
926 internal_add_timer(base, timer); 950 internal_add_timer(base, timer);
927 /* 951 /*
928 * Check whether the other CPU is idle and needs to be 952 * Check whether the other CPU is idle and needs to be
@@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer)
959 timer_stats_timer_clear_start_info(timer); 983 timer_stats_timer_clear_start_info(timer);
960 if (timer_pending(timer)) { 984 if (timer_pending(timer)) {
961 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
962 if (timer_pending(timer)) { 986 ret = detach_if_pending(timer, base, true);
963 detach_timer(timer, 1);
964 if (timer->expires == base->next_timer &&
965 !tbase_get_deferrable(timer->base))
966 base->next_timer = base->timer_jiffies;
967 ret = 1;
968 }
969 spin_unlock_irqrestore(&base->lock, flags); 987 spin_unlock_irqrestore(&base->lock, flags);
970 } 988 }
971 989
@@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer)
990 1008
991 base = lock_timer_base(timer, &flags); 1009 base = lock_timer_base(timer, &flags);
992 1010
993 if (base->running_timer == timer) 1011 if (base->running_timer != timer) {
994 goto out; 1012 timer_stats_timer_clear_start_info(timer);
995 1013 ret = detach_if_pending(timer, base, true);
996 timer_stats_timer_clear_start_info(timer);
997 ret = 0;
998 if (timer_pending(timer)) {
999 detach_timer(timer, 1);
1000 if (timer->expires == base->next_timer &&
1001 !tbase_get_deferrable(timer->base))
1002 base->next_timer = base->timer_jiffies;
1003 ret = 1;
1004 } 1014 }
1005out:
1006 spin_unlock_irqrestore(&base->lock, flags); 1015 spin_unlock_irqrestore(&base->lock, flags);
1007 1016
1008 return ret; 1017 return ret;
@@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1089 */ 1098 */
1090 list_for_each_entry_safe(timer, tmp, &tv_list, entry) { 1099 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1091 BUG_ON(tbase_get_base(timer->base) != base); 1100 BUG_ON(tbase_get_base(timer->base) != base);
1092 internal_add_timer(base, timer); 1101 /* No accounting, while moving them */
1102 __internal_add_timer(base, timer);
1093 } 1103 }
1094 1104
1095 return index; 1105 return index;
@@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base)
1178 timer_stats_account_timer(timer); 1188 timer_stats_account_timer(timer);
1179 1189
1180 base->running_timer = timer; 1190 base->running_timer = timer;
1181 detach_timer(timer, 1); 1191 detach_expired_timer(timer, base);
1182 1192
1183 spin_unlock_irq(&base->lock); 1193 spin_unlock_irq(&base->lock);
1184 call_timer_fn(timer, fn, data); 1194 call_timer_fn(timer, fn, data);
@@ -1316,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1316unsigned long get_next_timer_interrupt(unsigned long now) 1326unsigned long get_next_timer_interrupt(unsigned long now)
1317{ 1327{
1318 struct tvec_base *base = __this_cpu_read(tvec_bases); 1328 struct tvec_base *base = __this_cpu_read(tvec_bases);
1319 unsigned long expires; 1329 unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
1320 1330
1321 /* 1331 /*
1322 * Pretend that there is no timer pending if the cpu is offline. 1332 * Pretend that there is no timer pending if the cpu is offline.
1323 * Possible pending timers will be migrated later to an active cpu. 1333 * Possible pending timers will be migrated later to an active cpu.
1324 */ 1334 */
1325 if (cpu_is_offline(smp_processor_id())) 1335 if (cpu_is_offline(smp_processor_id()))
1326 return now + NEXT_TIMER_MAX_DELTA; 1336 return expires;
1337
1327 spin_lock(&base->lock); 1338 spin_lock(&base->lock);
1328 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1339 if (base->active_timers) {
1329 base->next_timer = __next_timer_interrupt(base); 1340 if (time_before_eq(base->next_timer, base->timer_jiffies))
1330 expires = base->next_timer; 1341 base->next_timer = __next_timer_interrupt(base);
1342 expires = base->next_timer;
1343 }
1331 spin_unlock(&base->lock); 1344 spin_unlock(&base->lock);
1332 1345
1333 if (time_before_eq(expires, now)) 1346 if (time_before_eq(expires, now))
@@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1704 1717
1705 base->timer_jiffies = jiffies; 1718 base->timer_jiffies = jiffies;
1706 base->next_timer = base->timer_jiffies; 1719 base->next_timer = base->timer_jiffies;
1720 base->active_timers = 0;
1707 return 0; 1721 return 0;
1708} 1722}
1709 1723
@@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1714 1728
1715 while (!list_empty(head)) { 1729 while (!list_empty(head)) {
1716 timer = list_first_entry(head, struct timer_list, entry); 1730 timer = list_first_entry(head, struct timer_list, entry);
1717 detach_timer(timer, 0); 1731 /* We ignore the accounting on the dying cpu */
1732 detach_timer(timer, false);
1718 timer_set_base(timer, new_base); 1733 timer_set_base(timer, new_base);
1719 if (time_before(timer->expires, new_base->next_timer) &&
1720 !tbase_get_deferrable(timer->base))
1721 new_base->next_timer = timer->expires;
1722 internal_add_timer(new_base, timer); 1734 internal_add_timer(new_base, timer);
1723 } 1735 }
1724} 1736}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a008663d86c8..b4f20fba09fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
312 312
313static int __register_ftrace_function(struct ftrace_ops *ops) 313static int __register_ftrace_function(struct ftrace_ops *ops)
314{ 314{
315 if (ftrace_disabled) 315 if (unlikely(ftrace_disabled))
316 return -ENODEV; 316 return -ENODEV;
317 317
318 if (FTRACE_WARN_ON(ops == &global_ops)) 318 if (FTRACE_WARN_ON(ops == &global_ops))
@@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
4299 4299
4300 mutex_lock(&ftrace_lock); 4300 mutex_lock(&ftrace_lock);
4301 4301
4302 if (unlikely(ftrace_disabled))
4303 goto out_unlock;
4304
4305 ret = __register_ftrace_function(ops); 4302 ret = __register_ftrace_function(ops);
4306 if (!ret) 4303 if (!ret)
4307 ret = ftrace_startup(ops, 0); 4304 ret = ftrace_startup(ops, 0);
4308 4305
4309
4310 out_unlock:
4311 mutex_unlock(&ftrace_lock); 4306 mutex_unlock(&ftrace_lock);
4307
4312 return ret; 4308 return ret;
4313} 4309}
4314EXPORT_SYMBOL_GPL(register_ftrace_function); 4310EXPORT_SYMBOL_GPL(register_ftrace_function);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5e..49491fa7daa2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1075 rb_init_page(bpage->page); 1075 rb_init_page(bpage->page);
1076 1076
1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1078 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1078 1079
1079 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1080 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1080 if (ret < 0) 1081 if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1346 * If something was added to this page, it was full 1347 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the 1348 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here. 1349 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is 1350 * Increment overrun to account for the lost events.
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */ 1351 */
1352 local_add(page_entries, &cpu_buffer->overrun);
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); 1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 } 1354 }
1355 1355
@@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page) 3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
3240 goto out; 3240 goto out;
3241 3241
3242 /* Don't bother swapping if the ring buffer is empty */
3243 if (rb_num_of_entries(cpu_buffer) == 0)
3244 goto out;
3245
3242 /* 3246 /*
3243 * Reset the reader page to size zero. 3247 * Reset the reader page to size zero.
3244 */ 3248 */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a7fa0702be1c..a120f98c4112 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -830,6 +830,8 @@ int register_tracer(struct tracer *type)
830 current_trace = saved_tracer; 830 current_trace = saved_tracer;
831 if (ret) { 831 if (ret) {
832 printk(KERN_CONT "FAILED!\n"); 832 printk(KERN_CONT "FAILED!\n");
833 /* Add the warning after printing 'FAILED' */
834 WARN_ON(1);
833 goto out; 835 goto out;
834 } 836 }
835 /* Only reset on passing, to avoid touching corrupted buffers */ 837 /* Only reset on passing, to avoid touching corrupted buffers */
@@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1708 1710
1709static void trace_iterator_increment(struct trace_iterator *iter) 1711static void trace_iterator_increment(struct trace_iterator *iter)
1710{ 1712{
1713 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
1714
1711 iter->idx++; 1715 iter->idx++;
1712 if (iter->buffer_iter[iter->cpu]) 1716 if (buf_iter)
1713 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); 1717 ring_buffer_read(buf_iter, NULL);
1714} 1718}
1715 1719
1716static struct trace_entry * 1720static struct trace_entry *
@@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1718 unsigned long *lost_events) 1722 unsigned long *lost_events)
1719{ 1723{
1720 struct ring_buffer_event *event; 1724 struct ring_buffer_event *event;
1721 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1725 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
1722 1726
1723 if (buf_iter) 1727 if (buf_iter)
1724 event = ring_buffer_iter_peek(buf_iter, ts); 1728 event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1856 1860
1857 tr->data[cpu]->skipped_entries = 0; 1861 tr->data[cpu]->skipped_entries = 0;
1858 1862
1859 if (!iter->buffer_iter[cpu]) 1863 buf_iter = trace_buffer_iter(iter, cpu);
1864 if (!buf_iter)
1860 return; 1865 return;
1861 1866
1862 buf_iter = iter->buffer_iter[cpu];
1863 ring_buffer_iter_reset(buf_iter); 1867 ring_buffer_iter_reset(buf_iter);
1864 1868
1865 /* 1869 /*
@@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2205 2209
2206int trace_empty(struct trace_iterator *iter) 2210int trace_empty(struct trace_iterator *iter)
2207{ 2211{
2212 struct ring_buffer_iter *buf_iter;
2208 int cpu; 2213 int cpu;
2209 2214
2210 /* If we are looking at one CPU buffer, only check that one */ 2215 /* If we are looking at one CPU buffer, only check that one */
2211 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2216 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
2212 cpu = iter->cpu_file; 2217 cpu = iter->cpu_file;
2213 if (iter->buffer_iter[cpu]) { 2218 buf_iter = trace_buffer_iter(iter, cpu);
2214 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2219 if (buf_iter) {
2220 if (!ring_buffer_iter_empty(buf_iter))
2215 return 0; 2221 return 0;
2216 } else { 2222 } else {
2217 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2223 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter)
2221 } 2227 }
2222 2228
2223 for_each_tracing_cpu(cpu) { 2229 for_each_tracing_cpu(cpu) {
2224 if (iter->buffer_iter[cpu]) { 2230 buf_iter = trace_buffer_iter(iter, cpu);
2225 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2231 if (buf_iter) {
2232 if (!ring_buffer_iter_empty(buf_iter))
2226 return 0; 2233 return 0;
2227 } else { 2234 } else {
2228 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2235 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file)
2381 if (!iter) 2388 if (!iter)
2382 return ERR_PTR(-ENOMEM); 2389 return ERR_PTR(-ENOMEM);
2383 2390
2391 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
2392 GFP_KERNEL);
2393 if (!iter->buffer_iter)
2394 goto release;
2395
2384 /* 2396 /*
2385 * We make a copy of the current tracer to avoid concurrent 2397 * We make a copy of the current tracer to avoid concurrent
2386 * changes on it while we are reading. 2398 * changes on it while we are reading.
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file)
2441 fail: 2453 fail:
2442 mutex_unlock(&trace_types_lock); 2454 mutex_unlock(&trace_types_lock);
2443 kfree(iter->trace); 2455 kfree(iter->trace);
2456 kfree(iter->buffer_iter);
2457release:
2444 seq_release_private(inode, file); 2458 seq_release_private(inode, file);
2445 return ERR_PTR(-ENOMEM); 2459 return ERR_PTR(-ENOMEM);
2446} 2460}
@@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2481 mutex_destroy(&iter->mutex); 2495 mutex_destroy(&iter->mutex);
2482 free_cpumask_var(iter->started); 2496 free_cpumask_var(iter->started);
2483 kfree(iter->trace); 2497 kfree(iter->trace);
2498 kfree(iter->buffer_iter);
2484 seq_release_private(inode, file); 2499 seq_release_private(inode, file);
2485 return 0; 2500 return 0;
2486} 2501}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5aec220d2de0..55e1f7f0db12 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -317,6 +317,14 @@ struct tracer {
317 317
318#define TRACE_PIPE_ALL_CPU -1 318#define TRACE_PIPE_ALL_CPU -1
319 319
320static inline struct ring_buffer_iter *
321trace_buffer_iter(struct trace_iterator *iter, int cpu)
322{
323 if (iter->buffer_iter && iter->buffer_iter[cpu])
324 return iter->buffer_iter[cpu];
325 return NULL;
326}
327
320int tracer_init(struct tracer *t, struct trace_array *tr); 328int tracer_init(struct tracer *t, struct trace_array *tr);
321int tracing_is_enabled(void); 329int tracing_is_enabled(void);
322void trace_wake_up(void); 330void trace_wake_up(void);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7d2a4c653d8..ce27c8ba8d31 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
538 next = &data->ret; 538 next = &data->ret;
539 } else { 539 } else {
540 540
541 ring_iter = iter->buffer_iter[iter->cpu]; 541 ring_iter = trace_buffer_iter(iter, iter->cpu);
542 542
543 /* First peek to compare current entry and the next one */ 543 /* First peek to compare current entry and the next one */
544 if (ring_iter) 544 if (ring_iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df611a0e76c5..123b189c732c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1325,4 +1325,4 @@ __init static int init_events(void)
1325 1325
1326 return 0; 1326 return 0;
1327} 1327}
1328device_initcall(init_events); 1328early_initcall(init_events);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..692d97628a10 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,41 @@
45#include "workqueue_sched.h" 45#include "workqueue_sched.h"
46 46
47enum { 47enum {
48 /* global_cwq flags */ 48 /*
49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 49 * global_cwq flags
50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ 50 *
51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 51 * A bound gcwq is either associated or disassociated with its CPU.
52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */ 52 * While associated (!DISASSOCIATED), all workers are bound to the
53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ 53 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect.
55 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 *
60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress.
63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
54 69
55 /* worker flags */ 70 /* worker flags */
56 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
57 WORKER_DIE = 1 << 1, /* die die die */ 72 WORKER_DIE = 1 << 1, /* die die die */
58 WORKER_IDLE = 1 << 2, /* is idle */ 73 WORKER_IDLE = 1 << 2, /* is idle */
59 WORKER_PREP = 1 << 3, /* preparing to run works */ 74 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */ 75 WORKER_REBIND = 1 << 5, /* mom is home, come back */
62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
64 78
65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | 79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND, 80 WORKER_CPU_INTENSIVE,
67 81
68 /* gcwq->trustee_state */ 82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
74 83
75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, 85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +93,13 @@ enum {
84 (min two ticks) */ 93 (min two ticks) */
85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 94 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
86 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 95 CREATE_COOLDOWN = HZ, /* time to breath after fail */
87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
88 96
89 /* 97 /*
90 * Rescue workers are used only on emergencies and shared by 98 * Rescue workers are used only on emergencies and shared by
91 * all cpus. Give -20. 99 * all cpus. Give -20.
92 */ 100 */
93 RESCUER_NICE_LEVEL = -20, 101 RESCUER_NICE_LEVEL = -20,
102 HIGHPRI_NICE_LEVEL = -20,
94}; 103};
95 104
96/* 105/*
@@ -115,6 +124,8 @@ enum {
115 */ 124 */
116 125
117struct global_cwq; 126struct global_cwq;
127struct worker_pool;
128struct idle_rebind;
118 129
119/* 130/*
120 * The poor guys doing the actual heavy lifting. All on-duty workers 131 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -131,12 +142,31 @@ struct worker {
131 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ 142 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
132 struct list_head scheduled; /* L: scheduled works */ 143 struct list_head scheduled; /* L: scheduled works */
133 struct task_struct *task; /* I: worker task */ 144 struct task_struct *task; /* I: worker task */
134 struct global_cwq *gcwq; /* I: the associated gcwq */ 145 struct worker_pool *pool; /* I: the associated pool */
135 /* 64 bytes boundary on 64bit, 32 on 32bit */ 146 /* 64 bytes boundary on 64bit, 32 on 32bit */
136 unsigned long last_active; /* L: last active timestamp */ 147 unsigned long last_active; /* L: last active timestamp */
137 unsigned int flags; /* X: flags */ 148 unsigned int flags; /* X: flags */
138 int id; /* I: worker id */ 149 int id; /* I: worker id */
139 struct work_struct rebind_work; /* L: rebind worker to cpu */ 150
151 /* for rebinding worker to CPU */
152 struct idle_rebind *idle_rebind; /* L: for idle worker */
153 struct work_struct rebind_work; /* L: for busy worker */
154};
155
156struct worker_pool {
157 struct global_cwq *gcwq; /* I: the owning gcwq */
158 unsigned int flags; /* X: flags */
159
160 struct list_head worklist; /* L: list of pending works */
161 int nr_workers; /* L: total number of workers */
162 int nr_idle; /* L: currently idle ones */
163
164 struct list_head idle_list; /* X: list of idle workers */
165 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */
167
168 struct mutex manager_mutex; /* mutex manager should hold */
169 struct ida worker_ida; /* L: for worker IDs */
140}; 170};
141 171
142/* 172/*
@@ -146,27 +176,16 @@ struct worker {
146 */ 176 */
147struct global_cwq { 177struct global_cwq {
148 spinlock_t lock; /* the gcwq lock */ 178 spinlock_t lock; /* the gcwq lock */
149 struct list_head worklist; /* L: list of pending works */
150 unsigned int cpu; /* I: the associated cpu */ 179 unsigned int cpu; /* I: the associated cpu */
151 unsigned int flags; /* L: GCWQ_* flags */ 180 unsigned int flags; /* L: GCWQ_* flags */
152 181
153 int nr_workers; /* L: total number of workers */ 182 /* workers are chained either in busy_hash or pool idle_list */
154 int nr_idle; /* L: currently idle ones */
155
156 /* workers are chained either in the idle_list or busy_hash */
157 struct list_head idle_list; /* X: list of idle workers */
158 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
159 /* L: hash of busy workers */ 184 /* L: hash of busy workers */
160 185
161 struct timer_list idle_timer; /* L: worker idle timeout */ 186 struct worker_pool pools[2]; /* normal and highpri pools */
162 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
163
164 struct ida worker_ida; /* L: for worker IDs */
165 187
166 struct task_struct *trustee; /* L: for gcwq shutdown */ 188 wait_queue_head_t rebind_hold; /* rebind hold wait */
167 unsigned int trustee_state; /* L: trustee state */
168 wait_queue_head_t trustee_wait; /* trustee wait */
169 struct worker *first_idle; /* L: first idle worker */
170} ____cacheline_aligned_in_smp; 189} ____cacheline_aligned_in_smp;
171 190
172/* 191/*
@@ -175,7 +194,7 @@ struct global_cwq {
175 * aligned at two's power of the number of flag bits. 194 * aligned at two's power of the number of flag bits.
176 */ 195 */
177struct cpu_workqueue_struct { 196struct cpu_workqueue_struct {
178 struct global_cwq *gcwq; /* I: the associated gcwq */ 197 struct worker_pool *pool; /* I: the associated pool */
179 struct workqueue_struct *wq; /* I: the owning workqueue */ 198 struct workqueue_struct *wq; /* I: the owning workqueue */
180 int work_color; /* L: current color */ 199 int work_color; /* L: current color */
181 int flush_color; /* L: flushing color */ 200 int flush_color; /* L: flushing color */
@@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
264#define CREATE_TRACE_POINTS 283#define CREATE_TRACE_POINTS
265#include <trace/events/workqueue.h> 284#include <trace/events/workqueue.h>
266 285
286#define for_each_worker_pool(pool, gcwq) \
287 for ((pool) = &(gcwq)->pools[0]; \
288 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
289
267#define for_each_busy_worker(worker, i, pos, gcwq) \ 290#define for_each_busy_worker(worker, i, pos, gcwq) \
268 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 291 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
269 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 292 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
444 * try_to_wake_up(). Put it in a separate cacheline. 467 * try_to_wake_up(). Put it in a separate cacheline.
445 */ 468 */
446static DEFINE_PER_CPU(struct global_cwq, global_cwq); 469static DEFINE_PER_CPU(struct global_cwq, global_cwq);
447static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); 470static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
448 471
449/* 472/*
450 * Global cpu workqueue and nr_running counter for unbound gcwq. The 473 * Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
452 * workers have WORKER_UNBOUND set. 475 * workers have WORKER_UNBOUND set.
453 */ 476 */
454static struct global_cwq unbound_global_cwq; 477static struct global_cwq unbound_global_cwq;
455static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ 478static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
479 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
480};
456 481
457static int worker_thread(void *__worker); 482static int worker_thread(void *__worker);
458 483
484static int worker_pool_pri(struct worker_pool *pool)
485{
486 return pool - pool->gcwq->pools;
487}
488
459static struct global_cwq *get_gcwq(unsigned int cpu) 489static struct global_cwq *get_gcwq(unsigned int cpu)
460{ 490{
461 if (cpu != WORK_CPU_UNBOUND) 491 if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
464 return &unbound_global_cwq; 494 return &unbound_global_cwq;
465} 495}
466 496
467static atomic_t *get_gcwq_nr_running(unsigned int cpu) 497static atomic_t *get_pool_nr_running(struct worker_pool *pool)
468{ 498{
499 int cpu = pool->gcwq->cpu;
500 int idx = worker_pool_pri(pool);
501
469 if (cpu != WORK_CPU_UNBOUND) 502 if (cpu != WORK_CPU_UNBOUND)
470 return &per_cpu(gcwq_nr_running, cpu); 503 return &per_cpu(pool_nr_running, cpu)[idx];
471 else 504 else
472 return &unbound_gcwq_nr_running; 505 return &unbound_pool_nr_running[idx];
473} 506}
474 507
475static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 508static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
@@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
555 588
556 if (data & WORK_STRUCT_CWQ) 589 if (data & WORK_STRUCT_CWQ)
557 return ((struct cpu_workqueue_struct *) 590 return ((struct cpu_workqueue_struct *)
558 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; 591 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
559 592
560 cpu = data >> WORK_STRUCT_FLAG_BITS; 593 cpu = data >> WORK_STRUCT_FLAG_BITS;
561 if (cpu == WORK_CPU_NONE) 594 if (cpu == WORK_CPU_NONE)
@@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
566} 599}
567 600
568/* 601/*
569 * Policy functions. These define the policies on how the global 602 * Policy functions. These define the policies on how the global worker
570 * worker pool is managed. Unless noted otherwise, these functions 603 * pools are managed. Unless noted otherwise, these functions assume that
571 * assume that they're being called with gcwq->lock held. 604 * they're being called with gcwq->lock held.
572 */ 605 */
573 606
574static bool __need_more_worker(struct global_cwq *gcwq) 607static bool __need_more_worker(struct worker_pool *pool)
575{ 608{
576 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || 609 return !atomic_read(get_pool_nr_running(pool));
577 gcwq->flags & GCWQ_HIGHPRI_PENDING;
578} 610}
579 611
580/* 612/*
581 * Need to wake up a worker? Called from anything but currently 613 * Need to wake up a worker? Called from anything but currently
582 * running workers. 614 * running workers.
615 *
616 * Note that, because unbound workers never contribute to nr_running, this
617 * function will always return %true for unbound gcwq as long as the
618 * worklist isn't empty.
583 */ 619 */
584static bool need_more_worker(struct global_cwq *gcwq) 620static bool need_more_worker(struct worker_pool *pool)
585{ 621{
586 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); 622 return !list_empty(&pool->worklist) && __need_more_worker(pool);
587} 623}
588 624
589/* Can I start working? Called from busy but !running workers. */ 625/* Can I start working? Called from busy but !running workers. */
590static bool may_start_working(struct global_cwq *gcwq) 626static bool may_start_working(struct worker_pool *pool)
591{ 627{
592 return gcwq->nr_idle; 628 return pool->nr_idle;
593} 629}
594 630
595/* Do I need to keep working? Called from currently running workers. */ 631/* Do I need to keep working? Called from currently running workers. */
596static bool keep_working(struct global_cwq *gcwq) 632static bool keep_working(struct worker_pool *pool)
597{ 633{
598 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 634 atomic_t *nr_running = get_pool_nr_running(pool);
599 635
600 return !list_empty(&gcwq->worklist) && 636 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
601 (atomic_read(nr_running) <= 1 ||
602 gcwq->flags & GCWQ_HIGHPRI_PENDING);
603} 637}
604 638
605/* Do we need a new worker? Called from manager. */ 639/* Do we need a new worker? Called from manager. */
606static bool need_to_create_worker(struct global_cwq *gcwq) 640static bool need_to_create_worker(struct worker_pool *pool)
607{ 641{
608 return need_more_worker(gcwq) && !may_start_working(gcwq); 642 return need_more_worker(pool) && !may_start_working(pool);
609} 643}
610 644
611/* Do I need to be the manager? */ 645/* Do I need to be the manager? */
612static bool need_to_manage_workers(struct global_cwq *gcwq) 646static bool need_to_manage_workers(struct worker_pool *pool)
613{ 647{
614 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; 648 return need_to_create_worker(pool) ||
649 (pool->flags & POOL_MANAGE_WORKERS);
615} 650}
616 651
617/* Do we have too many workers and should some go away? */ 652/* Do we have too many workers and should some go away? */
618static bool too_many_workers(struct global_cwq *gcwq) 653static bool too_many_workers(struct worker_pool *pool)
619{ 654{
620 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; 655 bool managing = mutex_is_locked(&pool->manager_mutex);
621 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ 656 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
622 int nr_busy = gcwq->nr_workers - nr_idle; 657 int nr_busy = pool->nr_workers - nr_idle;
623 658
624 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 659 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
625} 660}
@@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
629 */ 664 */
630 665
631/* Return the first worker. Safe with preemption disabled */ 666/* Return the first worker. Safe with preemption disabled */
632static struct worker *first_worker(struct global_cwq *gcwq) 667static struct worker *first_worker(struct worker_pool *pool)
633{ 668{
634 if (unlikely(list_empty(&gcwq->idle_list))) 669 if (unlikely(list_empty(&pool->idle_list)))
635 return NULL; 670 return NULL;
636 671
637 return list_first_entry(&gcwq->idle_list, struct worker, entry); 672 return list_first_entry(&pool->idle_list, struct worker, entry);
638} 673}
639 674
640/** 675/**
641 * wake_up_worker - wake up an idle worker 676 * wake_up_worker - wake up an idle worker
642 * @gcwq: gcwq to wake worker for 677 * @pool: worker pool to wake worker from
643 * 678 *
644 * Wake up the first idle worker of @gcwq. 679 * Wake up the first idle worker of @pool.
645 * 680 *
646 * CONTEXT: 681 * CONTEXT:
647 * spin_lock_irq(gcwq->lock). 682 * spin_lock_irq(gcwq->lock).
648 */ 683 */
649static void wake_up_worker(struct global_cwq *gcwq) 684static void wake_up_worker(struct worker_pool *pool)
650{ 685{
651 struct worker *worker = first_worker(gcwq); 686 struct worker *worker = first_worker(pool);
652 687
653 if (likely(worker)) 688 if (likely(worker))
654 wake_up_process(worker->task); 689 wake_up_process(worker->task);
@@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
670 struct worker *worker = kthread_data(task); 705 struct worker *worker = kthread_data(task);
671 706
672 if (!(worker->flags & WORKER_NOT_RUNNING)) 707 if (!(worker->flags & WORKER_NOT_RUNNING))
673 atomic_inc(get_gcwq_nr_running(cpu)); 708 atomic_inc(get_pool_nr_running(worker->pool));
674} 709}
675 710
676/** 711/**
@@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
692 unsigned int cpu) 727 unsigned int cpu)
693{ 728{
694 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 729 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
695 struct global_cwq *gcwq = get_gcwq(cpu); 730 struct worker_pool *pool = worker->pool;
696 atomic_t *nr_running = get_gcwq_nr_running(cpu); 731 atomic_t *nr_running = get_pool_nr_running(pool);
697 732
698 if (worker->flags & WORKER_NOT_RUNNING) 733 if (worker->flags & WORKER_NOT_RUNNING)
699 return NULL; 734 return NULL;
@@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
706 * worklist not empty test sequence is in insert_work(). 741 * worklist not empty test sequence is in insert_work().
707 * Please read comment there. 742 * Please read comment there.
708 * 743 *
709 * NOT_RUNNING is clear. This means that trustee is not in 744 * NOT_RUNNING is clear. This means that we're bound to and
710 * charge and we're running on the local cpu w/ rq lock held 745 * running on the local cpu w/ rq lock held and preemption
711 * and preemption disabled, which in turn means that none else 746 * disabled, which in turn means that none else could be
712 * could be manipulating idle_list, so dereferencing idle_list 747 * manipulating idle_list, so dereferencing idle_list without gcwq
713 * without gcwq lock is safe. 748 * lock is safe.
714 */ 749 */
715 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) 750 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
716 to_wakeup = first_worker(gcwq); 751 to_wakeup = first_worker(pool);
717 return to_wakeup ? to_wakeup->task : NULL; 752 return to_wakeup ? to_wakeup->task : NULL;
718} 753}
719 754
@@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
733static inline void worker_set_flags(struct worker *worker, unsigned int flags, 768static inline void worker_set_flags(struct worker *worker, unsigned int flags,
734 bool wakeup) 769 bool wakeup)
735{ 770{
736 struct global_cwq *gcwq = worker->gcwq; 771 struct worker_pool *pool = worker->pool;
737 772
738 WARN_ON_ONCE(worker->task != current); 773 WARN_ON_ONCE(worker->task != current);
739 774
@@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
744 */ 779 */
745 if ((flags & WORKER_NOT_RUNNING) && 780 if ((flags & WORKER_NOT_RUNNING) &&
746 !(worker->flags & WORKER_NOT_RUNNING)) { 781 !(worker->flags & WORKER_NOT_RUNNING)) {
747 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 782 atomic_t *nr_running = get_pool_nr_running(pool);
748 783
749 if (wakeup) { 784 if (wakeup) {
750 if (atomic_dec_and_test(nr_running) && 785 if (atomic_dec_and_test(nr_running) &&
751 !list_empty(&gcwq->worklist)) 786 !list_empty(&pool->worklist))
752 wake_up_worker(gcwq); 787 wake_up_worker(pool);
753 } else 788 } else
754 atomic_dec(nr_running); 789 atomic_dec(nr_running);
755 } 790 }
@@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
769 */ 804 */
770static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 805static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
771{ 806{
772 struct global_cwq *gcwq = worker->gcwq; 807 struct worker_pool *pool = worker->pool;
773 unsigned int oflags = worker->flags; 808 unsigned int oflags = worker->flags;
774 809
775 WARN_ON_ONCE(worker->task != current); 810 WARN_ON_ONCE(worker->task != current);
@@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
783 */ 818 */
784 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 819 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
785 if (!(worker->flags & WORKER_NOT_RUNNING)) 820 if (!(worker->flags & WORKER_NOT_RUNNING))
786 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 821 atomic_inc(get_pool_nr_running(pool));
787} 822}
788 823
789/** 824/**
@@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
867} 902}
868 903
869/** 904/**
870 * gcwq_determine_ins_pos - find insertion position
871 * @gcwq: gcwq of interest
872 * @cwq: cwq a work is being queued for
873 *
874 * A work for @cwq is about to be queued on @gcwq, determine insertion
875 * position for the work. If @cwq is for HIGHPRI wq, the work is
876 * queued at the head of the queue but in FIFO order with respect to
877 * other HIGHPRI works; otherwise, at the end of the queue. This
878 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
879 * there are HIGHPRI works pending.
880 *
881 * CONTEXT:
882 * spin_lock_irq(gcwq->lock).
883 *
884 * RETURNS:
885 * Pointer to inserstion position.
886 */
887static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
888 struct cpu_workqueue_struct *cwq)
889{
890 struct work_struct *twork;
891
892 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
893 return &gcwq->worklist;
894
895 list_for_each_entry(twork, &gcwq->worklist, entry) {
896 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
897
898 if (!(tcwq->wq->flags & WQ_HIGHPRI))
899 break;
900 }
901
902 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
903 return &twork->entry;
904}
905
906/**
907 * insert_work - insert a work into gcwq 905 * insert_work - insert a work into gcwq
908 * @cwq: cwq @work belongs to 906 * @cwq: cwq @work belongs to
909 * @work: work to insert 907 * @work: work to insert
@@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
920 struct work_struct *work, struct list_head *head, 918 struct work_struct *work, struct list_head *head,
921 unsigned int extra_flags) 919 unsigned int extra_flags)
922{ 920{
923 struct global_cwq *gcwq = cwq->gcwq; 921 struct worker_pool *pool = cwq->pool;
924 922
925 /* we own @work, set data and link */ 923 /* we own @work, set data and link */
926 set_work_cwq(work, cwq, extra_flags); 924 set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
940 */ 938 */
941 smp_mb(); 939 smp_mb();
942 940
943 if (__need_more_worker(gcwq)) 941 if (__need_more_worker(pool))
944 wake_up_worker(gcwq); 942 wake_up_worker(pool);
945} 943}
946 944
947/* 945/*
@@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1043 if (likely(cwq->nr_active < cwq->max_active)) { 1041 if (likely(cwq->nr_active < cwq->max_active)) {
1044 trace_workqueue_activate_work(work); 1042 trace_workqueue_activate_work(work);
1045 cwq->nr_active++; 1043 cwq->nr_active++;
1046 worklist = gcwq_determine_ins_pos(gcwq, cwq); 1044 worklist = &cwq->pool->worklist;
1047 } else { 1045 } else {
1048 work_flags |= WORK_STRUCT_DELAYED; 1046 work_flags |= WORK_STRUCT_DELAYED;
1049 worklist = &cwq->delayed_works; 1047 worklist = &cwq->delayed_works;
@@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1192 */ 1190 */
1193static void worker_enter_idle(struct worker *worker) 1191static void worker_enter_idle(struct worker *worker)
1194{ 1192{
1195 struct global_cwq *gcwq = worker->gcwq; 1193 struct worker_pool *pool = worker->pool;
1194 struct global_cwq *gcwq = pool->gcwq;
1196 1195
1197 BUG_ON(worker->flags & WORKER_IDLE); 1196 BUG_ON(worker->flags & WORKER_IDLE);
1198 BUG_ON(!list_empty(&worker->entry) && 1197 BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker)
1200 1199
1201 /* can't use worker_set_flags(), also called from start_worker() */ 1200 /* can't use worker_set_flags(), also called from start_worker() */
1202 worker->flags |= WORKER_IDLE; 1201 worker->flags |= WORKER_IDLE;
1203 gcwq->nr_idle++; 1202 pool->nr_idle++;
1204 worker->last_active = jiffies; 1203 worker->last_active = jiffies;
1205 1204
1206 /* idle_list is LIFO */ 1205 /* idle_list is LIFO */
1207 list_add(&worker->entry, &gcwq->idle_list); 1206 list_add(&worker->entry, &pool->idle_list);
1208 1207
1209 if (likely(!(worker->flags & WORKER_ROGUE))) { 1208 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1210 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) 1209 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1211 mod_timer(&gcwq->idle_timer,
1212 jiffies + IDLE_WORKER_TIMEOUT);
1213 } else
1214 wake_up_all(&gcwq->trustee_wait);
1215 1210
1216 /* 1211 /*
1217 * Sanity check nr_running. Because trustee releases gcwq->lock 1212 * Sanity check nr_running. Because gcwq_unbind_fn() releases
1218 * between setting %WORKER_ROGUE and zapping nr_running, the 1213 * gcwq->lock between setting %WORKER_UNBOUND and zapping
1219 * warning may trigger spuriously. Check iff trustee is idle. 1214 * nr_running, the warning may trigger spuriously. Check iff
1215 * unbind is not in progress.
1220 */ 1216 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && 1217 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1222 gcwq->nr_workers == gcwq->nr_idle && 1218 pool->nr_workers == pool->nr_idle &&
1223 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1219 atomic_read(get_pool_nr_running(pool)));
1224} 1220}
1225 1221
1226/** 1222/**
@@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker)
1234 */ 1230 */
1235static void worker_leave_idle(struct worker *worker) 1231static void worker_leave_idle(struct worker *worker)
1236{ 1232{
1237 struct global_cwq *gcwq = worker->gcwq; 1233 struct worker_pool *pool = worker->pool;
1238 1234
1239 BUG_ON(!(worker->flags & WORKER_IDLE)); 1235 BUG_ON(!(worker->flags & WORKER_IDLE));
1240 worker_clr_flags(worker, WORKER_IDLE); 1236 worker_clr_flags(worker, WORKER_IDLE);
1241 gcwq->nr_idle--; 1237 pool->nr_idle--;
1242 list_del_init(&worker->entry); 1238 list_del_init(&worker->entry);
1243} 1239}
1244 1240
@@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker)
1258 * verbatim as it's best effort and blocking and gcwq may be 1254 * verbatim as it's best effort and blocking and gcwq may be
1259 * [dis]associated in the meantime. 1255 * [dis]associated in the meantime.
1260 * 1256 *
1261 * This function tries set_cpus_allowed() and locks gcwq and verifies 1257 * This function tries set_cpus_allowed() and locks gcwq and verifies the
1262 * the binding against GCWQ_DISASSOCIATED which is set during 1258 * binding against %GCWQ_DISASSOCIATED which is set during
1263 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters 1259 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1264 * idle state or fetches works without dropping lock, it can guarantee 1260 * enters idle state or fetches works without dropping lock, it can
1265 * the scheduling requirement described in the first paragraph. 1261 * guarantee the scheduling requirement described in the first paragraph.
1266 * 1262 *
1267 * CONTEXT: 1263 * CONTEXT:
1268 * Might sleep. Called without any lock but returns with gcwq->lock 1264 * Might sleep. Called without any lock but returns with gcwq->lock
@@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker)
1275static bool worker_maybe_bind_and_lock(struct worker *worker) 1271static bool worker_maybe_bind_and_lock(struct worker *worker)
1276__acquires(&gcwq->lock) 1272__acquires(&gcwq->lock)
1277{ 1273{
1278 struct global_cwq *gcwq = worker->gcwq; 1274 struct global_cwq *gcwq = worker->pool->gcwq;
1279 struct task_struct *task = worker->task; 1275 struct task_struct *task = worker->task;
1280 1276
1281 while (true) { 1277 while (true) {
@@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock)
1308 } 1304 }
1309} 1305}
1310 1306
1307struct idle_rebind {
1308 int cnt; /* # workers to be rebound */
1309 struct completion done; /* all workers rebound */
1310};
1311
1312/*
1313 * Rebind an idle @worker to its CPU. During CPU onlining, this has to
1314 * happen synchronously for idle workers. worker_thread() will test
1315 * %WORKER_REBIND before leaving idle and call this function.
1316 */
1317static void idle_worker_rebind(struct worker *worker)
1318{
1319 struct global_cwq *gcwq = worker->pool->gcwq;
1320
1321 /* CPU must be online at this point */
1322 WARN_ON(!worker_maybe_bind_and_lock(worker));
1323 if (!--worker->idle_rebind->cnt)
1324 complete(&worker->idle_rebind->done);
1325 spin_unlock_irq(&worker->pool->gcwq->lock);
1326
1327 /* we did our part, wait for rebind_workers() to finish up */
1328 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
1329}
1330
1311/* 1331/*
1312 * Function for worker->rebind_work used to rebind rogue busy workers 1332 * Function for @worker->rebind.work used to rebind unbound busy workers to
1313 * to the associated cpu which is coming back online. This is 1333 * the associated cpu which is coming back online. This is scheduled by
1314 * scheduled by cpu up but can race with other cpu hotplug operations 1334 * cpu up but can race with other cpu hotplug operations and may be
1315 * and may be executed twice without intervening cpu down. 1335 * executed twice without intervening cpu down.
1316 */ 1336 */
1317static void worker_rebind_fn(struct work_struct *work) 1337static void busy_worker_rebind_fn(struct work_struct *work)
1318{ 1338{
1319 struct worker *worker = container_of(work, struct worker, rebind_work); 1339 struct worker *worker = container_of(work, struct worker, rebind_work);
1320 struct global_cwq *gcwq = worker->gcwq; 1340 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1341
1322 if (worker_maybe_bind_and_lock(worker)) 1342 if (worker_maybe_bind_and_lock(worker))
1323 worker_clr_flags(worker, WORKER_REBIND); 1343 worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work)
1325 spin_unlock_irq(&gcwq->lock); 1345 spin_unlock_irq(&gcwq->lock);
1326} 1346}
1327 1347
1348/**
1349 * rebind_workers - rebind all workers of a gcwq to the associated CPU
1350 * @gcwq: gcwq of interest
1351 *
1352 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1353 * is different for idle and busy ones.
1354 *
1355 * The idle ones should be rebound synchronously and idle rebinding should
1356 * be complete before any worker starts executing work items with
1357 * concurrency management enabled; otherwise, scheduler may oops trying to
1358 * wake up non-local idle worker from wq_worker_sleeping().
1359 *
1360 * This is achieved by repeatedly requesting rebinding until all idle
1361 * workers are known to have been rebound under @gcwq->lock and holding all
1362 * idle workers from becoming busy until idle rebinding is complete.
1363 *
1364 * Once idle workers are rebound, busy workers can be rebound as they
1365 * finish executing their current work items. Queueing the rebind work at
1366 * the head of their scheduled lists is enough. Note that nr_running will
1367 * be properbly bumped as busy workers rebind.
1368 *
1369 * On return, all workers are guaranteed to either be bound or have rebind
1370 * work item scheduled.
1371 */
1372static void rebind_workers(struct global_cwq *gcwq)
1373 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1374{
1375 struct idle_rebind idle_rebind;
1376 struct worker_pool *pool;
1377 struct worker *worker;
1378 struct hlist_node *pos;
1379 int i;
1380
1381 lockdep_assert_held(&gcwq->lock);
1382
1383 for_each_worker_pool(pool, gcwq)
1384 lockdep_assert_held(&pool->manager_mutex);
1385
1386 /*
1387 * Rebind idle workers. Interlocked both ways. We wait for
1388 * workers to rebind via @idle_rebind.done. Workers will wait for
1389 * us to finish up by watching %WORKER_REBIND.
1390 */
1391 init_completion(&idle_rebind.done);
1392retry:
1393 idle_rebind.cnt = 1;
1394 INIT_COMPLETION(idle_rebind.done);
1395
1396 /* set REBIND and kick idle ones, we'll wait for these later */
1397 for_each_worker_pool(pool, gcwq) {
1398 list_for_each_entry(worker, &pool->idle_list, entry) {
1399 if (worker->flags & WORKER_REBIND)
1400 continue;
1401
1402 /* morph UNBOUND to REBIND */
1403 worker->flags &= ~WORKER_UNBOUND;
1404 worker->flags |= WORKER_REBIND;
1405
1406 idle_rebind.cnt++;
1407 worker->idle_rebind = &idle_rebind;
1408
1409 /* worker_thread() will call idle_worker_rebind() */
1410 wake_up_process(worker->task);
1411 }
1412 }
1413
1414 if (--idle_rebind.cnt) {
1415 spin_unlock_irq(&gcwq->lock);
1416 wait_for_completion(&idle_rebind.done);
1417 spin_lock_irq(&gcwq->lock);
1418 /* busy ones might have become idle while waiting, retry */
1419 goto retry;
1420 }
1421
1422 /*
1423 * All idle workers are rebound and waiting for %WORKER_REBIND to
1424 * be cleared inside idle_worker_rebind(). Clear and release.
1425 * Clearing %WORKER_REBIND from this foreign context is safe
1426 * because these workers are still guaranteed to be idle.
1427 */
1428 for_each_worker_pool(pool, gcwq)
1429 list_for_each_entry(worker, &pool->idle_list, entry)
1430 worker->flags &= ~WORKER_REBIND;
1431
1432 wake_up_all(&gcwq->rebind_hold);
1433
1434 /* rebind busy workers */
1435 for_each_busy_worker(worker, i, pos, gcwq) {
1436 struct work_struct *rebind_work = &worker->rebind_work;
1437
1438 /* morph UNBOUND to REBIND */
1439 worker->flags &= ~WORKER_UNBOUND;
1440 worker->flags |= WORKER_REBIND;
1441
1442 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1443 work_data_bits(rebind_work)))
1444 continue;
1445
1446 /* wq doesn't matter, use the default one */
1447 debug_work_activate(rebind_work);
1448 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1449 worker->scheduled.next,
1450 work_color_to_flags(WORK_NO_COLOR));
1451 }
1452}
1453
1328static struct worker *alloc_worker(void) 1454static struct worker *alloc_worker(void)
1329{ 1455{
1330 struct worker *worker; 1456 struct worker *worker;
@@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void)
1333 if (worker) { 1459 if (worker) {
1334 INIT_LIST_HEAD(&worker->entry); 1460 INIT_LIST_HEAD(&worker->entry);
1335 INIT_LIST_HEAD(&worker->scheduled); 1461 INIT_LIST_HEAD(&worker->scheduled);
1336 INIT_WORK(&worker->rebind_work, worker_rebind_fn); 1462 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1337 /* on creation a worker is in !idle && prep state */ 1463 /* on creation a worker is in !idle && prep state */
1338 worker->flags = WORKER_PREP; 1464 worker->flags = WORKER_PREP;
1339 } 1465 }
@@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void)
1342 1468
1343/** 1469/**
1344 * create_worker - create a new workqueue worker 1470 * create_worker - create a new workqueue worker
1345 * @gcwq: gcwq the new worker will belong to 1471 * @pool: pool the new worker will belong to
1346 * @bind: whether to set affinity to @cpu or not
1347 * 1472 *
1348 * Create a new worker which is bound to @gcwq. The returned worker 1473 * Create a new worker which is bound to @pool. The returned worker
1349 * can be started by calling start_worker() or destroyed using 1474 * can be started by calling start_worker() or destroyed using
1350 * destroy_worker(). 1475 * destroy_worker().
1351 * 1476 *
@@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void)
1355 * RETURNS: 1480 * RETURNS:
1356 * Pointer to the newly created worker. 1481 * Pointer to the newly created worker.
1357 */ 1482 */
1358static struct worker *create_worker(struct global_cwq *gcwq, bool bind) 1483static struct worker *create_worker(struct worker_pool *pool)
1359{ 1484{
1360 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; 1485 struct global_cwq *gcwq = pool->gcwq;
1486 const char *pri = worker_pool_pri(pool) ? "H" : "";
1361 struct worker *worker = NULL; 1487 struct worker *worker = NULL;
1362 int id = -1; 1488 int id = -1;
1363 1489
1364 spin_lock_irq(&gcwq->lock); 1490 spin_lock_irq(&gcwq->lock);
1365 while (ida_get_new(&gcwq->worker_ida, &id)) { 1491 while (ida_get_new(&pool->worker_ida, &id)) {
1366 spin_unlock_irq(&gcwq->lock); 1492 spin_unlock_irq(&gcwq->lock);
1367 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) 1493 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1368 goto fail; 1494 goto fail;
1369 spin_lock_irq(&gcwq->lock); 1495 spin_lock_irq(&gcwq->lock);
1370 } 1496 }
@@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1374 if (!worker) 1500 if (!worker)
1375 goto fail; 1501 goto fail;
1376 1502
1377 worker->gcwq = gcwq; 1503 worker->pool = pool;
1378 worker->id = id; 1504 worker->id = id;
1379 1505
1380 if (!on_unbound_cpu) 1506 if (gcwq->cpu != WORK_CPU_UNBOUND)
1381 worker->task = kthread_create_on_node(worker_thread, 1507 worker->task = kthread_create_on_node(worker_thread,
1382 worker, 1508 worker, cpu_to_node(gcwq->cpu),
1383 cpu_to_node(gcwq->cpu), 1509 "kworker/%u:%d%s", gcwq->cpu, id, pri);
1384 "kworker/%u:%d", gcwq->cpu, id);
1385 else 1510 else
1386 worker->task = kthread_create(worker_thread, worker, 1511 worker->task = kthread_create(worker_thread, worker,
1387 "kworker/u:%d", id); 1512 "kworker/u:%d%s", id, pri);
1388 if (IS_ERR(worker->task)) 1513 if (IS_ERR(worker->task))
1389 goto fail; 1514 goto fail;
1390 1515
1516 if (worker_pool_pri(pool))
1517 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1518
1391 /* 1519 /*
1392 * A rogue worker will become a regular one if CPU comes 1520 * Determine CPU binding of the new worker depending on
1393 * online later on. Make sure every worker has 1521 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
1394 * PF_THREAD_BOUND set. 1522 * flag remains stable across this function. See the comments
1523 * above the flag definition for details.
1524 *
1525 * As an unbound worker may later become a regular one if CPU comes
1526 * online, make sure every worker has %PF_THREAD_BOUND set.
1395 */ 1527 */
1396 if (bind && !on_unbound_cpu) 1528 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
1397 kthread_bind(worker->task, gcwq->cpu); 1529 kthread_bind(worker->task, gcwq->cpu);
1398 else { 1530 } else {
1399 worker->task->flags |= PF_THREAD_BOUND; 1531 worker->task->flags |= PF_THREAD_BOUND;
1400 if (on_unbound_cpu) 1532 worker->flags |= WORKER_UNBOUND;
1401 worker->flags |= WORKER_UNBOUND;
1402 } 1533 }
1403 1534
1404 return worker; 1535 return worker;
1405fail: 1536fail:
1406 if (id >= 0) { 1537 if (id >= 0) {
1407 spin_lock_irq(&gcwq->lock); 1538 spin_lock_irq(&gcwq->lock);
1408 ida_remove(&gcwq->worker_ida, id); 1539 ida_remove(&pool->worker_ida, id);
1409 spin_unlock_irq(&gcwq->lock); 1540 spin_unlock_irq(&gcwq->lock);
1410 } 1541 }
1411 kfree(worker); 1542 kfree(worker);
@@ -1424,7 +1555,7 @@ fail:
1424static void start_worker(struct worker *worker) 1555static void start_worker(struct worker *worker)
1425{ 1556{
1426 worker->flags |= WORKER_STARTED; 1557 worker->flags |= WORKER_STARTED;
1427 worker->gcwq->nr_workers++; 1558 worker->pool->nr_workers++;
1428 worker_enter_idle(worker); 1559 worker_enter_idle(worker);
1429 wake_up_process(worker->task); 1560 wake_up_process(worker->task);
1430} 1561}
@@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker)
1440 */ 1571 */
1441static void destroy_worker(struct worker *worker) 1572static void destroy_worker(struct worker *worker)
1442{ 1573{
1443 struct global_cwq *gcwq = worker->gcwq; 1574 struct worker_pool *pool = worker->pool;
1575 struct global_cwq *gcwq = pool->gcwq;
1444 int id = worker->id; 1576 int id = worker->id;
1445 1577
1446 /* sanity check frenzy */ 1578 /* sanity check frenzy */
@@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker)
1448 BUG_ON(!list_empty(&worker->scheduled)); 1580 BUG_ON(!list_empty(&worker->scheduled));
1449 1581
1450 if (worker->flags & WORKER_STARTED) 1582 if (worker->flags & WORKER_STARTED)
1451 gcwq->nr_workers--; 1583 pool->nr_workers--;
1452 if (worker->flags & WORKER_IDLE) 1584 if (worker->flags & WORKER_IDLE)
1453 gcwq->nr_idle--; 1585 pool->nr_idle--;
1454 1586
1455 list_del_init(&worker->entry); 1587 list_del_init(&worker->entry);
1456 worker->flags |= WORKER_DIE; 1588 worker->flags |= WORKER_DIE;
@@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker)
1461 kfree(worker); 1593 kfree(worker);
1462 1594
1463 spin_lock_irq(&gcwq->lock); 1595 spin_lock_irq(&gcwq->lock);
1464 ida_remove(&gcwq->worker_ida, id); 1596 ida_remove(&pool->worker_ida, id);
1465} 1597}
1466 1598
1467static void idle_worker_timeout(unsigned long __gcwq) 1599static void idle_worker_timeout(unsigned long __pool)
1468{ 1600{
1469 struct global_cwq *gcwq = (void *)__gcwq; 1601 struct worker_pool *pool = (void *)__pool;
1602 struct global_cwq *gcwq = pool->gcwq;
1470 1603
1471 spin_lock_irq(&gcwq->lock); 1604 spin_lock_irq(&gcwq->lock);
1472 1605
1473 if (too_many_workers(gcwq)) { 1606 if (too_many_workers(pool)) {
1474 struct worker *worker; 1607 struct worker *worker;
1475 unsigned long expires; 1608 unsigned long expires;
1476 1609
1477 /* idle_list is kept in LIFO order, check the last one */ 1610 /* idle_list is kept in LIFO order, check the last one */
1478 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1611 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1479 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1612 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1480 1613
1481 if (time_before(jiffies, expires)) 1614 if (time_before(jiffies, expires))
1482 mod_timer(&gcwq->idle_timer, expires); 1615 mod_timer(&pool->idle_timer, expires);
1483 else { 1616 else {
1484 /* it's been idle for too long, wake up manager */ 1617 /* it's been idle for too long, wake up manager */
1485 gcwq->flags |= GCWQ_MANAGE_WORKERS; 1618 pool->flags |= POOL_MANAGE_WORKERS;
1486 wake_up_worker(gcwq); 1619 wake_up_worker(pool);
1487 } 1620 }
1488 } 1621 }
1489 1622
@@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work)
1500 return false; 1633 return false;
1501 1634
1502 /* mayday mayday mayday */ 1635 /* mayday mayday mayday */
1503 cpu = cwq->gcwq->cpu; 1636 cpu = cwq->pool->gcwq->cpu;
1504 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1637 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1505 if (cpu == WORK_CPU_UNBOUND) 1638 if (cpu == WORK_CPU_UNBOUND)
1506 cpu = 0; 1639 cpu = 0;
@@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work)
1509 return true; 1642 return true;
1510} 1643}
1511 1644
1512static void gcwq_mayday_timeout(unsigned long __gcwq) 1645static void gcwq_mayday_timeout(unsigned long __pool)
1513{ 1646{
1514 struct global_cwq *gcwq = (void *)__gcwq; 1647 struct worker_pool *pool = (void *)__pool;
1648 struct global_cwq *gcwq = pool->gcwq;
1515 struct work_struct *work; 1649 struct work_struct *work;
1516 1650
1517 spin_lock_irq(&gcwq->lock); 1651 spin_lock_irq(&gcwq->lock);
1518 1652
1519 if (need_to_create_worker(gcwq)) { 1653 if (need_to_create_worker(pool)) {
1520 /* 1654 /*
1521 * We've been trying to create a new worker but 1655 * We've been trying to create a new worker but
1522 * haven't been successful. We might be hitting an 1656 * haven't been successful. We might be hitting an
1523 * allocation deadlock. Send distress signals to 1657 * allocation deadlock. Send distress signals to
1524 * rescuers. 1658 * rescuers.
1525 */ 1659 */
1526 list_for_each_entry(work, &gcwq->worklist, entry) 1660 list_for_each_entry(work, &pool->worklist, entry)
1527 send_mayday(work); 1661 send_mayday(work);
1528 } 1662 }
1529 1663
1530 spin_unlock_irq(&gcwq->lock); 1664 spin_unlock_irq(&gcwq->lock);
1531 1665
1532 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); 1666 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1533} 1667}
1534 1668
1535/** 1669/**
1536 * maybe_create_worker - create a new worker if necessary 1670 * maybe_create_worker - create a new worker if necessary
1537 * @gcwq: gcwq to create a new worker for 1671 * @pool: pool to create a new worker for
1538 * 1672 *
1539 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to 1673 * Create a new worker for @pool if necessary. @pool is guaranteed to
1540 * have at least one idle worker on return from this function. If 1674 * have at least one idle worker on return from this function. If
1541 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 1675 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1542 * sent to all rescuers with works scheduled on @gcwq to resolve 1676 * sent to all rescuers with works scheduled on @pool to resolve
1543 * possible allocation deadlock. 1677 * possible allocation deadlock.
1544 * 1678 *
1545 * On return, need_to_create_worker() is guaranteed to be false and 1679 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
1554 * false if no action was taken and gcwq->lock stayed locked, true 1688 * false if no action was taken and gcwq->lock stayed locked, true
1555 * otherwise. 1689 * otherwise.
1556 */ 1690 */
1557static bool maybe_create_worker(struct global_cwq *gcwq) 1691static bool maybe_create_worker(struct worker_pool *pool)
1558__releases(&gcwq->lock) 1692__releases(&gcwq->lock)
1559__acquires(&gcwq->lock) 1693__acquires(&gcwq->lock)
1560{ 1694{
1561 if (!need_to_create_worker(gcwq)) 1695 struct global_cwq *gcwq = pool->gcwq;
1696
1697 if (!need_to_create_worker(pool))
1562 return false; 1698 return false;
1563restart: 1699restart:
1564 spin_unlock_irq(&gcwq->lock); 1700 spin_unlock_irq(&gcwq->lock);
1565 1701
1566 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1702 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1567 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1703 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1568 1704
1569 while (true) { 1705 while (true) {
1570 struct worker *worker; 1706 struct worker *worker;
1571 1707
1572 worker = create_worker(gcwq, true); 1708 worker = create_worker(pool);
1573 if (worker) { 1709 if (worker) {
1574 del_timer_sync(&gcwq->mayday_timer); 1710 del_timer_sync(&pool->mayday_timer);
1575 spin_lock_irq(&gcwq->lock); 1711 spin_lock_irq(&gcwq->lock);
1576 start_worker(worker); 1712 start_worker(worker);
1577 BUG_ON(need_to_create_worker(gcwq)); 1713 BUG_ON(need_to_create_worker(pool));
1578 return true; 1714 return true;
1579 } 1715 }
1580 1716
1581 if (!need_to_create_worker(gcwq)) 1717 if (!need_to_create_worker(pool))
1582 break; 1718 break;
1583 1719
1584 __set_current_state(TASK_INTERRUPTIBLE); 1720 __set_current_state(TASK_INTERRUPTIBLE);
1585 schedule_timeout(CREATE_COOLDOWN); 1721 schedule_timeout(CREATE_COOLDOWN);
1586 1722
1587 if (!need_to_create_worker(gcwq)) 1723 if (!need_to_create_worker(pool))
1588 break; 1724 break;
1589 } 1725 }
1590 1726
1591 del_timer_sync(&gcwq->mayday_timer); 1727 del_timer_sync(&pool->mayday_timer);
1592 spin_lock_irq(&gcwq->lock); 1728 spin_lock_irq(&gcwq->lock);
1593 if (need_to_create_worker(gcwq)) 1729 if (need_to_create_worker(pool))
1594 goto restart; 1730 goto restart;
1595 return true; 1731 return true;
1596} 1732}
1597 1733
1598/** 1734/**
1599 * maybe_destroy_worker - destroy workers which have been idle for a while 1735 * maybe_destroy_worker - destroy workers which have been idle for a while
1600 * @gcwq: gcwq to destroy workers for 1736 * @pool: pool to destroy workers for
1601 * 1737 *
1602 * Destroy @gcwq workers which have been idle for longer than 1738 * Destroy @pool workers which have been idle for longer than
1603 * IDLE_WORKER_TIMEOUT. 1739 * IDLE_WORKER_TIMEOUT.
1604 * 1740 *
1605 * LOCKING: 1741 * LOCKING:
@@ -1610,19 +1746,19 @@ restart:
1610 * false if no action was taken and gcwq->lock stayed locked, true 1746 * false if no action was taken and gcwq->lock stayed locked, true
1611 * otherwise. 1747 * otherwise.
1612 */ 1748 */
1613static bool maybe_destroy_workers(struct global_cwq *gcwq) 1749static bool maybe_destroy_workers(struct worker_pool *pool)
1614{ 1750{
1615 bool ret = false; 1751 bool ret = false;
1616 1752
1617 while (too_many_workers(gcwq)) { 1753 while (too_many_workers(pool)) {
1618 struct worker *worker; 1754 struct worker *worker;
1619 unsigned long expires; 1755 unsigned long expires;
1620 1756
1621 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1757 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1622 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1758 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1623 1759
1624 if (time_before(jiffies, expires)) { 1760 if (time_before(jiffies, expires)) {
1625 mod_timer(&gcwq->idle_timer, expires); 1761 mod_timer(&pool->idle_timer, expires);
1626 break; 1762 break;
1627 } 1763 }
1628 1764
@@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
1655 */ 1791 */
1656static bool manage_workers(struct worker *worker) 1792static bool manage_workers(struct worker *worker)
1657{ 1793{
1658 struct global_cwq *gcwq = worker->gcwq; 1794 struct worker_pool *pool = worker->pool;
1659 bool ret = false; 1795 bool ret = false;
1660 1796
1661 if (gcwq->flags & GCWQ_MANAGING_WORKERS) 1797 if (!mutex_trylock(&pool->manager_mutex))
1662 return ret; 1798 return ret;
1663 1799
1664 gcwq->flags &= ~GCWQ_MANAGE_WORKERS; 1800 pool->flags &= ~POOL_MANAGE_WORKERS;
1665 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1666 1801
1667 /* 1802 /*
1668 * Destroy and then create so that may_start_working() is true 1803 * Destroy and then create so that may_start_working() is true
1669 * on return. 1804 * on return.
1670 */ 1805 */
1671 ret |= maybe_destroy_workers(gcwq); 1806 ret |= maybe_destroy_workers(pool);
1672 ret |= maybe_create_worker(gcwq); 1807 ret |= maybe_create_worker(pool);
1673
1674 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1675
1676 /*
1677 * The trustee might be waiting to take over the manager
1678 * position, tell it we're done.
1679 */
1680 if (unlikely(gcwq->trustee))
1681 wake_up_all(&gcwq->trustee_wait);
1682 1808
1809 mutex_unlock(&pool->manager_mutex);
1683 return ret; 1810 return ret;
1684} 1811}
1685 1812
@@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1728{ 1855{
1729 struct work_struct *work = list_first_entry(&cwq->delayed_works, 1856 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1730 struct work_struct, entry); 1857 struct work_struct, entry);
1731 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1732 1858
1733 trace_workqueue_activate_work(work); 1859 trace_workqueue_activate_work(work);
1734 move_linked_works(work, pos, NULL); 1860 move_linked_works(work, &cwq->pool->worklist, NULL);
1735 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1861 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1736 cwq->nr_active++; 1862 cwq->nr_active++;
1737} 1863}
@@ -1804,7 +1930,8 @@ __releases(&gcwq->lock)
1804__acquires(&gcwq->lock) 1930__acquires(&gcwq->lock)
1805{ 1931{
1806 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1932 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1807 struct global_cwq *gcwq = cwq->gcwq; 1933 struct worker_pool *pool = worker->pool;
1934 struct global_cwq *gcwq = pool->gcwq;
1808 struct hlist_head *bwh = busy_worker_head(gcwq, work); 1935 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1809 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; 1936 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1810 work_func_t f = work->func; 1937 work_func_t f = work->func;
@@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock)
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 1950 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1824#endif 1951#endif
1825 /* 1952 /*
1953 * Ensure we're on the correct CPU. DISASSOCIATED test is
1954 * necessary to avoid spurious warnings from rescuers servicing the
1955 * unbound or a disassociated gcwq.
1956 */
1957 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
1958 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
1959 raw_smp_processor_id() != gcwq->cpu);
1960
1961 /*
1826 * A single work shouldn't be executed concurrently by 1962 * A single work shouldn't be executed concurrently by
1827 * multiple workers on a single cpu. Check whether anyone is 1963 * multiple workers on a single cpu. Check whether anyone is
1828 * already processing the work. If so, defer the work to the 1964 * already processing the work. If so, defer the work to the
@@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock)
1846 list_del_init(&work->entry); 1982 list_del_init(&work->entry);
1847 1983
1848 /* 1984 /*
1849 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1850 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1851 */
1852 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1853 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1854 struct work_struct, entry);
1855
1856 if (!list_empty(&gcwq->worklist) &&
1857 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1858 wake_up_worker(gcwq);
1859 else
1860 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1861 }
1862
1863 /*
1864 * CPU intensive works don't participate in concurrency 1985 * CPU intensive works don't participate in concurrency
1865 * management. They're the scheduler's responsibility. 1986 * management. They're the scheduler's responsibility.
1866 */ 1987 */
1867 if (unlikely(cpu_intensive)) 1988 if (unlikely(cpu_intensive))
1868 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1989 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1869 1990
1991 /*
1992 * Unbound gcwq isn't concurrency managed and work items should be
1993 * executed ASAP. Wake up another worker if necessary.
1994 */
1995 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
1996 wake_up_worker(pool);
1997
1870 spin_unlock_irq(&gcwq->lock); 1998 spin_unlock_irq(&gcwq->lock);
1871 1999
1872 work_clear_pending(work); 2000 work_clear_pending(work);
@@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker)
1939static int worker_thread(void *__worker) 2067static int worker_thread(void *__worker)
1940{ 2068{
1941 struct worker *worker = __worker; 2069 struct worker *worker = __worker;
1942 struct global_cwq *gcwq = worker->gcwq; 2070 struct worker_pool *pool = worker->pool;
2071 struct global_cwq *gcwq = pool->gcwq;
1943 2072
1944 /* tell the scheduler that this is a workqueue worker */ 2073 /* tell the scheduler that this is a workqueue worker */
1945 worker->task->flags |= PF_WQ_WORKER; 2074 worker->task->flags |= PF_WQ_WORKER;
1946woke_up: 2075woke_up:
1947 spin_lock_irq(&gcwq->lock); 2076 spin_lock_irq(&gcwq->lock);
1948 2077
1949 /* DIE can be set only while we're idle, checking here is enough */ 2078 /*
1950 if (worker->flags & WORKER_DIE) { 2079 * DIE can be set only while idle and REBIND set while busy has
2080 * @worker->rebind_work scheduled. Checking here is enough.
2081 */
2082 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
1951 spin_unlock_irq(&gcwq->lock); 2083 spin_unlock_irq(&gcwq->lock);
1952 worker->task->flags &= ~PF_WQ_WORKER; 2084
1953 return 0; 2085 if (worker->flags & WORKER_DIE) {
2086 worker->task->flags &= ~PF_WQ_WORKER;
2087 return 0;
2088 }
2089
2090 idle_worker_rebind(worker);
2091 goto woke_up;
1954 } 2092 }
1955 2093
1956 worker_leave_idle(worker); 2094 worker_leave_idle(worker);
1957recheck: 2095recheck:
1958 /* no more worker necessary? */ 2096 /* no more worker necessary? */
1959 if (!need_more_worker(gcwq)) 2097 if (!need_more_worker(pool))
1960 goto sleep; 2098 goto sleep;
1961 2099
1962 /* do we need to manage? */ 2100 /* do we need to manage? */
1963 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) 2101 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
1964 goto recheck; 2102 goto recheck;
1965 2103
1966 /* 2104 /*
@@ -1979,7 +2117,7 @@ recheck:
1979 2117
1980 do { 2118 do {
1981 struct work_struct *work = 2119 struct work_struct *work =
1982 list_first_entry(&gcwq->worklist, 2120 list_first_entry(&pool->worklist,
1983 struct work_struct, entry); 2121 struct work_struct, entry);
1984 2122
1985 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 2123 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2129,11 @@ recheck:
1991 move_linked_works(work, &worker->scheduled, NULL); 2129 move_linked_works(work, &worker->scheduled, NULL);
1992 process_scheduled_works(worker); 2130 process_scheduled_works(worker);
1993 } 2131 }
1994 } while (keep_working(gcwq)); 2132 } while (keep_working(pool));
1995 2133
1996 worker_set_flags(worker, WORKER_PREP, false); 2134 worker_set_flags(worker, WORKER_PREP, false);
1997sleep: 2135sleep:
1998 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) 2136 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
1999 goto recheck; 2137 goto recheck;
2000 2138
2001 /* 2139 /*
@@ -2053,14 +2191,15 @@ repeat:
2053 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2191 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2054 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2192 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2055 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2193 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2056 struct global_cwq *gcwq = cwq->gcwq; 2194 struct worker_pool *pool = cwq->pool;
2195 struct global_cwq *gcwq = pool->gcwq;
2057 struct work_struct *work, *n; 2196 struct work_struct *work, *n;
2058 2197
2059 __set_current_state(TASK_RUNNING); 2198 __set_current_state(TASK_RUNNING);
2060 mayday_clear_cpu(cpu, wq->mayday_mask); 2199 mayday_clear_cpu(cpu, wq->mayday_mask);
2061 2200
2062 /* migrate to the target cpu if possible */ 2201 /* migrate to the target cpu if possible */
2063 rescuer->gcwq = gcwq; 2202 rescuer->pool = pool;
2064 worker_maybe_bind_and_lock(rescuer); 2203 worker_maybe_bind_and_lock(rescuer);
2065 2204
2066 /* 2205 /*
@@ -2068,7 +2207,7 @@ repeat:
2068 * process'em. 2207 * process'em.
2069 */ 2208 */
2070 BUG_ON(!list_empty(&rescuer->scheduled)); 2209 BUG_ON(!list_empty(&rescuer->scheduled));
2071 list_for_each_entry_safe(work, n, &gcwq->worklist, entry) 2210 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2072 if (get_work_cwq(work) == cwq) 2211 if (get_work_cwq(work) == cwq)
2073 move_linked_works(work, scheduled, &n); 2212 move_linked_works(work, scheduled, &n);
2074 2213
@@ -2079,8 +2218,8 @@ repeat:
2079 * regular worker; otherwise, we end up with 0 concurrency 2218 * regular worker; otherwise, we end up with 0 concurrency
2080 * and stalling the execution. 2219 * and stalling the execution.
2081 */ 2220 */
2082 if (keep_working(gcwq)) 2221 if (keep_working(pool))
2083 wake_up_worker(gcwq); 2222 wake_up_worker(pool);
2084 2223
2085 spin_unlock_irq(&gcwq->lock); 2224 spin_unlock_irq(&gcwq->lock);
2086 } 2225 }
@@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2205 2344
2206 for_each_cwq_cpu(cpu, wq) { 2345 for_each_cwq_cpu(cpu, wq) {
2207 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2346 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2208 struct global_cwq *gcwq = cwq->gcwq; 2347 struct global_cwq *gcwq = cwq->pool->gcwq;
2209 2348
2210 spin_lock_irq(&gcwq->lock); 2349 spin_lock_irq(&gcwq->lock);
2211 2350
@@ -2421,9 +2560,9 @@ reflush:
2421 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2560 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2422 bool drained; 2561 bool drained;
2423 2562
2424 spin_lock_irq(&cwq->gcwq->lock); 2563 spin_lock_irq(&cwq->pool->gcwq->lock);
2425 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2564 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2426 spin_unlock_irq(&cwq->gcwq->lock); 2565 spin_unlock_irq(&cwq->pool->gcwq->lock);
2427 2566
2428 if (drained) 2567 if (drained)
2429 continue; 2568 continue;
@@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2463 */ 2602 */
2464 smp_rmb(); 2603 smp_rmb();
2465 cwq = get_work_cwq(work); 2604 cwq = get_work_cwq(work);
2466 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2605 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2467 goto already_gone; 2606 goto already_gone;
2468 } else if (wait_executing) { 2607 } else if (wait_executing) {
2469 worker = find_worker_executing_work(gcwq, work); 2608 worker = find_worker_executing_work(gcwq, work);
@@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2984 if (flags & WQ_MEM_RECLAIM) 3123 if (flags & WQ_MEM_RECLAIM)
2985 flags |= WQ_RESCUER; 3124 flags |= WQ_RESCUER;
2986 3125
2987 /*
2988 * Unbound workqueues aren't concurrency managed and should be
2989 * dispatched to workers immediately.
2990 */
2991 if (flags & WQ_UNBOUND)
2992 flags |= WQ_HIGHPRI;
2993
2994 max_active = max_active ?: WQ_DFL_ACTIVE; 3126 max_active = max_active ?: WQ_DFL_ACTIVE;
2995 max_active = wq_clamp_max_active(max_active, flags, wq->name); 3127 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2996 3128
@@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3011 for_each_cwq_cpu(cpu, wq) { 3143 for_each_cwq_cpu(cpu, wq) {
3012 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3144 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3013 struct global_cwq *gcwq = get_gcwq(cpu); 3145 struct global_cwq *gcwq = get_gcwq(cpu);
3146 int pool_idx = (bool)(flags & WQ_HIGHPRI);
3014 3147
3015 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3148 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3016 cwq->gcwq = gcwq; 3149 cwq->pool = &gcwq->pools[pool_idx];
3017 cwq->wq = wq; 3150 cwq->wq = wq;
3018 cwq->flush_color = -1; 3151 cwq->flush_color = -1;
3019 cwq->max_active = max_active; 3152 cwq->max_active = max_active;
@@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy);
3225 * gcwqs serve mix of short, long and very long running works making 3358 * gcwqs serve mix of short, long and very long running works making
3226 * blocked draining impractical. 3359 * blocked draining impractical.
3227 * 3360 *
3228 * This is solved by allowing a gcwq to be detached from CPU, running 3361 * This is solved by allowing a gcwq to be disassociated from the CPU
3229 * it with unbound (rogue) workers and allowing it to be reattached 3362 * running as an unbound one and allowing it to be reattached later if the
3230 * later if the cpu comes back online. A separate thread is created 3363 * cpu comes back online.
3231 * to govern a gcwq in such state and is called the trustee of the
3232 * gcwq.
3233 *
3234 * Trustee states and their descriptions.
3235 *
3236 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3237 * new trustee is started with this state.
3238 *
3239 * IN_CHARGE Once started, trustee will enter this state after
3240 * assuming the manager role and making all existing
3241 * workers rogue. DOWN_PREPARE waits for trustee to
3242 * enter this state. After reaching IN_CHARGE, trustee
3243 * tries to execute the pending worklist until it's empty
3244 * and the state is set to BUTCHER, or the state is set
3245 * to RELEASE.
3246 *
3247 * BUTCHER Command state which is set by the cpu callback after
3248 * the cpu has went down. Once this state is set trustee
3249 * knows that there will be no new works on the worklist
3250 * and once the worklist is empty it can proceed to
3251 * killing idle workers.
3252 *
3253 * RELEASE Command state which is set by the cpu callback if the
3254 * cpu down has been canceled or it has come online
3255 * again. After recognizing this state, trustee stops
3256 * trying to drain or butcher and clears ROGUE, rebinds
3257 * all remaining workers back to the cpu and releases
3258 * manager role.
3259 *
3260 * DONE Trustee will enter this state after BUTCHER or RELEASE
3261 * is complete.
3262 *
3263 * trustee CPU draining
3264 * took over down complete
3265 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3266 * | | ^
3267 * | CPU is back online v return workers |
3268 * ----------------> RELEASE --------------
3269 */ 3364 */
3270 3365
3271/** 3366/* claim manager positions of all pools */
3272 * trustee_wait_event_timeout - timed event wait for trustee 3367static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
3273 * @cond: condition to wait for
3274 * @timeout: timeout in jiffies
3275 *
3276 * wait_event_timeout() for trustee to use. Handles locking and
3277 * checks for RELEASE request.
3278 *
3279 * CONTEXT:
3280 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3281 * multiple times. To be used by trustee.
3282 *
3283 * RETURNS:
3284 * Positive indicating left time if @cond is satisfied, 0 if timed
3285 * out, -1 if canceled.
3286 */
3287#define trustee_wait_event_timeout(cond, timeout) ({ \
3288 long __ret = (timeout); \
3289 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3290 __ret) { \
3291 spin_unlock_irq(&gcwq->lock); \
3292 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3293 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3294 __ret); \
3295 spin_lock_irq(&gcwq->lock); \
3296 } \
3297 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3298})
3299
3300/**
3301 * trustee_wait_event - event wait for trustee
3302 * @cond: condition to wait for
3303 *
3304 * wait_event() for trustee to use. Automatically handles locking and
3305 * checks for CANCEL request.
3306 *
3307 * CONTEXT:
3308 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3309 * multiple times. To be used by trustee.
3310 *
3311 * RETURNS:
3312 * 0 if @cond is satisfied, -1 if canceled.
3313 */
3314#define trustee_wait_event(cond) ({ \
3315 long __ret1; \
3316 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3317 __ret1 < 0 ? -1 : 0; \
3318})
3319
3320static int __cpuinit trustee_thread(void *__gcwq)
3321{ 3368{
3322 struct global_cwq *gcwq = __gcwq; 3369 struct worker_pool *pool;
3323 struct worker *worker;
3324 struct work_struct *work;
3325 struct hlist_node *pos;
3326 long rc;
3327 int i;
3328
3329 BUG_ON(gcwq->cpu != smp_processor_id());
3330 3370
3371 for_each_worker_pool(pool, gcwq)
3372 mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
3331 spin_lock_irq(&gcwq->lock); 3373 spin_lock_irq(&gcwq->lock);
3332 /* 3374}
3333 * Claim the manager position and make all workers rogue.
3334 * Trustee must be bound to the target cpu and can't be
3335 * cancelled.
3336 */
3337 BUG_ON(gcwq->cpu != smp_processor_id());
3338 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3339 BUG_ON(rc < 0);
3340
3341 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3342
3343 list_for_each_entry(worker, &gcwq->idle_list, entry)
3344 worker->flags |= WORKER_ROGUE;
3345 3375
3346 for_each_busy_worker(worker, i, pos, gcwq) 3376/* release manager positions */
3347 worker->flags |= WORKER_ROGUE; 3377static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
3378{
3379 struct worker_pool *pool;
3348 3380
3349 /*
3350 * Call schedule() so that we cross rq->lock and thus can
3351 * guarantee sched callbacks see the rogue flag. This is
3352 * necessary as scheduler callbacks may be invoked from other
3353 * cpus.
3354 */
3355 spin_unlock_irq(&gcwq->lock); 3381 spin_unlock_irq(&gcwq->lock);
3356 schedule(); 3382 for_each_worker_pool(pool, gcwq)
3357 spin_lock_irq(&gcwq->lock); 3383 mutex_unlock(&pool->manager_mutex);
3384}
3358 3385
3359 /* 3386static void gcwq_unbind_fn(struct work_struct *work)
3360 * Sched callbacks are disabled now. Zap nr_running. After 3387{
3361 * this, nr_running stays zero and need_more_worker() and 3388 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3362 * keep_working() are always true as long as the worklist is 3389 struct worker_pool *pool;
3363 * not empty. 3390 struct worker *worker;
3364 */ 3391 struct hlist_node *pos;
3365 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); 3392 int i;
3366 3393
3367 spin_unlock_irq(&gcwq->lock); 3394 BUG_ON(gcwq->cpu != smp_processor_id());
3368 del_timer_sync(&gcwq->idle_timer);
3369 spin_lock_irq(&gcwq->lock);
3370 3395
3371 /* 3396 gcwq_claim_management_and_lock(gcwq);
3372 * We're now in charge. Notify and proceed to drain. We need
3373 * to keep the gcwq running during the whole CPU down
3374 * procedure as other cpu hotunplug callbacks may need to
3375 * flush currently running tasks.
3376 */
3377 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3378 wake_up_all(&gcwq->trustee_wait);
3379 3397
3380 /* 3398 /*
3381 * The original cpu is in the process of dying and may go away 3399 * We've claimed all manager positions. Make all workers unbound
3382 * anytime now. When that happens, we and all workers would 3400 * and set DISASSOCIATED. Before this, all workers except for the
3383 * be migrated to other cpus. Try draining any left work. We 3401 * ones which are still executing works from before the last CPU
3384 * want to get it over with ASAP - spam rescuers, wake up as 3402 * down must be on the cpu. After this, they may become diasporas.
3385 * many idlers as necessary and create new ones till the
3386 * worklist is empty. Note that if the gcwq is frozen, there
3387 * may be frozen works in freezable cwqs. Don't declare
3388 * completion while frozen.
3389 */ 3403 */
3390 while (gcwq->nr_workers != gcwq->nr_idle || 3404 for_each_worker_pool(pool, gcwq)
3391 gcwq->flags & GCWQ_FREEZING || 3405 list_for_each_entry(worker, &pool->idle_list, entry)
3392 gcwq->trustee_state == TRUSTEE_IN_CHARGE) { 3406 worker->flags |= WORKER_UNBOUND;
3393 int nr_works = 0;
3394
3395 list_for_each_entry(work, &gcwq->worklist, entry) {
3396 send_mayday(work);
3397 nr_works++;
3398 }
3399 3407
3400 list_for_each_entry(worker, &gcwq->idle_list, entry) { 3408 for_each_busy_worker(worker, i, pos, gcwq)
3401 if (!nr_works--) 3409 worker->flags |= WORKER_UNBOUND;
3402 break;
3403 wake_up_process(worker->task);
3404 }
3405 3410
3406 if (need_to_create_worker(gcwq)) { 3411 gcwq->flags |= GCWQ_DISASSOCIATED;
3407 spin_unlock_irq(&gcwq->lock);
3408 worker = create_worker(gcwq, false);
3409 spin_lock_irq(&gcwq->lock);
3410 if (worker) {
3411 worker->flags |= WORKER_ROGUE;
3412 start_worker(worker);
3413 }
3414 }
3415 3412
3416 /* give a breather */ 3413 gcwq_release_management_and_unlock(gcwq);
3417 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3418 break;
3419 }
3420 3414
3421 /* 3415 /*
3422 * Either all works have been scheduled and cpu is down, or 3416 * Call schedule() so that we cross rq->lock and thus can guarantee
3423 * cpu down has already been canceled. Wait for and butcher 3417 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
3424 * all workers till we're canceled. 3418 * as scheduler callbacks may be invoked from other cpus.
3425 */ 3419 */
3426 do { 3420 schedule();
3427 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3428 while (!list_empty(&gcwq->idle_list))
3429 destroy_worker(list_first_entry(&gcwq->idle_list,
3430 struct worker, entry));
3431 } while (gcwq->nr_workers && rc >= 0);
3432 3421
3433 /* 3422 /*
3434 * At this point, either draining has completed and no worker 3423 * Sched callbacks are disabled now. Zap nr_running. After this,
3435 * is left, or cpu down has been canceled or the cpu is being 3424 * nr_running stays zero and need_more_worker() and keep_working()
3436 * brought back up. There shouldn't be any idle one left. 3425 * are always true as long as the worklist is not empty. @gcwq now
3437 * Tell the remaining busy ones to rebind once it finishes the 3426 * behaves as unbound (in terms of concurrency management) gcwq
3438 * currently scheduled works by scheduling the rebind_work. 3427 * which is served by workers tied to the CPU.
3428 *
3429 * On return from this function, the current worker would trigger
3430 * unbound chain execution of pending work items if other workers
3431 * didn't already.
3439 */ 3432 */
3440 WARN_ON(!list_empty(&gcwq->idle_list)); 3433 for_each_worker_pool(pool, gcwq)
3441 3434 atomic_set(get_pool_nr_running(pool), 0);
3442 for_each_busy_worker(worker, i, pos, gcwq) {
3443 struct work_struct *rebind_work = &worker->rebind_work;
3444
3445 /*
3446 * Rebind_work may race with future cpu hotplug
3447 * operations. Use a separate flag to mark that
3448 * rebinding is scheduled.
3449 */
3450 worker->flags |= WORKER_REBIND;
3451 worker->flags &= ~WORKER_ROGUE;
3452
3453 /* queue rebind_work, wq doesn't matter, use the default one */
3454 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3455 work_data_bits(rebind_work)))
3456 continue;
3457
3458 debug_work_activate(rebind_work);
3459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3460 worker->scheduled.next,
3461 work_color_to_flags(WORK_NO_COLOR));
3462 }
3463
3464 /* relinquish manager role */
3465 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3466
3467 /* notify completion */
3468 gcwq->trustee = NULL;
3469 gcwq->trustee_state = TRUSTEE_DONE;
3470 wake_up_all(&gcwq->trustee_wait);
3471 spin_unlock_irq(&gcwq->lock);
3472 return 0;
3473} 3435}
3474 3436
3475/** 3437/*
3476 * wait_trustee_state - wait for trustee to enter the specified state 3438 * Workqueues should be brought up before normal priority CPU notifiers.
3477 * @gcwq: gcwq the trustee of interest belongs to 3439 * This will be registered high priority CPU notifier.
3478 * @state: target state to wait for
3479 *
3480 * Wait for the trustee to reach @state. DONE is already matched.
3481 *
3482 * CONTEXT:
3483 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3484 * multiple times. To be used by cpu_callback.
3485 */ 3440 */
3486static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) 3441static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3487__releases(&gcwq->lock) 3442 unsigned long action,
3488__acquires(&gcwq->lock) 3443 void *hcpu)
3489{
3490 if (!(gcwq->trustee_state == state ||
3491 gcwq->trustee_state == TRUSTEE_DONE)) {
3492 spin_unlock_irq(&gcwq->lock);
3493 __wait_event(gcwq->trustee_wait,
3494 gcwq->trustee_state == state ||
3495 gcwq->trustee_state == TRUSTEE_DONE);
3496 spin_lock_irq(&gcwq->lock);
3497 }
3498}
3499
3500static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3501 unsigned long action,
3502 void *hcpu)
3503{ 3444{
3504 unsigned int cpu = (unsigned long)hcpu; 3445 unsigned int cpu = (unsigned long)hcpu;
3505 struct global_cwq *gcwq = get_gcwq(cpu); 3446 struct global_cwq *gcwq = get_gcwq(cpu);
3506 struct task_struct *new_trustee = NULL; 3447 struct worker_pool *pool;
3507 struct worker *uninitialized_var(new_worker);
3508 unsigned long flags;
3509
3510 action &= ~CPU_TASKS_FROZEN;
3511 3448
3512 switch (action) { 3449 switch (action & ~CPU_TASKS_FROZEN) {
3513 case CPU_DOWN_PREPARE:
3514 new_trustee = kthread_create(trustee_thread, gcwq,
3515 "workqueue_trustee/%d\n", cpu);
3516 if (IS_ERR(new_trustee))
3517 return notifier_from_errno(PTR_ERR(new_trustee));
3518 kthread_bind(new_trustee, cpu);
3519 /* fall through */
3520 case CPU_UP_PREPARE: 3450 case CPU_UP_PREPARE:
3521 BUG_ON(gcwq->first_idle); 3451 for_each_worker_pool(pool, gcwq) {
3522 new_worker = create_worker(gcwq, false); 3452 struct worker *worker;
3523 if (!new_worker) {
3524 if (new_trustee)
3525 kthread_stop(new_trustee);
3526 return NOTIFY_BAD;
3527 }
3528 }
3529
3530 /* some are called w/ irq disabled, don't disturb irq status */
3531 spin_lock_irqsave(&gcwq->lock, flags);
3532 3453
3533 switch (action) { 3454 if (pool->nr_workers)
3534 case CPU_DOWN_PREPARE: 3455 continue;
3535 /* initialize trustee and tell it to acquire the gcwq */
3536 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3537 gcwq->trustee = new_trustee;
3538 gcwq->trustee_state = TRUSTEE_START;
3539 wake_up_process(gcwq->trustee);
3540 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3541 /* fall through */
3542 case CPU_UP_PREPARE:
3543 BUG_ON(gcwq->first_idle);
3544 gcwq->first_idle = new_worker;
3545 break;
3546 3456
3547 case CPU_DYING: 3457 worker = create_worker(pool);
3548 /* 3458 if (!worker)
3549 * Before this, the trustee and all workers except for 3459 return NOTIFY_BAD;
3550 * the ones which are still executing works from
3551 * before the last CPU down must be on the cpu. After
3552 * this, they'll all be diasporas.
3553 */
3554 gcwq->flags |= GCWQ_DISASSOCIATED;
3555 break;
3556 3460
3557 case CPU_POST_DEAD: 3461 spin_lock_irq(&gcwq->lock);
3558 gcwq->trustee_state = TRUSTEE_BUTCHER; 3462 start_worker(worker);
3559 /* fall through */ 3463 spin_unlock_irq(&gcwq->lock);
3560 case CPU_UP_CANCELED: 3464 }
3561 destroy_worker(gcwq->first_idle);
3562 gcwq->first_idle = NULL;
3563 break; 3465 break;
3564 3466
3565 case CPU_DOWN_FAILED: 3467 case CPU_DOWN_FAILED:
3566 case CPU_ONLINE: 3468 case CPU_ONLINE:
3469 gcwq_claim_management_and_lock(gcwq);
3567 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3470 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3568 if (gcwq->trustee_state != TRUSTEE_DONE) { 3471 rebind_workers(gcwq);
3569 gcwq->trustee_state = TRUSTEE_RELEASE; 3472 gcwq_release_management_and_unlock(gcwq);
3570 wake_up_process(gcwq->trustee);
3571 wait_trustee_state(gcwq, TRUSTEE_DONE);
3572 }
3573
3574 /*
3575 * Trustee is done and there might be no worker left.
3576 * Put the first_idle in and request a real manager to
3577 * take a look.
3578 */
3579 spin_unlock_irq(&gcwq->lock);
3580 kthread_bind(gcwq->first_idle->task, cpu);
3581 spin_lock_irq(&gcwq->lock);
3582 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3583 start_worker(gcwq->first_idle);
3584 gcwq->first_idle = NULL;
3585 break; 3473 break;
3586 } 3474 }
3475 return NOTIFY_OK;
3476}
3587 3477
3588 spin_unlock_irqrestore(&gcwq->lock, flags); 3478/*
3479 * Workqueues should be brought down after normal priority CPU notifiers.
3480 * This will be registered as low priority CPU notifier.
3481 */
3482static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3483 unsigned long action,
3484 void *hcpu)
3485{
3486 unsigned int cpu = (unsigned long)hcpu;
3487 struct work_struct unbind_work;
3589 3488
3590 return notifier_from_errno(0); 3489 switch (action & ~CPU_TASKS_FROZEN) {
3490 case CPU_DOWN_PREPARE:
3491 /* unbinding should happen on the local CPU */
3492 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
3493 schedule_work_on(cpu, &unbind_work);
3494 flush_work(&unbind_work);
3495 break;
3496 }
3497 return NOTIFY_OK;
3591} 3498}
3592 3499
3593#ifdef CONFIG_SMP 3500#ifdef CONFIG_SMP
@@ -3746,6 +3653,7 @@ void thaw_workqueues(void)
3746 3653
3747 for_each_gcwq_cpu(cpu) { 3654 for_each_gcwq_cpu(cpu) {
3748 struct global_cwq *gcwq = get_gcwq(cpu); 3655 struct global_cwq *gcwq = get_gcwq(cpu);
3656 struct worker_pool *pool;
3749 struct workqueue_struct *wq; 3657 struct workqueue_struct *wq;
3750 3658
3751 spin_lock_irq(&gcwq->lock); 3659 spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3675,8 @@ void thaw_workqueues(void)
3767 cwq_activate_first_delayed(cwq); 3675 cwq_activate_first_delayed(cwq);
3768 } 3676 }
3769 3677
3770 wake_up_worker(gcwq); 3678 for_each_worker_pool(pool, gcwq)
3679 wake_up_worker(pool);
3771 3680
3772 spin_unlock_irq(&gcwq->lock); 3681 spin_unlock_irq(&gcwq->lock);
3773 } 3682 }
@@ -3783,46 +3692,57 @@ static int __init init_workqueues(void)
3783 unsigned int cpu; 3692 unsigned int cpu;
3784 int i; 3693 int i;
3785 3694
3786 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); 3695 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3696 cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3787 3697
3788 /* initialize gcwqs */ 3698 /* initialize gcwqs */
3789 for_each_gcwq_cpu(cpu) { 3699 for_each_gcwq_cpu(cpu) {
3790 struct global_cwq *gcwq = get_gcwq(cpu); 3700 struct global_cwq *gcwq = get_gcwq(cpu);
3701 struct worker_pool *pool;
3791 3702
3792 spin_lock_init(&gcwq->lock); 3703 spin_lock_init(&gcwq->lock);
3793 INIT_LIST_HEAD(&gcwq->worklist);
3794 gcwq->cpu = cpu; 3704 gcwq->cpu = cpu;
3795 gcwq->flags |= GCWQ_DISASSOCIATED; 3705 gcwq->flags |= GCWQ_DISASSOCIATED;
3796 3706
3797 INIT_LIST_HEAD(&gcwq->idle_list);
3798 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) 3707 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3799 INIT_HLIST_HEAD(&gcwq->busy_hash[i]); 3708 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3800 3709
3801 init_timer_deferrable(&gcwq->idle_timer); 3710 for_each_worker_pool(pool, gcwq) {
3802 gcwq->idle_timer.function = idle_worker_timeout; 3711 pool->gcwq = gcwq;
3803 gcwq->idle_timer.data = (unsigned long)gcwq; 3712 INIT_LIST_HEAD(&pool->worklist);
3713 INIT_LIST_HEAD(&pool->idle_list);
3714
3715 init_timer_deferrable(&pool->idle_timer);
3716 pool->idle_timer.function = idle_worker_timeout;
3717 pool->idle_timer.data = (unsigned long)pool;
3804 3718
3805 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, 3719 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
3806 (unsigned long)gcwq); 3720 (unsigned long)pool);
3807 3721
3808 ida_init(&gcwq->worker_ida); 3722 mutex_init(&pool->manager_mutex);
3723 ida_init(&pool->worker_ida);
3724 }
3809 3725
3810 gcwq->trustee_state = TRUSTEE_DONE; 3726 init_waitqueue_head(&gcwq->rebind_hold);
3811 init_waitqueue_head(&gcwq->trustee_wait);
3812 } 3727 }
3813 3728
3814 /* create the initial worker */ 3729 /* create the initial worker */
3815 for_each_online_gcwq_cpu(cpu) { 3730 for_each_online_gcwq_cpu(cpu) {
3816 struct global_cwq *gcwq = get_gcwq(cpu); 3731 struct global_cwq *gcwq = get_gcwq(cpu);
3817 struct worker *worker; 3732 struct worker_pool *pool;
3818 3733
3819 if (cpu != WORK_CPU_UNBOUND) 3734 if (cpu != WORK_CPU_UNBOUND)
3820 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3735 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3821 worker = create_worker(gcwq, true); 3736
3822 BUG_ON(!worker); 3737 for_each_worker_pool(pool, gcwq) {
3823 spin_lock_irq(&gcwq->lock); 3738 struct worker *worker;
3824 start_worker(worker); 3739
3825 spin_unlock_irq(&gcwq->lock); 3740 worker = create_worker(pool);
3741 BUG_ON(!worker);
3742 spin_lock_irq(&gcwq->lock);
3743 start_worker(worker);
3744 spin_unlock_irq(&gcwq->lock);
3745 }
3826 } 3746 }
3827 3747
3828 system_wq = alloc_workqueue("events", 0, 0); 3748 system_wq = alloc_workqueue("events", 0, 0);