author     James Morris <james.l.morris@oracle.com>  2012-08-17 06:42:30 -0400
committer  James Morris <james.l.morris@oracle.com>  2012-08-17 06:42:30 -0400
commit     51b743fe87d7fb3dba7a2ff4a1fe23bb65dc2245 (patch)
tree       f8b8f601713a3ecb264eb9f145636343d9350520 /kernel
parent     9f99798ff49e73dded73a8c674044ea6fb6af651 (diff)
parent     d9875690d9b89a866022ff49e3fcea892345ad92 (diff)
Merge tag 'v3.6-rc2' into next
Linux 3.6-rc2
Resync with Linus.
Diffstat (limited to 'kernel')
79 files changed, 4458 insertions, 3071 deletions
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bbe..9d3118384858 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1; | |||
62 | #define MAX_WORK 32768 | 62 | #define MAX_WORK 32768 |
63 | 63 | ||
64 | static LIST_HEAD(async_pending); | 64 | static LIST_HEAD(async_pending); |
65 | static LIST_HEAD(async_running); | 65 | static ASYNC_DOMAIN(async_running); |
66 | static LIST_HEAD(async_domains); | ||
66 | static DEFINE_SPINLOCK(async_lock); | 67 | static DEFINE_SPINLOCK(async_lock); |
68 | static DEFINE_MUTEX(async_register_mutex); | ||
67 | 69 | ||
68 | struct async_entry { | 70 | struct async_entry { |
69 | struct list_head list; | 71 | struct list_head list; |
@@ -71,7 +73,7 @@ struct async_entry { | |||
71 | async_cookie_t cookie; | 73 | async_cookie_t cookie; |
72 | async_func_ptr *func; | 74 | async_func_ptr *func; |
73 | void *data; | 75 | void *data; |
74 | struct list_head *running; | 76 | struct async_domain *running; |
75 | }; | 77 | }; |
76 | 78 | ||
77 | static DECLARE_WAIT_QUEUE_HEAD(async_done); | 79 | static DECLARE_WAIT_QUEUE_HEAD(async_done); |
@@ -82,13 +84,12 @@ static atomic_t entry_count; | |||
82 | /* | 84 | /* |
83 | * MUST be called with the lock held! | 85 | * MUST be called with the lock held! |
84 | */ | 86 | */ |
85 | static async_cookie_t __lowest_in_progress(struct list_head *running) | 87 | static async_cookie_t __lowest_in_progress(struct async_domain *running) |
86 | { | 88 | { |
87 | struct async_entry *entry; | 89 | struct async_entry *entry; |
88 | 90 | ||
89 | if (!list_empty(running)) { | 91 | if (!list_empty(&running->domain)) { |
90 | entry = list_first_entry(running, | 92 | entry = list_first_entry(&running->domain, typeof(*entry), list); |
91 | struct async_entry, list); | ||
92 | return entry->cookie; | 93 | return entry->cookie; |
93 | } | 94 | } |
94 | 95 | ||
@@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) | |||
99 | return next_cookie; /* "infinity" value */ | 100 | return next_cookie; /* "infinity" value */ |
100 | } | 101 | } |
101 | 102 | ||
102 | static async_cookie_t lowest_in_progress(struct list_head *running) | 103 | static async_cookie_t lowest_in_progress(struct async_domain *running) |
103 | { | 104 | { |
104 | unsigned long flags; | 105 | unsigned long flags; |
105 | async_cookie_t ret; | 106 | async_cookie_t ret; |
@@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work) | |||
119 | container_of(work, struct async_entry, work); | 120 | container_of(work, struct async_entry, work); |
120 | unsigned long flags; | 121 | unsigned long flags; |
121 | ktime_t uninitialized_var(calltime), delta, rettime; | 122 | ktime_t uninitialized_var(calltime), delta, rettime; |
123 | struct async_domain *running = entry->running; | ||
122 | 124 | ||
123 | /* 1) move self to the running queue */ | 125 | /* 1) move self to the running queue */ |
124 | spin_lock_irqsave(&async_lock, flags); | 126 | spin_lock_irqsave(&async_lock, flags); |
125 | list_move_tail(&entry->list, entry->running); | 127 | list_move_tail(&entry->list, &running->domain); |
126 | spin_unlock_irqrestore(&async_lock, flags); | 128 | spin_unlock_irqrestore(&async_lock, flags); |
127 | 129 | ||
128 | /* 2) run (and print duration) */ | 130 | /* 2) run (and print duration) */ |
@@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work) | |||
145 | /* 3) remove self from the running queue */ | 147 | /* 3) remove self from the running queue */ |
146 | spin_lock_irqsave(&async_lock, flags); | 148 | spin_lock_irqsave(&async_lock, flags); |
147 | list_del(&entry->list); | 149 | list_del(&entry->list); |
150 | if (running->registered && --running->count == 0) | ||
151 | list_del_init(&running->node); | ||
148 | 152 | ||
149 | /* 4) free the entry */ | 153 | /* 4) free the entry */ |
150 | kfree(entry); | 154 | kfree(entry); |
@@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
156 | wake_up(&async_done); | 160 | wake_up(&async_done); |
157 | } | 161 | } |
158 | 162 | ||
159 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) | 163 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) |
160 | { | 164 | { |
161 | struct async_entry *entry; | 165 | struct async_entry *entry; |
162 | unsigned long flags; | 166 | unsigned long flags; |
@@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l | |||
187 | spin_lock_irqsave(&async_lock, flags); | 191 | spin_lock_irqsave(&async_lock, flags); |
188 | newcookie = entry->cookie = next_cookie++; | 192 | newcookie = entry->cookie = next_cookie++; |
189 | list_add_tail(&entry->list, &async_pending); | 193 | list_add_tail(&entry->list, &async_pending); |
194 | if (running->registered && running->count++ == 0) | ||
195 | list_add_tail(&running->node, &async_domains); | ||
190 | atomic_inc(&entry_count); | 196 | atomic_inc(&entry_count); |
191 | spin_unlock_irqrestore(&async_lock, flags); | 197 | spin_unlock_irqrestore(&async_lock, flags); |
192 | 198 | ||
@@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule); | |||
223 | * Note: This function may be called from atomic or non-atomic contexts. | 229 | * Note: This function may be called from atomic or non-atomic contexts. |
224 | */ | 230 | */ |
225 | async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, | 231 | async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, |
226 | struct list_head *running) | 232 | struct async_domain *running) |
227 | { | 233 | { |
228 | return __async_schedule(ptr, data, running); | 234 | return __async_schedule(ptr, data, running); |
229 | } | 235 | } |
@@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); | |||
236 | */ | 242 | */ |
237 | void async_synchronize_full(void) | 243 | void async_synchronize_full(void) |
238 | { | 244 | { |
245 | mutex_lock(&async_register_mutex); | ||
239 | do { | 246 | do { |
240 | async_synchronize_cookie(next_cookie); | 247 | struct async_domain *domain = NULL; |
241 | } while (!list_empty(&async_running) || !list_empty(&async_pending)); | 248 | |
249 | spin_lock_irq(&async_lock); | ||
250 | if (!list_empty(&async_domains)) | ||
251 | domain = list_first_entry(&async_domains, typeof(*domain), node); | ||
252 | spin_unlock_irq(&async_lock); | ||
253 | |||
254 | async_synchronize_cookie_domain(next_cookie, domain); | ||
255 | } while (!list_empty(&async_domains)); | ||
256 | mutex_unlock(&async_register_mutex); | ||
242 | } | 257 | } |
243 | EXPORT_SYMBOL_GPL(async_synchronize_full); | 258 | EXPORT_SYMBOL_GPL(async_synchronize_full); |
244 | 259 | ||
245 | /** | 260 | /** |
261 | * async_unregister_domain - ensure no more anonymous waiters on this domain | ||
262 | * @domain: idle domain to flush out of any async_synchronize_full instances | ||
263 | * | ||
264 | * async_synchronize_{cookie|full}_domain() are not flushed since callers | ||
265 | * of these routines should know the lifetime of @domain | ||
266 | * | ||
267 | * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing | ||
268 | */ | ||
269 | void async_unregister_domain(struct async_domain *domain) | ||
270 | { | ||
271 | mutex_lock(&async_register_mutex); | ||
272 | spin_lock_irq(&async_lock); | ||
273 | WARN_ON(!domain->registered || !list_empty(&domain->node) || | ||
274 | !list_empty(&domain->domain)); | ||
275 | domain->registered = 0; | ||
276 | spin_unlock_irq(&async_lock); | ||
277 | mutex_unlock(&async_register_mutex); | ||
278 | } | ||
279 | EXPORT_SYMBOL_GPL(async_unregister_domain); | ||
280 | |||
281 | /** | ||
246 | * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain | 282 | * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain |
247 | * @list: running list to synchronize on | 283 | * @domain: running list to synchronize on |
248 | * | 284 | * |
249 | * This function waits until all asynchronous function calls for the | 285 | * This function waits until all asynchronous function calls for the |
250 | * synchronization domain specified by the running list @list have been done. | 286 | * synchronization domain specified by the running list @domain have been done. |
251 | */ | 287 | */ |
252 | void async_synchronize_full_domain(struct list_head *list) | 288 | void async_synchronize_full_domain(struct async_domain *domain) |
253 | { | 289 | { |
254 | async_synchronize_cookie_domain(next_cookie, list); | 290 | async_synchronize_cookie_domain(next_cookie, domain); |
255 | } | 291 | } |
256 | EXPORT_SYMBOL_GPL(async_synchronize_full_domain); | 292 | EXPORT_SYMBOL_GPL(async_synchronize_full_domain); |
257 | 293 | ||
@@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); | |||
261 | * @running: running list to synchronize on | 297 | * @running: running list to synchronize on |
262 | * | 298 | * |
263 | * This function waits until all asynchronous function calls for the | 299 | * This function waits until all asynchronous function calls for the |
264 | * synchronization domain specified by the running list @list submitted | 300 | * synchronization domain specified by running list @running submitted |
265 | * prior to @cookie have been done. | 301 | * prior to @cookie have been done. |
266 | */ | 302 | */ |
267 | void async_synchronize_cookie_domain(async_cookie_t cookie, | 303 | void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) |
268 | struct list_head *running) | ||
269 | { | 304 | { |
270 | ktime_t uninitialized_var(starttime), delta, endtime; | 305 | ktime_t uninitialized_var(starttime), delta, endtime; |
271 | 306 | ||
307 | if (!running) | ||
308 | return; | ||
309 | |||
272 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 310 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
273 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); | 311 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); |
274 | starttime = ktime_get(); | 312 | starttime = ktime_get(); |
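The kernel/async.c hunks above replace the bare struct list_head domain handle with the new struct async_domain, declared via ASYNC_DOMAIN() (registered, so async_synchronize_full() also flushes it) or ASYNC_DOMAIN_EXCLUSIVE() (kept out of the global list, as the kernel-doc added for async_unregister_domain() recommends). A minimal caller-side sketch of the converted interface follows; it assumes only the signatures visible in the hunks above, and the names my_domain, my_probe_one and my_probe_all are hypothetical.

#include <linux/async.h>

/* A registered domain: async_synchronize_full() will also flush it. */
static ASYNC_DOMAIN(my_domain);

/* Hypothetical per-device initializer, run asynchronously. */
static void my_probe_one(void *data, async_cookie_t cookie)
{
        /* slow, independent initialization of one device */
}

static void my_probe_all(void **devs, int n)
{
        int i;

        for (i = 0; i < n; i++)
                async_schedule_domain(my_probe_one, devs[i], &my_domain);

        /* Wait only for the work queued in this domain. */
        async_synchronize_full_domain(&my_domain);
}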
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c61416b..ea3b7b6191c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb) | |||
384 | static void audit_printk_skb(struct sk_buff *skb) | 384 | static void audit_printk_skb(struct sk_buff *skb) |
385 | { | 385 | { |
386 | struct nlmsghdr *nlh = nlmsg_hdr(skb); | 386 | struct nlmsghdr *nlh = nlmsg_hdr(skb); |
387 | char *data = NLMSG_DATA(nlh); | 387 | char *data = nlmsg_data(nlh); |
388 | 388 | ||
389 | if (nlh->nlmsg_type != AUDIT_EOE) { | 389 | if (nlh->nlmsg_type != AUDIT_EOE) { |
390 | if (printk_ratelimit()) | 390 | if (printk_ratelimit()) |
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, | |||
516 | if (!skb) | 516 | if (!skb) |
517 | return NULL; | 517 | return NULL; |
518 | 518 | ||
519 | nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); | 519 | nlh = nlmsg_put(skb, pid, seq, t, size, flags); |
520 | data = NLMSG_DATA(nlh); | 520 | if (!nlh) |
521 | goto out_kfree_skb; | ||
522 | data = nlmsg_data(nlh); | ||
521 | memcpy(data, payload, size); | 523 | memcpy(data, payload, size); |
522 | return skb; | 524 | return skb; |
523 | 525 | ||
524 | nlmsg_failure: /* Used by NLMSG_NEW */ | 526 | out_kfree_skb: |
525 | if (skb) | 527 | kfree_skb(skb); |
526 | kfree_skb(skb); | ||
527 | return NULL; | 528 | return NULL; |
528 | } | 529 | } |
529 | 530 | ||
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
680 | sessionid = audit_get_sessionid(current); | 681 | sessionid = audit_get_sessionid(current); |
681 | security_task_getsecid(current, &sid); | 682 | security_task_getsecid(current, &sid); |
682 | seq = nlh->nlmsg_seq; | 683 | seq = nlh->nlmsg_seq; |
683 | data = NLMSG_DATA(nlh); | 684 | data = nlmsg_data(nlh); |
684 | 685 | ||
685 | switch (msg_type) { | 686 | switch (msg_type) { |
686 | case AUDIT_GET: | 687 | case AUDIT_GET: |
@@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) | |||
961 | static int __init audit_init(void) | 962 | static int __init audit_init(void) |
962 | { | 963 | { |
963 | int i; | 964 | int i; |
965 | struct netlink_kernel_cfg cfg = { | ||
966 | .input = audit_receive, | ||
967 | }; | ||
964 | 968 | ||
965 | if (audit_initialized == AUDIT_DISABLED) | 969 | if (audit_initialized == AUDIT_DISABLED) |
966 | return 0; | 970 | return 0; |
967 | 971 | ||
968 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 972 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
969 | audit_default ? "enabled" : "disabled"); | 973 | audit_default ? "enabled" : "disabled"); |
970 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, | 974 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, |
971 | audit_receive, NULL, THIS_MODULE); | 975 | THIS_MODULE, &cfg); |
972 | if (!audit_sock) | 976 | if (!audit_sock) |
973 | audit_panic("cannot initialize netlink socket"); | 977 | audit_panic("cannot initialize netlink socket"); |
974 | else | 978 | else |
@@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | |||
1060 | 1064 | ||
1061 | ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); | 1065 | ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); |
1062 | if (!ab->skb) | 1066 | if (!ab->skb) |
1063 | goto nlmsg_failure; | 1067 | goto err; |
1064 | 1068 | ||
1065 | nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); | 1069 | nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); |
1070 | if (!nlh) | ||
1071 | goto out_kfree_skb; | ||
1066 | 1072 | ||
1067 | return ab; | 1073 | return ab; |
1068 | 1074 | ||
1069 | nlmsg_failure: /* Used by NLMSG_NEW */ | 1075 | out_kfree_skb: |
1070 | kfree_skb(ab->skb); | 1076 | kfree_skb(ab->skb); |
1071 | ab->skb = NULL; | 1077 | ab->skb = NULL; |
1072 | err: | 1078 | err: |
@@ -1450,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key) | |||
1450 | } | 1456 | } |
1451 | 1457 | ||
1452 | /** | 1458 | /** |
1459 | * audit_log_link_denied - report a link restriction denial | ||
1460 | * @operation: specific link operation | ||
1461 | * @link: the path that triggered the restriction | ||
1462 | */ | ||
1463 | void audit_log_link_denied(const char *operation, struct path *link) | ||
1464 | { | ||
1465 | struct audit_buffer *ab; | ||
1466 | |||
1467 | ab = audit_log_start(current->audit_context, GFP_KERNEL, | ||
1468 | AUDIT_ANOM_LINK); | ||
1469 | audit_log_format(ab, "op=%s action=denied", operation); | ||
1470 | audit_log_format(ab, " pid=%d comm=", current->pid); | ||
1471 | audit_log_untrustedstring(ab, current->comm); | ||
1472 | audit_log_d_path(ab, " path=", link); | ||
1473 | audit_log_format(ab, " dev="); | ||
1474 | audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); | ||
1475 | audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); | ||
1476 | audit_log_end(ab); | ||
1477 | } | ||
1478 | |||
1479 | /** | ||
1453 | * audit_log_end - end one audit record | 1480 | * audit_log_end - end one audit record |
1454 | * @ab: the audit_buffer | 1481 | * @ab: the audit_buffer |
1455 | * | 1482 | * |
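The kernel/audit.c hunks drop the legacy NLMSG_NEW()/NLMSG_DATA() macros, whose hidden nlmsg_failure: goto forced awkward error labels, in favour of nlmsg_put()/nlmsg_data() with explicit NULL checks, and switch netlink_kernel_create() to the struct netlink_kernel_cfg calling convention. A hedged sketch of the new message-construction pattern, modelled on the converted audit_make_reply(); build_reply() is a hypothetical name.

#include <linux/string.h>
#include <net/netlink.h>

static struct sk_buff *build_reply(int pid, int seq, int type,
                                   const void *payload, size_t size)
{
        struct sk_buff *skb;
        struct nlmsghdr *nlh;

        skb = nlmsg_new(size, GFP_KERNEL);
        if (!skb)
                return NULL;

        /* nlmsg_put() returns NULL on failure instead of jumping to an
         * implicit nlmsg_failure: label, so it is checked like any call. */
        nlh = nlmsg_put(skb, pid, seq, type, size, 0);
        if (!nlh) {
                kfree_skb(skb);
                return NULL;
        }

        memcpy(nlmsg_data(nlh), payload, size);
        return skb;
}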
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e7..3a5ca582ba1e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -595,7 +595,7 @@ void audit_trim_trees(void) | |||
595 | 595 | ||
596 | root_mnt = collect_mounts(&path); | 596 | root_mnt = collect_mounts(&path); |
597 | path_put(&path); | 597 | path_put(&path); |
598 | if (!root_mnt) | 598 | if (IS_ERR(root_mnt)) |
599 | goto skip_it; | 599 | goto skip_it; |
600 | 600 | ||
601 | spin_lock(&hash_lock); | 601 | spin_lock(&hash_lock); |
@@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule) | |||
669 | goto Err; | 669 | goto Err; |
670 | mnt = collect_mounts(&path); | 670 | mnt = collect_mounts(&path); |
671 | path_put(&path); | 671 | path_put(&path); |
672 | if (!mnt) { | 672 | if (IS_ERR(mnt)) { |
673 | err = -ENOMEM; | 673 | err = PTR_ERR(mnt); |
674 | goto Err; | 674 | goto Err; |
675 | } | 675 | } |
676 | 676 | ||
@@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new) | |||
719 | return err; | 719 | return err; |
720 | tagged = collect_mounts(&path2); | 720 | tagged = collect_mounts(&path2); |
721 | path_put(&path2); | 721 | path_put(&path2); |
722 | if (!tagged) | 722 | if (IS_ERR(tagged)) |
723 | return -ENOMEM; | 723 | return PTR_ERR(tagged); |
724 | 724 | ||
725 | err = kern_path(old, 0, &path1); | 725 | err = kern_path(old, 0, &path1); |
726 | if (err) { | 726 | if (err) { |
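The kernel/audit_tree.c changes track a VFS interface change: collect_mounts() now reports failure with an ERR_PTR() instead of NULL, so callers test IS_ERR() and propagate PTR_ERR() rather than assuming -ENOMEM. A short sketch of the updated calling convention; grab_mount_tree() is a hypothetical wrapper.

#include <linux/err.h>
#include <linux/namei.h>
#include <linux/mount.h>

static int grab_mount_tree(const char *pathname, struct vfsmount **res)
{
        struct path path;
        struct vfsmount *mnt;
        int err;

        err = kern_path(pathname, 0, &path);
        if (err)
                return err;

        mnt = collect_mounts(&path);
        path_put(&path);
        if (IS_ERR(mnt))                /* was: if (!mnt) return -ENOMEM; */
                return PTR_ERR(mnt);

        *res = mnt;
        return 0;
}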
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d9..3823281401b5 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
355 | /* Get path information necessary for adding watches. */ | 355 | /* Get path information necessary for adding watches. */ |
356 | static int audit_get_nd(struct audit_watch *watch, struct path *parent) | 356 | static int audit_get_nd(struct audit_watch *watch, struct path *parent) |
357 | { | 357 | { |
358 | struct nameidata nd; | 358 | struct dentry *d = kern_path_locked(watch->path, parent); |
359 | struct dentry *d; | 359 | if (IS_ERR(d)) |
360 | int err; | ||
361 | |||
362 | err = kern_path_parent(watch->path, &nd); | ||
363 | if (err) | ||
364 | return err; | ||
365 | |||
366 | if (nd.last_type != LAST_NORM) { | ||
367 | path_put(&nd.path); | ||
368 | return -EINVAL; | ||
369 | } | ||
370 | |||
371 | mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); | ||
372 | d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); | ||
373 | if (IS_ERR(d)) { | ||
374 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | ||
375 | path_put(&nd.path); | ||
376 | return PTR_ERR(d); | 360 | return PTR_ERR(d); |
377 | } | 361 | mutex_unlock(&parent->dentry->d_inode->i_mutex); |
378 | if (d->d_inode) { | 362 | if (d->d_inode) { |
379 | /* update watch filter fields */ | 363 | /* update watch filter fields */ |
380 | watch->dev = d->d_inode->i_sb->s_dev; | 364 | watch->dev = d->d_inode->i_sb->s_dev; |
381 | watch->ino = d->d_inode->i_ino; | 365 | watch->ino = d->d_inode->i_ino; |
382 | } | 366 | } |
383 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | ||
384 | |||
385 | *parent = nd.path; | ||
386 | dput(d); | 367 | dput(d); |
387 | return 0; | 368 | return 0; |
388 | } | 369 | } |
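audit_get_nd() above is rewritten around the new kern_path_locked() helper, which collapses the old kern_path_parent() + lookup_one_len() sequence: on success it returns the final dentry, fills in the parent path, and leaves the parent directory's i_mutex held for the caller to drop. A hedged sketch of that contract as it appears in the hunk; lookup_watch_target() is a hypothetical caller.

#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/namei.h>
#include <linux/fs.h>

static int lookup_watch_target(const char *pathname, struct path *parent)
{
        struct dentry *d = kern_path_locked(pathname, parent);

        if (IS_ERR(d))                  /* nothing held on error, as in
                                         * audit_get_nd() above */
                return PTR_ERR(d);

        /* The parent's i_mutex is held here; inspect the target, then drop. */
        if (d->d_inode)
                pr_debug("target exists, ino=%lu\n", d->d_inode->i_ino);

        mutex_unlock(&parent->dentry->d_inode->i_mutex);
        dput(d);
        return 0;
}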
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 72fcd3069a90..79818507e444 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) | |||
255 | 255 | ||
256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
257 | 257 | ||
258 | static int css_unbias_refcnt(int refcnt) | ||
259 | { | ||
260 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; | ||
261 | } | ||
262 | |||
258 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | 263 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ |
259 | static int css_refcnt(struct cgroup_subsys_state *css) | 264 | static int css_refcnt(struct cgroup_subsys_state *css) |
260 | { | 265 | { |
261 | int v = atomic_read(&css->refcnt); | 266 | int v = atomic_read(&css->refcnt); |
262 | 267 | ||
263 | return v >= 0 ? v : v - CSS_DEACT_BIAS; | 268 | return css_unbias_refcnt(v); |
264 | } | 269 | } |
265 | 270 | ||
266 | /* convenient tests for these bits */ | 271 | /* convenient tests for these bits */ |
@@ -817,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
817 | */ | 822 | */ |
818 | 823 | ||
819 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); | 824 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
820 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); | 825 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); |
821 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 826 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
822 | static int cgroup_populate_dir(struct cgroup *cgrp); | 827 | static int cgroup_populate_dir(struct cgroup *cgrp); |
823 | static const struct inode_operations cgroup_dir_inode_operations; | 828 | static const struct inode_operations cgroup_dir_inode_operations; |
@@ -896,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
896 | mutex_unlock(&cgroup_mutex); | 901 | mutex_unlock(&cgroup_mutex); |
897 | 902 | ||
898 | /* | 903 | /* |
899 | * We want to drop the active superblock reference from the | 904 | * Drop the active superblock reference that we took when we |
900 | * cgroup creation after all the dentry refs are gone - | 905 | * created the cgroup |
901 | * kill_sb gets mighty unhappy otherwise. Mark | ||
902 | * dentry->d_fsdata with cgroup_diput() to tell | ||
903 | * cgroup_d_release() to call deactivate_super(). | ||
904 | */ | 906 | */ |
905 | dentry->d_fsdata = cgroup_diput; | 907 | deactivate_super(cgrp->root->sb); |
906 | 908 | ||
907 | /* | 909 | /* |
908 | * if we're getting rid of the cgroup, refcount should ensure | 910 | * if we're getting rid of the cgroup, refcount should ensure |
@@ -928,13 +930,6 @@ static int cgroup_delete(const struct dentry *d) | |||
928 | return 1; | 930 | return 1; |
929 | } | 931 | } |
930 | 932 | ||
931 | static void cgroup_d_release(struct dentry *dentry) | ||
932 | { | ||
933 | /* did cgroup_diput() tell me to deactivate super? */ | ||
934 | if (dentry->d_fsdata == cgroup_diput) | ||
935 | deactivate_super(dentry->d_sb); | ||
936 | } | ||
937 | |||
938 | static void remove_dir(struct dentry *d) | 933 | static void remove_dir(struct dentry *d) |
939 | { | 934 | { |
940 | struct dentry *parent = dget(d->d_parent); | 935 | struct dentry *parent = dget(d->d_parent); |
@@ -959,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
959 | 954 | ||
960 | dget(d); | 955 | dget(d); |
961 | d_delete(d); | 956 | d_delete(d); |
962 | simple_unlink(d->d_inode, d); | 957 | simple_unlink(cgrp->dentry->d_inode, d); |
963 | list_del_init(&cfe->node); | 958 | list_del_init(&cfe->node); |
964 | dput(d); | 959 | dput(d); |
965 | 960 | ||
@@ -1073,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1073 | BUG_ON(cgrp->subsys[i]); | 1068 | BUG_ON(cgrp->subsys[i]); |
1074 | BUG_ON(!dummytop->subsys[i]); | 1069 | BUG_ON(!dummytop->subsys[i]); |
1075 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); | 1070 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); |
1076 | mutex_lock(&ss->hierarchy_mutex); | ||
1077 | cgrp->subsys[i] = dummytop->subsys[i]; | 1071 | cgrp->subsys[i] = dummytop->subsys[i]; |
1078 | cgrp->subsys[i]->cgroup = cgrp; | 1072 | cgrp->subsys[i]->cgroup = cgrp; |
1079 | list_move(&ss->sibling, &root->subsys_list); | 1073 | list_move(&ss->sibling, &root->subsys_list); |
1080 | ss->root = root; | 1074 | ss->root = root; |
1081 | if (ss->bind) | 1075 | if (ss->bind) |
1082 | ss->bind(cgrp); | 1076 | ss->bind(cgrp); |
1083 | mutex_unlock(&ss->hierarchy_mutex); | ||
1084 | /* refcount was already taken, and we're keeping it */ | 1077 | /* refcount was already taken, and we're keeping it */ |
1085 | } else if (bit & removed_bits) { | 1078 | } else if (bit & removed_bits) { |
1086 | /* We're removing this subsystem */ | 1079 | /* We're removing this subsystem */ |
1087 | BUG_ON(ss == NULL); | 1080 | BUG_ON(ss == NULL); |
1088 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); | 1081 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); |
1089 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 1082 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); |
1090 | mutex_lock(&ss->hierarchy_mutex); | ||
1091 | if (ss->bind) | 1083 | if (ss->bind) |
1092 | ss->bind(dummytop); | 1084 | ss->bind(dummytop); |
1093 | dummytop->subsys[i]->cgroup = dummytop; | 1085 | dummytop->subsys[i]->cgroup = dummytop; |
1094 | cgrp->subsys[i] = NULL; | 1086 | cgrp->subsys[i] = NULL; |
1095 | subsys[i]->root = &rootnode; | 1087 | subsys[i]->root = &rootnode; |
1096 | list_move(&ss->sibling, &rootnode.subsys_list); | 1088 | list_move(&ss->sibling, &rootnode.subsys_list); |
1097 | mutex_unlock(&ss->hierarchy_mutex); | ||
1098 | /* subsystem is now free - drop reference on module */ | 1089 | /* subsystem is now free - drop reference on module */ |
1099 | module_put(ss->module); | 1090 | module_put(ss->module); |
1100 | } else if (bit & final_bits) { | 1091 | } else if (bit & final_bits) { |
@@ -1542,7 +1533,6 @@ static int cgroup_get_rootdir(struct super_block *sb) | |||
1542 | static const struct dentry_operations cgroup_dops = { | 1533 | static const struct dentry_operations cgroup_dops = { |
1543 | .d_iput = cgroup_diput, | 1534 | .d_iput = cgroup_diput, |
1544 | .d_delete = cgroup_delete, | 1535 | .d_delete = cgroup_delete, |
1545 | .d_release = cgroup_d_release, | ||
1546 | }; | 1536 | }; |
1547 | 1537 | ||
1548 | struct inode *inode = | 1538 | struct inode *inode = |
@@ -1593,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1593 | opts.new_root = new_root; | 1583 | opts.new_root = new_root; |
1594 | 1584 | ||
1595 | /* Locate an existing or new sb for this hierarchy */ | 1585 | /* Locate an existing or new sb for this hierarchy */ |
1596 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); | 1586 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); |
1597 | if (IS_ERR(sb)) { | 1587 | if (IS_ERR(sb)) { |
1598 | ret = PTR_ERR(sb); | 1588 | ret = PTR_ERR(sb); |
1599 | cgroup_drop_root(opts.new_root); | 1589 | cgroup_drop_root(opts.new_root); |
@@ -2576,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = { | |||
2576 | .rename = cgroup_rename, | 2566 | .rename = cgroup_rename, |
2577 | }; | 2567 | }; |
2578 | 2568 | ||
2579 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) | 2569 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) |
2580 | { | 2570 | { |
2581 | if (dentry->d_name.len > NAME_MAX) | 2571 | if (dentry->d_name.len > NAME_MAX) |
2582 | return ERR_PTR(-ENAMETOOLONG); | 2572 | return ERR_PTR(-ENAMETOOLONG); |
@@ -3889,8 +3879,12 @@ static void css_dput_fn(struct work_struct *work) | |||
3889 | { | 3879 | { |
3890 | struct cgroup_subsys_state *css = | 3880 | struct cgroup_subsys_state *css = |
3891 | container_of(work, struct cgroup_subsys_state, dput_work); | 3881 | container_of(work, struct cgroup_subsys_state, dput_work); |
3882 | struct dentry *dentry = css->cgroup->dentry; | ||
3883 | struct super_block *sb = dentry->d_sb; | ||
3892 | 3884 | ||
3893 | dput(css->cgroup->dentry); | 3885 | atomic_inc(&sb->s_active); |
3886 | dput(dentry); | ||
3887 | deactivate_super(sb); | ||
3894 | } | 3888 | } |
3895 | 3889 | ||
3896 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3890 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
@@ -3917,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
3917 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | 3911 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); |
3918 | } | 3912 | } |
3919 | 3913 | ||
3920 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | ||
3921 | { | ||
3922 | /* We need to take each hierarchy_mutex in a consistent order */ | ||
3923 | int i; | ||
3924 | |||
3925 | /* | ||
3926 | * No worry about a race with rebind_subsystems that might mess up the | ||
3927 | * locking order, since both parties are under cgroup_mutex. | ||
3928 | */ | ||
3929 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
3930 | struct cgroup_subsys *ss = subsys[i]; | ||
3931 | if (ss == NULL) | ||
3932 | continue; | ||
3933 | if (ss->root == root) | ||
3934 | mutex_lock(&ss->hierarchy_mutex); | ||
3935 | } | ||
3936 | } | ||
3937 | |||
3938 | static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) | ||
3939 | { | ||
3940 | int i; | ||
3941 | |||
3942 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
3943 | struct cgroup_subsys *ss = subsys[i]; | ||
3944 | if (ss == NULL) | ||
3945 | continue; | ||
3946 | if (ss->root == root) | ||
3947 | mutex_unlock(&ss->hierarchy_mutex); | ||
3948 | } | ||
3949 | } | ||
3950 | |||
3951 | /* | 3914 | /* |
3952 | * cgroup_create - create a cgroup | 3915 | * cgroup_create - create a cgroup |
3953 | * @parent: cgroup that will be parent of the new cgroup | 3916 | * @parent: cgroup that will be parent of the new cgroup |
@@ -4008,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4008 | ss->post_clone(cgrp); | 3971 | ss->post_clone(cgrp); |
4009 | } | 3972 | } |
4010 | 3973 | ||
4011 | cgroup_lock_hierarchy(root); | ||
4012 | list_add(&cgrp->sibling, &cgrp->parent->children); | 3974 | list_add(&cgrp->sibling, &cgrp->parent->children); |
4013 | cgroup_unlock_hierarchy(root); | ||
4014 | root->number_of_cgroups++; | 3975 | root->number_of_cgroups++; |
4015 | 3976 | ||
4016 | err = cgroup_create_dir(cgrp, dentry, mode); | 3977 | err = cgroup_create_dir(cgrp, dentry, mode); |
@@ -4037,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4037 | 3998 | ||
4038 | err_remove: | 3999 | err_remove: |
4039 | 4000 | ||
4040 | cgroup_lock_hierarchy(root); | ||
4041 | list_del(&cgrp->sibling); | 4001 | list_del(&cgrp->sibling); |
4042 | cgroup_unlock_hierarchy(root); | ||
4043 | root->number_of_cgroups--; | 4002 | root->number_of_cgroups--; |
4044 | 4003 | ||
4045 | err_destroy: | 4004 | err_destroy: |
@@ -4247,10 +4206,8 @@ again: | |||
4247 | list_del_init(&cgrp->release_list); | 4206 | list_del_init(&cgrp->release_list); |
4248 | raw_spin_unlock(&release_list_lock); | 4207 | raw_spin_unlock(&release_list_lock); |
4249 | 4208 | ||
4250 | cgroup_lock_hierarchy(cgrp->root); | ||
4251 | /* delete this cgroup from parent->children */ | 4209 | /* delete this cgroup from parent->children */ |
4252 | list_del_init(&cgrp->sibling); | 4210 | list_del_init(&cgrp->sibling); |
4253 | cgroup_unlock_hierarchy(cgrp->root); | ||
4254 | 4211 | ||
4255 | list_del_init(&cgrp->allcg_node); | 4212 | list_del_init(&cgrp->allcg_node); |
4256 | 4213 | ||
@@ -4324,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4324 | * need to invoke fork callbacks here. */ | 4281 | * need to invoke fork callbacks here. */ |
4325 | BUG_ON(!list_empty(&init_task.tasks)); | 4282 | BUG_ON(!list_empty(&init_task.tasks)); |
4326 | 4283 | ||
4327 | mutex_init(&ss->hierarchy_mutex); | ||
4328 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | ||
4329 | ss->active = 1; | 4284 | ss->active = 1; |
4330 | 4285 | ||
4331 | /* this function shouldn't be used with modular subsystems, since they | 4286 | /* this function shouldn't be used with modular subsystems, since they |
@@ -4452,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4452 | } | 4407 | } |
4453 | write_unlock(&css_set_lock); | 4408 | write_unlock(&css_set_lock); |
4454 | 4409 | ||
4455 | mutex_init(&ss->hierarchy_mutex); | ||
4456 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | ||
4457 | ss->active = 1; | 4410 | ss->active = 1; |
4458 | 4411 | ||
4459 | /* success! */ | 4412 | /* success! */ |
@@ -4982,10 +4935,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); | |||
4982 | void __css_put(struct cgroup_subsys_state *css) | 4935 | void __css_put(struct cgroup_subsys_state *css) |
4983 | { | 4936 | { |
4984 | struct cgroup *cgrp = css->cgroup; | 4937 | struct cgroup *cgrp = css->cgroup; |
4938 | int v; | ||
4985 | 4939 | ||
4986 | rcu_read_lock(); | 4940 | rcu_read_lock(); |
4987 | atomic_dec(&css->refcnt); | 4941 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); |
4988 | switch (css_refcnt(css)) { | 4942 | |
4943 | switch (v) { | ||
4989 | case 1: | 4944 | case 1: |
4990 | if (notify_on_release(cgrp)) { | 4945 | if (notify_on_release(cgrp)) { |
4991 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4946 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
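The kernel/cgroup.c reference-count hunks introduce css_unbias_refcnt() and make __css_put() operate on the value returned by atomic_dec_return() instead of re-reading the counter, so a deactivation racing with the put cannot skew the release decision below the switch. The underlying idea is that deactivating a css adds a large negative bias to refcnt; stripping the bias recovers the true user count in either state. An illustration under that assumption — the concrete bias value is not shown in this diff, and INT_MIN is assumed here.

#include <linux/kernel.h>
#include <linux/atomic.h>

#define CSS_DEACT_BIAS  INT_MIN         /* assumed value, for illustration */

/* Copied from the hunk above: recover the real user count whether or
 * not the deactivation bias has been applied to the raw counter. */
static int css_unbias_refcnt(int refcnt)
{
        return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
}

/* Drop one reference and return the remaining unbiased count the way the
 * reworked __css_put() does: act on the atomic_dec_return() result rather
 * than re-reading the counter afterwards. */
static int put_ref_count(atomic_t *refcnt)
{
        return css_unbias_refcnt(atomic_dec_return(refcnt));
}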
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
416 | 416 | ||
417 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { | 417 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { |
418 | mutex_lock(&zonelists_mutex); | 418 | mutex_lock(&zonelists_mutex); |
419 | build_all_zonelists(NULL); | 419 | build_all_zonelists(NULL, NULL); |
420 | mutex_unlock(&zonelists_mutex); | 420 | mutex_unlock(&zonelists_mutex); |
421 | } | 421 | } |
422 | #endif | 422 | #endif |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum { | |||
147 | CS_SPREAD_SLAB, | 147 | CS_SPREAD_SLAB, |
148 | } cpuset_flagbits_t; | 148 | } cpuset_flagbits_t; |
149 | 149 | ||
150 | /* the type of hotplug event */ | ||
151 | enum hotplug_event { | ||
152 | CPUSET_CPU_OFFLINE, | ||
153 | CPUSET_MEM_OFFLINE, | ||
154 | }; | ||
155 | |||
150 | /* convenient tests for these bits */ | 156 | /* convenient tests for these bits */ |
151 | static inline int is_cpu_exclusive(const struct cpuset *cs) | 157 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
152 | { | 158 | { |
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
1990 | } | 1996 | } |
1991 | 1997 | ||
1992 | /* | 1998 | /* |
1993 | * Walk the specified cpuset subtree and look for empty cpusets. | 1999 | * Helper function to traverse cpusets. |
1994 | * The tasks of such cpuset must be moved to a parent cpuset. | 2000 | * It can be used to walk the cpuset tree from top to bottom, completing |
2001 | * one layer before dropping down to the next (thus always processing a | ||
2002 | * node before any of its children). | ||
2003 | */ | ||
2004 | static struct cpuset *cpuset_next(struct list_head *queue) | ||
2005 | { | ||
2006 | struct cpuset *cp; | ||
2007 | struct cpuset *child; /* scans child cpusets of cp */ | ||
2008 | struct cgroup *cont; | ||
2009 | |||
2010 | if (list_empty(queue)) | ||
2011 | return NULL; | ||
2012 | |||
2013 | cp = list_first_entry(queue, struct cpuset, stack_list); | ||
2014 | list_del(queue->next); | ||
2015 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
2016 | child = cgroup_cs(cont); | ||
2017 | list_add_tail(&child->stack_list, queue); | ||
2018 | } | ||
2019 | |||
2020 | return cp; | ||
2021 | } | ||
2022 | |||
2023 | |||
2024 | /* | ||
2025 | * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory | ||
2026 | * online/offline) and update the cpusets accordingly. | ||
2027 | * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such | ||
2028 | * cpuset must be moved to a parent cpuset. | ||
1995 | * | 2029 | * |
1996 | * Called with cgroup_mutex held. We take callback_mutex to modify | 2030 | * Called with cgroup_mutex held. We take callback_mutex to modify |
1997 | * cpus_allowed and mems_allowed. | 2031 | * cpus_allowed and mems_allowed. |
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
2000 | * before dropping down to the next. It always processes a node before | 2034 | * before dropping down to the next. It always processes a node before |
2001 | * any of its children. | 2035 | * any of its children. |
2002 | * | 2036 | * |
2003 | * For now, since we lack memory hot unplug, we'll never see a cpuset | 2037 | * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY |
2004 | * that has tasks along with an empty 'mems'. But if we did see such | 2038 | * if all present pages from a node are offlined. |
2005 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | ||
2006 | */ | 2039 | */ |
2007 | static void scan_for_empty_cpusets(struct cpuset *root) | 2040 | static void |
2041 | scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | ||
2008 | { | 2042 | { |
2009 | LIST_HEAD(queue); | 2043 | LIST_HEAD(queue); |
2010 | struct cpuset *cp; /* scans cpusets being updated */ | 2044 | struct cpuset *cp; /* scans cpusets being updated */ |
2011 | struct cpuset *child; /* scans child cpusets of cp */ | ||
2012 | struct cgroup *cont; | ||
2013 | static nodemask_t oldmems; /* protected by cgroup_mutex */ | 2045 | static nodemask_t oldmems; /* protected by cgroup_mutex */ |
2014 | 2046 | ||
2015 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 2047 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
2016 | 2048 | ||
2017 | while (!list_empty(&queue)) { | 2049 | switch (event) { |
2018 | cp = list_first_entry(&queue, struct cpuset, stack_list); | 2050 | case CPUSET_CPU_OFFLINE: |
2019 | list_del(queue.next); | 2051 | while ((cp = cpuset_next(&queue)) != NULL) { |
2020 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 2052 | |
2021 | child = cgroup_cs(cont); | 2053 | /* Continue past cpusets with all cpus online */ |
2022 | list_add_tail(&child->stack_list, &queue); | 2054 | if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) |
2055 | continue; | ||
2056 | |||
2057 | /* Remove offline cpus from this cpuset. */ | ||
2058 | mutex_lock(&callback_mutex); | ||
2059 | cpumask_and(cp->cpus_allowed, cp->cpus_allowed, | ||
2060 | cpu_active_mask); | ||
2061 | mutex_unlock(&callback_mutex); | ||
2062 | |||
2063 | /* Move tasks from the empty cpuset to a parent */ | ||
2064 | if (cpumask_empty(cp->cpus_allowed)) | ||
2065 | remove_tasks_in_empty_cpuset(cp); | ||
2066 | else | ||
2067 | update_tasks_cpumask(cp, NULL); | ||
2023 | } | 2068 | } |
2069 | break; | ||
2024 | 2070 | ||
2025 | /* Continue past cpusets with all cpus, mems online */ | 2071 | case CPUSET_MEM_OFFLINE: |
2026 | if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && | 2072 | while ((cp = cpuset_next(&queue)) != NULL) { |
2027 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | ||
2028 | continue; | ||
2029 | 2073 | ||
2030 | oldmems = cp->mems_allowed; | 2074 | /* Continue past cpusets with all mems online */ |
2075 | if (nodes_subset(cp->mems_allowed, | ||
2076 | node_states[N_HIGH_MEMORY])) | ||
2077 | continue; | ||
2031 | 2078 | ||
2032 | /* Remove offline cpus and mems from this cpuset. */ | 2079 | oldmems = cp->mems_allowed; |
2033 | mutex_lock(&callback_mutex); | 2080 | |
2034 | cpumask_and(cp->cpus_allowed, cp->cpus_allowed, | 2081 | /* Remove offline mems from this cpuset. */ |
2035 | cpu_active_mask); | 2082 | mutex_lock(&callback_mutex); |
2036 | nodes_and(cp->mems_allowed, cp->mems_allowed, | 2083 | nodes_and(cp->mems_allowed, cp->mems_allowed, |
2037 | node_states[N_HIGH_MEMORY]); | 2084 | node_states[N_HIGH_MEMORY]); |
2038 | mutex_unlock(&callback_mutex); | 2085 | mutex_unlock(&callback_mutex); |
2039 | 2086 | ||
2040 | /* Move tasks from the empty cpuset to a parent */ | 2087 | /* Move tasks from the empty cpuset to a parent */ |
2041 | if (cpumask_empty(cp->cpus_allowed) || | 2088 | if (nodes_empty(cp->mems_allowed)) |
2042 | nodes_empty(cp->mems_allowed)) | 2089 | remove_tasks_in_empty_cpuset(cp); |
2043 | remove_tasks_in_empty_cpuset(cp); | 2090 | else |
2044 | else { | 2091 | update_tasks_nodemask(cp, &oldmems, NULL); |
2045 | update_tasks_cpumask(cp, NULL); | ||
2046 | update_tasks_nodemask(cp, &oldmems, NULL); | ||
2047 | } | 2092 | } |
2048 | } | 2093 | } |
2049 | } | 2094 | } |
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2054 | * (of no affect) on systems that are actively using CPU hotplug | 2099 | * (of no affect) on systems that are actively using CPU hotplug |
2055 | * but making no active use of cpusets. | 2100 | * but making no active use of cpusets. |
2056 | * | 2101 | * |
2102 | * The only exception to this is suspend/resume, where we don't | ||
2103 | * modify cpusets at all. | ||
2104 | * | ||
2057 | * This routine ensures that top_cpuset.cpus_allowed tracks | 2105 | * This routine ensures that top_cpuset.cpus_allowed tracks |
2058 | * cpu_active_mask on each CPU hotplug (cpuhp) event. | 2106 | * cpu_active_mask on each CPU hotplug (cpuhp) event. |
2059 | * | 2107 | * |
2060 | * Called within get_online_cpus(). Needs to call cgroup_lock() | 2108 | * Called within get_online_cpus(). Needs to call cgroup_lock() |
2061 | * before calling generate_sched_domains(). | 2109 | * before calling generate_sched_domains(). |
2110 | * | ||
2111 | * @cpu_online: Indicates whether this is a CPU online event (true) or | ||
2112 | * a CPU offline event (false). | ||
2062 | */ | 2113 | */ |
2063 | void cpuset_update_active_cpus(void) | 2114 | void cpuset_update_active_cpus(bool cpu_online) |
2064 | { | 2115 | { |
2065 | struct sched_domain_attr *attr; | 2116 | struct sched_domain_attr *attr; |
2066 | cpumask_var_t *doms; | 2117 | cpumask_var_t *doms; |
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void) | |||
2070 | mutex_lock(&callback_mutex); | 2121 | mutex_lock(&callback_mutex); |
2071 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2122 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2072 | mutex_unlock(&callback_mutex); | 2123 | mutex_unlock(&callback_mutex); |
2073 | scan_for_empty_cpusets(&top_cpuset); | 2124 | |
2125 | if (!cpu_online) | ||
2126 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); | ||
2127 | |||
2074 | ndoms = generate_sched_domains(&doms, &attr); | 2128 | ndoms = generate_sched_domains(&doms, &attr); |
2075 | cgroup_unlock(); | 2129 | cgroup_unlock(); |
2076 | 2130 | ||
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void) | |||
2082 | /* | 2136 | /* |
2083 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2137 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
2084 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. | 2138 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
2085 | * See also the previous routine cpuset_track_online_cpus(). | 2139 | * See cpuset_update_active_cpus() for CPU hotplug handling. |
2086 | */ | 2140 | */ |
2087 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2141 | static int cpuset_track_online_nodes(struct notifier_block *self, |
2088 | unsigned long action, void *arg) | 2142 | unsigned long action, void *arg) |
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2101 | case MEM_OFFLINE: | 2155 | case MEM_OFFLINE: |
2102 | /* | 2156 | /* |
2103 | * needn't update top_cpuset.mems_allowed explicitly because | 2157 | * needn't update top_cpuset.mems_allowed explicitly because |
2104 | * scan_for_empty_cpusets() will update it. | 2158 | * scan_cpusets_upon_hotplug() will update it. |
2105 | */ | 2159 | */ |
2106 | scan_for_empty_cpusets(&top_cpuset); | 2160 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); |
2107 | break; | 2161 | break; |
2108 | default: | 2162 | default: |
2109 | break; | 2163 | break; |
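The kernel/cpuset.c rework splits the old scan_for_empty_cpusets() into cpuset_next(), a queue-driven top-down traversal helper, and scan_cpusets_upon_hotplug(), which treats CPU offlining and memory offlining as separate cases. The traversal is a plain breadth-first walk that visits every cpuset before any of its children; a hedged sketch of the loop shape follows, reusing cpuset_next() and the file-local struct cpuset exactly as defined in the hunks above (walk_cpusets() itself is hypothetical).

/* Each call to cpuset_next() pops the queue head and appends that
 * cpuset's children, so the loop below always processes parents first. */
static void walk_cpusets(struct cpuset *root)
{
        LIST_HEAD(queue);
        struct cpuset *cp;

        list_add_tail(&root->stack_list, &queue);
        while ((cp = cpuset_next(&queue)) != NULL) {
                /* per-cpuset work goes here, e.g. trimming cpus_allowed
                 * against cpu_active_mask as the CPUSET_CPU_OFFLINE case
                 * above does */
        }
}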
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff17..be7b33b73d30 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/kdb.h> | 12 | #include <linux/kdb.h> |
13 | #include <linux/kdebug.h> | 13 | #include <linux/kdebug.h> |
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/hardirq.h> | ||
15 | #include "kdb_private.h" | 16 | #include "kdb_private.h" |
16 | #include "../debug_core.h" | 17 | #include "../debug_core.h" |
17 | 18 | ||
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks) | |||
52 | if (atomic_read(&kgdb_setting_breakpoint)) | 53 | if (atomic_read(&kgdb_setting_breakpoint)) |
53 | reason = KDB_REASON_KEYBOARD; | 54 | reason = KDB_REASON_KEYBOARD; |
54 | 55 | ||
56 | if (in_nmi()) | ||
57 | reason = KDB_REASON_NMI; | ||
58 | |||
55 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { | 59 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { |
56 | if ((bp->bp_enabled) && (bp->bp_addr == addr)) { | 60 | if ((bp->bp_enabled) && (bp->bp_addr == addr)) { |
57 | reason = KDB_REASON_BREAK; | 61 | reason = KDB_REASON_BREAK; |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..0a69d2adc4f3 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit: | |||
715 | /* check for having reached the LINES number of printed lines */ | 715 | /* check for having reached the LINES number of printed lines */ |
716 | if (kdb_nextline == linecount) { | 716 | if (kdb_nextline == linecount) { |
717 | char buf1[16] = ""; | 717 | char buf1[16] = ""; |
718 | #if defined(CONFIG_SMP) | ||
719 | char buf2[32]; | ||
720 | #endif | ||
721 | 718 | ||
722 | /* Watch out for recursion here. Any routine that calls | 719 | /* Watch out for recursion here. Any routine that calls |
723 | * kdb_printf will come back through here. And kdb_read | 720 | * kdb_printf will come back through here. And kdb_read |
@@ -732,14 +729,6 @@ kdb_printit: | |||
732 | if (moreprompt == NULL) | 729 | if (moreprompt == NULL) |
733 | moreprompt = "more> "; | 730 | moreprompt = "more> "; |
734 | 731 | ||
735 | #if defined(CONFIG_SMP) | ||
736 | if (strchr(moreprompt, '%')) { | ||
737 | sprintf(buf2, moreprompt, get_cpu()); | ||
738 | put_cpu(); | ||
739 | moreprompt = buf2; | ||
740 | } | ||
741 | #endif | ||
742 | |||
743 | kdb_input_flush(); | 732 | kdb_input_flush(); |
744 | c = console_drivers; | 733 | c = console_drivers; |
745 | 734 | ||
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..31df1706b9a9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
15 | #include <linux/string.h> | 15 | #include <linux/string.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/kmsg_dump.h> | ||
17 | #include <linux/reboot.h> | 18 | #include <linux/reboot.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/sysrq.h> | 20 | #include <linux/sysrq.h> |
@@ -138,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); | |||
138 | static char *__env[] = { | 139 | static char *__env[] = { |
139 | #if defined(CONFIG_SMP) | 140 | #if defined(CONFIG_SMP) |
140 | "PROMPT=[%d]kdb> ", | 141 | "PROMPT=[%d]kdb> ", |
141 | "MOREPROMPT=[%d]more> ", | ||
142 | #else | 142 | #else |
143 | "PROMPT=kdb> ", | 143 | "PROMPT=kdb> ", |
144 | "MOREPROMPT=more> ", | ||
145 | #endif | 144 | #endif |
145 | "MOREPROMPT=more> ", | ||
146 | "RADIX=16", | 146 | "RADIX=16", |
147 | "MDCOUNT=8", /* lines of md output */ | 147 | "MDCOUNT=8", /* lines of md output */ |
148 | KDB_PLATFORM_ENV, | 148 | KDB_PLATFORM_ENV, |
@@ -1235,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
1235 | *cmdbuf = '\0'; | 1235 | *cmdbuf = '\0'; |
1236 | *(cmd_hist[cmd_head]) = '\0'; | 1236 | *(cmd_hist[cmd_head]) = '\0'; |
1237 | 1237 | ||
1238 | if (KDB_FLAG(ONLY_DO_DUMP)) { | ||
1239 | /* kdb is off but a catastrophic error requires a dump. | ||
1240 | * Take the dump and reboot. | ||
1241 | * Turn on logging so the kdb output appears in the log | ||
1242 | * buffer in the dump. | ||
1243 | */ | ||
1244 | const char *setargs[] = { "set", "LOGGING", "1" }; | ||
1245 | kdb_set(2, setargs); | ||
1246 | kdb_reboot(0, NULL); | ||
1247 | /*NOTREACHED*/ | ||
1248 | } | ||
1249 | |||
1250 | do_full_getstr: | 1238 | do_full_getstr: |
1251 | #if defined(CONFIG_SMP) | 1239 | #if defined(CONFIG_SMP) |
1252 | snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), | 1240 | snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), |
@@ -2040,8 +2028,15 @@ static int kdb_env(int argc, const char **argv) | |||
2040 | */ | 2028 | */ |
2041 | static int kdb_dmesg(int argc, const char **argv) | 2029 | static int kdb_dmesg(int argc, const char **argv) |
2042 | { | 2030 | { |
2043 | char *syslog_data[4], *start, *end, c = '\0', *p; | 2031 | int diag; |
2044 | int diag, logging, logsize, lines = 0, adjust = 0, n; | 2032 | int logging; |
2033 | int lines = 0; | ||
2034 | int adjust = 0; | ||
2035 | int n = 0; | ||
2036 | int skip = 0; | ||
2037 | struct kmsg_dumper dumper = { .active = 1 }; | ||
2038 | size_t len; | ||
2039 | char buf[201]; | ||
2045 | 2040 | ||
2046 | if (argc > 2) | 2041 | if (argc > 2) |
2047 | return KDB_ARGCOUNT; | 2042 | return KDB_ARGCOUNT; |
@@ -2064,22 +2059,10 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2064 | kdb_set(2, setargs); | 2059 | kdb_set(2, setargs); |
2065 | } | 2060 | } |
2066 | 2061 | ||
2067 | /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] | 2062 | kmsg_dump_rewind_nolock(&dumper); |
2068 | * logical start, end+1. */ | 2063 | while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) |
2069 | kdb_syslog_data(syslog_data); | 2064 | n++; |
2070 | if (syslog_data[2] == syslog_data[3]) | 2065 | |
2071 | return 0; | ||
2072 | logsize = syslog_data[1] - syslog_data[0]; | ||
2073 | start = syslog_data[2]; | ||
2074 | end = syslog_data[3]; | ||
2075 | #define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) | ||
2076 | for (n = 0, p = start; p < end; ++p) { | ||
2077 | c = *KDB_WRAP(p); | ||
2078 | if (c == '\n') | ||
2079 | ++n; | ||
2080 | } | ||
2081 | if (c != '\n') | ||
2082 | ++n; | ||
2083 | if (lines < 0) { | 2066 | if (lines < 0) { |
2084 | if (adjust >= n) | 2067 | if (adjust >= n) |
2085 | kdb_printf("buffer only contains %d lines, nothing " | 2068 | kdb_printf("buffer only contains %d lines, nothing " |
@@ -2087,21 +2070,11 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2087 | else if (adjust - lines >= n) | 2070 | else if (adjust - lines >= n) |
2088 | kdb_printf("buffer only contains %d lines, last %d " | 2071 | kdb_printf("buffer only contains %d lines, last %d " |
2089 | "lines printed\n", n, n - adjust); | 2072 | "lines printed\n", n, n - adjust); |
2090 | if (adjust) { | 2073 | skip = adjust; |
2091 | for (; start < end && adjust; ++start) { | 2074 | lines = abs(lines); |
2092 | if (*KDB_WRAP(start) == '\n') | ||
2093 | --adjust; | ||
2094 | } | ||
2095 | if (start < end) | ||
2096 | ++start; | ||
2097 | } | ||
2098 | for (p = start; p < end && lines; ++p) { | ||
2099 | if (*KDB_WRAP(p) == '\n') | ||
2100 | ++lines; | ||
2101 | } | ||
2102 | end = p; | ||
2103 | } else if (lines > 0) { | 2075 | } else if (lines > 0) { |
2104 | int skip = n - (adjust + lines); | 2076 | skip = n - lines - adjust; |
2077 | lines = abs(lines); | ||
2105 | if (adjust >= n) { | 2078 | if (adjust >= n) { |
2106 | kdb_printf("buffer only contains %d lines, " | 2079 | kdb_printf("buffer only contains %d lines, " |
2107 | "nothing printed\n", n); | 2080 | "nothing printed\n", n); |
@@ -2112,35 +2085,24 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2112 | kdb_printf("buffer only contains %d lines, first " | 2085 | kdb_printf("buffer only contains %d lines, first " |
2113 | "%d lines printed\n", n, lines); | 2086 | "%d lines printed\n", n, lines); |
2114 | } | 2087 | } |
2115 | for (; start < end && skip; ++start) { | 2088 | } else { |
2116 | if (*KDB_WRAP(start) == '\n') | 2089 | lines = n; |
2117 | --skip; | ||
2118 | } | ||
2119 | for (p = start; p < end && lines; ++p) { | ||
2120 | if (*KDB_WRAP(p) == '\n') | ||
2121 | --lines; | ||
2122 | } | ||
2123 | end = p; | ||
2124 | } | 2090 | } |
2125 | /* Do a line at a time (max 200 chars) to reduce protocol overhead */ | 2091 | |
2126 | c = '\n'; | 2092 | if (skip >= n || skip < 0) |
2127 | while (start != end) { | 2093 | return 0; |
2128 | char buf[201]; | 2094 | |
2129 | p = buf; | 2095 | kmsg_dump_rewind_nolock(&dumper); |
2130 | if (KDB_FLAG(CMD_INTERRUPT)) | 2096 | while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { |
2131 | return 0; | 2097 | if (skip) { |
2132 | while (start < end && (c = *KDB_WRAP(start)) && | 2098 | skip--; |
2133 | (p - buf) < sizeof(buf)-1) { | 2099 | continue; |
2134 | ++start; | ||
2135 | *p++ = c; | ||
2136 | if (c == '\n') | ||
2137 | break; | ||
2138 | } | 2100 | } |
2139 | *p = '\0'; | 2101 | if (!lines--) |
2140 | kdb_printf("%s", buf); | 2102 | break; |
2103 | |||
2104 | kdb_printf("%.*s\n", (int)len - 1, buf); | ||
2141 | } | 2105 | } |
2142 | if (c != '\n') | ||
2143 | kdb_printf("\n"); | ||
2144 | 2106 | ||
2145 | return 0; | 2107 | return 0; |
2146 | } | 2108 | } |
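kdb_dmesg() above stops poking at the raw syslog buffer through kdb_syslog_data() (whose declaration is removed from kdb_private.h below) and instead drives the kmsg_dump iterator API: rewind, walk the records once to count them, rewind again, then print the requested window. A hedged sketch of that two-pass pattern; dump_last_lines() is a hypothetical function and printk() stands in for kdb_printf().

#include <linux/kernel.h>
#include <linux/kmsg_dump.h>

static void dump_last_lines(int want)
{
        struct kmsg_dumper dumper = { .active = 1 };
        char buf[201];
        size_t len;
        int n = 0, skip;

        /* Pass 1: count the lines currently in the log buffer. */
        kmsg_dump_rewind_nolock(&dumper);
        while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
                n++;

        skip = n > want ? n - want : 0;

        /* Pass 2: print only the last 'want' lines, one record at a time. */
        kmsg_dump_rewind_nolock(&dumper);
        while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
                if (skip) {
                        skip--;
                        continue;
                }
                printk("%.*s\n", (int)len - 1, buf);
        }
}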
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[]; | |||
205 | extern int kdb_grep_leading; | 205 | extern int kdb_grep_leading; |
206 | extern int kdb_grep_trailing; | 206 | extern int kdb_grep_trailing; |
207 | extern char *kdb_cmds[]; | 207 | extern char *kdb_cmds[]; |
208 | extern void kdb_syslog_data(char *syslog_data[]); | ||
209 | extern unsigned long kdb_task_state_string(const char *); | 208 | extern unsigned long kdb_task_state_string(const char *); |
210 | extern char kdb_task_state_char (const struct task_struct *); | 209 | extern char kdb_task_state_char (const struct task_struct *); |
211 | extern unsigned long kdb_task_state(const struct task_struct *p, | 210 | extern unsigned long kdb_task_state(const struct task_struct *p, |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f399..98d4597f43d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,7 +153,8 @@ put_callchain_entry(int rctx) | |||
153 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | 153 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); |
154 | } | 154 | } |
155 | 155 | ||
156 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | 156 | struct perf_callchain_entry * |
157 | perf_callchain(struct perf_event *event, struct pt_regs *regs) | ||
157 | { | 158 | { |
158 | int rctx; | 159 | int rctx; |
159 | struct perf_callchain_entry *entry; | 160 | struct perf_callchain_entry *entry; |
@@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
178 | } | 179 | } |
179 | 180 | ||
180 | if (regs) { | 181 | if (regs) { |
182 | /* | ||
183 | * Disallow cross-task user callchains. | ||
184 | */ | ||
185 | if (event->ctx->task && event->ctx->task != current) | ||
186 | goto exit_put; | ||
187 | |||
181 | perf_callchain_store(entry, PERF_CONTEXT_USER); | 188 | perf_callchain_store(entry, PERF_CONTEXT_USER); |
182 | perf_callchain_user(entry, regs); | 189 | perf_callchain_user(entry, regs); |
183 | } | 190 | } |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f85c0154b333..b7935fcec7d9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) | |||
253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | 253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; |
254 | } | 254 | } |
255 | 255 | ||
256 | static inline void perf_get_cgroup(struct perf_event *event) | 256 | static inline bool perf_tryget_cgroup(struct perf_event *event) |
257 | { | 257 | { |
258 | css_get(&event->cgrp->css); | 258 | return css_tryget(&event->cgrp->css); |
259 | } | 259 | } |
260 | 260 | ||
261 | static inline void perf_put_cgroup(struct perf_event *event) | 261 | static inline void perf_put_cgroup(struct perf_event *event) |
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
484 | event->cgrp = cgrp; | 484 | event->cgrp = cgrp; |
485 | 485 | ||
486 | /* must be done before we fput() the file */ | 486 | /* must be done before we fput() the file */ |
487 | perf_get_cgroup(event); | 487 | if (!perf_tryget_cgroup(event)) { |
488 | event->cgrp = NULL; | ||
489 | ret = -ENOENT; | ||
490 | goto out; | ||
491 | } | ||
488 | 492 | ||
489 | /* | 493 | /* |
490 | * all events in a group must monitor | 494 | * all events in a group must monitor |
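Editor's note: switching from css_get() to css_tryget() means a cgroup whose reference count has already hit zero can no longer be resurrected; on failure the event's cgroup pointer is cleared and -ENOENT is returned. A userspace sketch of the same try-get pattern on a plain atomic counter (struct obj and obj_tryget are illustrative names, not the kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct obj { atomic_int refs; };

    /* Take a reference only if the object is still live (refs > 0). */
    static bool obj_tryget(struct obj *o)
    {
        int old = atomic_load(&o->refs);

        while (old > 0) {
            if (atomic_compare_exchange_weak(&o->refs, &old, old + 1))
                return true;   /* got a reference */
        }
        return false;          /* object already dying, refuse */
    }

    int main(void)
    {
        struct obj live = { 1 }, dying = { 0 };

        printf("live:  %s\n", obj_tryget(&live)  ? "got ref" : "ENOENT");
        printf("dying: %s\n", obj_tryget(&dying) ? "got ref" : "ENOENT");
        return 0;
    }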
@@ -1641,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
1641 | lockdep_assert_held(&ctx->mutex); | 1645 | lockdep_assert_held(&ctx->mutex); |
1642 | 1646 | ||
1643 | event->ctx = ctx; | 1647 | event->ctx = ctx; |
1648 | if (event->cpu != -1) | ||
1649 | event->cpu = cpu; | ||
1644 | 1650 | ||
1645 | if (!task) { | 1651 | if (!task) { |
1646 | /* | 1652 | /* |
@@ -4033,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
4033 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { | 4039 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
4034 | int size = 1; | 4040 | int size = 1; |
4035 | 4041 | ||
4036 | data->callchain = perf_callchain(regs); | 4042 | data->callchain = perf_callchain(event, regs); |
4037 | 4043 | ||
4038 | if (data->callchain) | 4044 | if (data->callchain) |
4039 | size += data->callchain->nr; | 4045 | size += data->callchain->nr; |
@@ -5203,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event, | |||
5203 | } | 5209 | } |
5204 | 5210 | ||
5205 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | 5211 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, |
5206 | struct pt_regs *regs, struct hlist_head *head, int rctx) | 5212 | struct pt_regs *regs, struct hlist_head *head, int rctx, |
5213 | struct task_struct *task) | ||
5207 | { | 5214 | { |
5208 | struct perf_sample_data data; | 5215 | struct perf_sample_data data; |
5209 | struct perf_event *event; | 5216 | struct perf_event *event; |
@@ -5222,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5222 | perf_swevent_event(event, count, &data, regs); | 5229 | perf_swevent_event(event, count, &data, regs); |
5223 | } | 5230 | } |
5224 | 5231 | ||
5232 | /* | ||
5233 | * If we got specified a target task, also iterate its context and | ||
5234 | * deliver this event there too. | ||
5235 | */ | ||
5236 | if (task && task != current) { | ||
5237 | struct perf_event_context *ctx; | ||
5238 | struct trace_entry *entry = record; | ||
5239 | |||
5240 | rcu_read_lock(); | ||
5241 | ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); | ||
5242 | if (!ctx) | ||
5243 | goto unlock; | ||
5244 | |||
5245 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | ||
5246 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
5247 | continue; | ||
5248 | if (event->attr.config != entry->type) | ||
5249 | continue; | ||
5250 | if (perf_tp_event_match(event, &data, regs)) | ||
5251 | perf_swevent_event(event, count, &data, regs); | ||
5252 | } | ||
5253 | unlock: | ||
5254 | rcu_read_unlock(); | ||
5255 | } | ||
5256 | |||
5225 | perf_swevent_put_recursion_context(rctx); | 5257 | perf_swevent_put_recursion_context(rctx); |
5226 | } | 5258 | } |
5227 | EXPORT_SYMBOL_GPL(perf_tp_event); | 5259 | EXPORT_SYMBOL_GPL(perf_tp_event); |
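Editor's note: the added branch delivers a tracepoint hit not only to the CPU-local event list but also to the software-event context of an explicitly named target task, matching each event's type and config before dispatch. A simplified userspace model of the matching loop, using an ordinary singly linked list where the kernel walks an RCU-protected list under rcu_read_lock() (all names below are illustrative):

    #include <stdio.h>

    #define TYPE_TRACEPOINT 2

    struct event {
        int type;            /* class of the event (tracepoint or not) */
        int config;          /* which tracepoint the event asked for */
        struct event *next;
    };

    /* Deliver one tracepoint record (identified by tp_id) to every matching
     * event on a target context list; counting stands in for perf_swevent_event(). */
    static int deliver_to_ctx(struct event *ctx_list, int tp_id)
    {
        int hits = 0;

        for (struct event *e = ctx_list; e; e = e->next) {
            if (e->type != TYPE_TRACEPOINT)
                continue;    /* not a tracepoint event */
            if (e->config != tp_id)
                continue;    /* a different tracepoint */
            hits++;
        }
        return hits;
    }

    int main(void)
    {
        struct event e2 = { TYPE_TRACEPOINT, 7, NULL };
        struct event e1 = { TYPE_TRACEPOINT, 3, &e2 };

        printf("matches for tp 7: %d\n", deliver_to_ctx(&e1, 7)); /* 1 */
        return 0;
    }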
@@ -6248,6 +6280,8 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6248 | } | 6280 | } |
6249 | } | 6281 | } |
6250 | 6282 | ||
6283 | get_online_cpus(); | ||
6284 | |||
6251 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 6285 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, |
6252 | NULL, NULL); | 6286 | NULL, NULL); |
6253 | if (IS_ERR(event)) { | 6287 | if (IS_ERR(event)) { |
@@ -6300,7 +6334,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6300 | /* | 6334 | /* |
6301 | * Get the target context (task or percpu): | 6335 | * Get the target context (task or percpu): |
6302 | */ | 6336 | */ |
6303 | ctx = find_get_context(pmu, task, cpu); | 6337 | ctx = find_get_context(pmu, task, event->cpu); |
6304 | if (IS_ERR(ctx)) { | 6338 | if (IS_ERR(ctx)) { |
6305 | err = PTR_ERR(ctx); | 6339 | err = PTR_ERR(ctx); |
6306 | goto err_alloc; | 6340 | goto err_alloc; |
@@ -6373,20 +6407,23 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6373 | mutex_lock(&ctx->mutex); | 6407 | mutex_lock(&ctx->mutex); |
6374 | 6408 | ||
6375 | if (move_group) { | 6409 | if (move_group) { |
6376 | perf_install_in_context(ctx, group_leader, cpu); | 6410 | synchronize_rcu(); |
6411 | perf_install_in_context(ctx, group_leader, event->cpu); | ||
6377 | get_ctx(ctx); | 6412 | get_ctx(ctx); |
6378 | list_for_each_entry(sibling, &group_leader->sibling_list, | 6413 | list_for_each_entry(sibling, &group_leader->sibling_list, |
6379 | group_entry) { | 6414 | group_entry) { |
6380 | perf_install_in_context(ctx, sibling, cpu); | 6415 | perf_install_in_context(ctx, sibling, event->cpu); |
6381 | get_ctx(ctx); | 6416 | get_ctx(ctx); |
6382 | } | 6417 | } |
6383 | } | 6418 | } |
6384 | 6419 | ||
6385 | perf_install_in_context(ctx, event, cpu); | 6420 | perf_install_in_context(ctx, event, event->cpu); |
6386 | ++ctx->generation; | 6421 | ++ctx->generation; |
6387 | perf_unpin_context(ctx); | 6422 | perf_unpin_context(ctx); |
6388 | mutex_unlock(&ctx->mutex); | 6423 | mutex_unlock(&ctx->mutex); |
6389 | 6424 | ||
6425 | put_online_cpus(); | ||
6426 | |||
6390 | event->owner = current; | 6427 | event->owner = current; |
6391 | 6428 | ||
6392 | mutex_lock(¤t->perf_event_mutex); | 6429 | mutex_lock(¤t->perf_event_mutex); |
@@ -6415,6 +6452,7 @@ err_context: | |||
6415 | err_alloc: | 6452 | err_alloc: |
6416 | free_event(event); | 6453 | free_event(event); |
6417 | err_task: | 6454 | err_task: |
6455 | put_online_cpus(); | ||
6418 | if (task) | 6456 | if (task) |
6419 | put_task_struct(task); | 6457 | put_task_struct(task); |
6420 | err_group_fd: | 6458 | err_group_fd: |
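Editor's note: perf_event_open() now brackets event allocation, context lookup and installation with get_online_cpus()/put_online_cpus(), so the CPU resolved for the event cannot be hot-unplugged mid-setup; note the matching put added on the err_task path. A rough userspace analogue of the bracket, with a pthread read-write lock standing in for the hotplug lock (setup_on_cpu and the failure condition are invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* The "hotplug" side would take the lock exclusively while changing the
     * set of CPUs; setup paths take it shared for the whole multi-step job. */
    static int setup_on_cpu(int cpu)
    {
        int err = 0;

        pthread_rwlock_rdlock(&hotplug_lock);    /* get_online_cpus() analogue */

        if (cpu < 0) {                           /* stand-in for a setup failure */
            err = -1;
            goto out;                            /* error path must still unlock */
        }
        printf("installed on cpu %d\n", cpu);
    out:
        pthread_rwlock_unlock(&hotplug_lock);    /* put_online_cpus() analogue */
        return err;
    }

    int main(void)
    {
        setup_on_cpu(1);
        setup_on_cpu(-1);
        return 0;
    }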
@@ -6475,6 +6513,39 @@ err: | |||
6475 | } | 6513 | } |
6476 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 6514 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
6477 | 6515 | ||
6516 | void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | ||
6517 | { | ||
6518 | struct perf_event_context *src_ctx; | ||
6519 | struct perf_event_context *dst_ctx; | ||
6520 | struct perf_event *event, *tmp; | ||
6521 | LIST_HEAD(events); | ||
6522 | |||
6523 | src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; | ||
6524 | dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; | ||
6525 | |||
6526 | mutex_lock(&src_ctx->mutex); | ||
6527 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | ||
6528 | event_entry) { | ||
6529 | perf_remove_from_context(event); | ||
6530 | put_ctx(src_ctx); | ||
6531 | list_add(&event->event_entry, &events); | ||
6532 | } | ||
6533 | mutex_unlock(&src_ctx->mutex); | ||
6534 | |||
6535 | synchronize_rcu(); | ||
6536 | |||
6537 | mutex_lock(&dst_ctx->mutex); | ||
6538 | list_for_each_entry_safe(event, tmp, &events, event_entry) { | ||
6539 | list_del(&event->event_entry); | ||
6540 | if (event->state >= PERF_EVENT_STATE_OFF) | ||
6541 | event->state = PERF_EVENT_STATE_INACTIVE; | ||
6542 | perf_install_in_context(dst_ctx, event, dst_cpu); | ||
6543 | get_ctx(dst_ctx); | ||
6544 | } | ||
6545 | mutex_unlock(&dst_ctx->mutex); | ||
6546 | } | ||
6547 | EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); | ||
6548 | |||
6478 | static void sync_child_event(struct perf_event *child_event, | 6549 | static void sync_child_event(struct perf_event *child_event, |
6479 | struct task_struct *child) | 6550 | struct task_struct *child) |
6480 | { | 6551 | { |
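Editor's note: the newly exported perf_pmu_migrate_context() moves every event from one CPU context to another in two phases: detach everything from the source under its mutex, let an RCU grace period pass, then install the detached events in the destination under its mutex. A small userspace sketch of the same two-phase list migration with pthread mutexes (the grace period is only noted as a comment; struct node/ctx are invented):

    #include <pthread.h>
    #include <stdio.h>

    struct node { int id; struct node *next; };

    struct ctx {
        pthread_mutex_t lock;
        struct node *events;
    };

    static void migrate(struct ctx *src, struct ctx *dst)
    {
        struct node *moved;

        /* Phase 1: detach everything from the source context. */
        pthread_mutex_lock(&src->lock);
        moved = src->events;
        src->events = NULL;
        pthread_mutex_unlock(&src->lock);

        /* The kernel waits for an RCU grace period here (synchronize_rcu())
         * so concurrent readers of the old list are done before reinsertion. */

        /* Phase 2: install the detached events in the destination. */
        pthread_mutex_lock(&dst->lock);
        for (struct node *n = moved; n; ) {
            struct node *next = n->next;
            n->next = dst->events;
            dst->events = n;
            n = next;
        }
        pthread_mutex_unlock(&dst->lock);
    }

    int main(void)
    {
        struct node b = { 2, NULL }, a = { 1, &b };
        struct ctx src = { PTHREAD_MUTEX_INITIALIZER, &a };
        struct ctx dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

        migrate(&src, &dst);
        for (struct node *n = dst.events; n; n = n->next)
            printf("event %d now on dst\n", n->id);
        return 0;
    }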
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b0b107f90afc..a096c19f2c2a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, | |||
101 | } | 101 | } |
102 | 102 | ||
103 | /* Callchain handling */ | 103 | /* Callchain handling */ |
104 | extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); | 104 | extern struct perf_callchain_entry * |
105 | perf_callchain(struct perf_event *event, struct pt_regs *regs); | ||
105 | extern int get_callchain_buffers(void); | 106 | extern int get_callchain_buffers(void); |
106 | extern void put_callchain_buffers(void); | 107 | extern void put_callchain_buffers(void); |
107 | 108 | ||
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 985be4d80fe8..c08a22d02f72 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -32,19 +32,36 @@ | |||
32 | #include <linux/swap.h> /* try_to_free_swap */ | 32 | #include <linux/swap.h> /* try_to_free_swap */ |
33 | #include <linux/ptrace.h> /* user_enable_single_step */ | 33 | #include <linux/ptrace.h> /* user_enable_single_step */ |
34 | #include <linux/kdebug.h> /* notifier mechanism */ | 34 | #include <linux/kdebug.h> /* notifier mechanism */ |
35 | #include "../../mm/internal.h" /* munlock_vma_page */ | ||
35 | 36 | ||
36 | #include <linux/uprobes.h> | 37 | #include <linux/uprobes.h> |
37 | 38 | ||
38 | #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) | 39 | #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) |
39 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE | 40 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE |
40 | 41 | ||
41 | static struct srcu_struct uprobes_srcu; | ||
42 | static struct rb_root uprobes_tree = RB_ROOT; | 42 | static struct rb_root uprobes_tree = RB_ROOT; |
43 | 43 | ||
44 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ | 44 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ |
45 | 45 | ||
46 | #define UPROBES_HASH_SZ 13 | 46 | #define UPROBES_HASH_SZ 13 |
47 | 47 | ||
48 | /* | ||
49 | * We need separate register/unregister and mmap/munmap lock hashes because | ||
50 | * of mmap_sem nesting. | ||
51 | * | ||
52 | * uprobe_register() needs to install probes on (potentially) all processes | ||
53 | * and thus needs to acquire multiple mmap_sems (consecutively, not | ||
54 | * concurrently), whereas uprobe_mmap() is called while holding mmap_sem | ||
55 | * for the particular process doing the mmap. | ||
56 | * | ||
57 | * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem | ||
58 | * because of lock order against i_mmap_mutex. This means there's a hole in | ||
59 | * the register vma iteration where a mmap() can happen. | ||
60 | * | ||
61 | * Thus uprobe_register() can race with uprobe_mmap() and we can try and | ||
62 | * install a probe where one is already installed. | ||
63 | */ | ||
64 | |||
48 | /* serialize (un)register */ | 65 | /* serialize (un)register */ |
49 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | 66 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; |
50 | 67 | ||
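Editor's note: the new comment explains why register/unregister and mmap/munmap use separate mutex hashes and why the two paths can legitimately race. Both sides pick their mutex by hashing the inode into a small fixed-size table, so unrelated inodes rarely serialize against each other while operations on the same inode always do. A minimal sketch of such a bucketed lock table (the size and helper names are illustrative):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    #define HASH_SZ 13

    static pthread_mutex_t bucket_lock[HASH_SZ];

    /* Choose a mutex for an object by hashing its identity. */
    static pthread_mutex_t *lock_for(uintptr_t key)
    {
        return &bucket_lock[key % HASH_SZ];
    }

    int main(void)
    {
        for (int i = 0; i < HASH_SZ; i++)
            pthread_mutex_init(&bucket_lock[i], NULL);

        uintptr_t inode_a = 0x1000, inode_b = 0x2001;

        pthread_mutex_lock(lock_for(inode_a));
        printf("holding bucket %lu for inode_a\n",
               (unsigned long)(inode_a % HASH_SZ));
        pthread_mutex_unlock(lock_for(inode_a));

        pthread_mutex_lock(lock_for(inode_b));
        printf("holding bucket %lu for inode_b\n",
               (unsigned long)(inode_b % HASH_SZ));
        pthread_mutex_unlock(lock_for(inode_b));
        return 0;
    }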
@@ -61,17 +78,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | |||
61 | */ | 78 | */ |
62 | static atomic_t uprobe_events = ATOMIC_INIT(0); | 79 | static atomic_t uprobe_events = ATOMIC_INIT(0); |
63 | 80 | ||
64 | /* | ||
65 | * Maintain a temporary per vma info that can be used to search if a vma | ||
66 | * has already been handled. This structure is introduced since extending | ||
67 | * vm_area_struct wasnt recommended. | ||
68 | */ | ||
69 | struct vma_info { | ||
70 | struct list_head probe_list; | ||
71 | struct mm_struct *mm; | ||
72 | loff_t vaddr; | ||
73 | }; | ||
74 | |||
75 | struct uprobe { | 81 | struct uprobe { |
76 | struct rb_node rb_node; /* node in the rb tree */ | 82 | struct rb_node rb_node; /* node in the rb tree */ |
77 | atomic_t ref; | 83 | atomic_t ref; |
@@ -100,20 +106,21 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) | |||
100 | if (!is_register) | 106 | if (!is_register) |
101 | return true; | 107 | return true; |
102 | 108 | ||
103 | if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) | 109 | if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) |
110 | == (VM_READ|VM_EXEC)) | ||
104 | return true; | 111 | return true; |
105 | 112 | ||
106 | return false; | 113 | return false; |
107 | } | 114 | } |
108 | 115 | ||
109 | static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) | 116 | static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) |
110 | { | 117 | { |
111 | loff_t vaddr; | 118 | return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
112 | 119 | } | |
113 | vaddr = vma->vm_start + offset; | ||
114 | vaddr -= vma->vm_pgoff << PAGE_SHIFT; | ||
115 | 120 | ||
116 | return vaddr; | 121 | static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) |
122 | { | ||
123 | return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); | ||
117 | } | 124 | } |
118 | 125 | ||
119 | /** | 126 | /** |
@@ -121,41 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) | |||
121 | * based on replace_page in mm/ksm.c | 128 | * based on replace_page in mm/ksm.c |
122 | * | 129 | * |
123 | * @vma: vma that holds the pte pointing to page | 130 | * @vma: vma that holds the pte pointing to page |
131 | * @addr: address the old @page is mapped at | ||
124 | * @page: the cowed page we are replacing by kpage | 132 | * @page: the cowed page we are replacing by kpage |
125 | * @kpage: the modified page we replace page by | 133 | * @kpage: the modified page we replace page by |
126 | * | 134 | * |
127 | * Returns 0 on success, -EFAULT on failure. | 135 | * Returns 0 on success, -EFAULT on failure. |
128 | */ | 136 | */ |
129 | static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) | 137 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, |
138 | struct page *page, struct page *kpage) | ||
130 | { | 139 | { |
131 | struct mm_struct *mm = vma->vm_mm; | 140 | struct mm_struct *mm = vma->vm_mm; |
132 | pgd_t *pgd; | ||
133 | pud_t *pud; | ||
134 | pmd_t *pmd; | ||
135 | pte_t *ptep; | ||
136 | spinlock_t *ptl; | 141 | spinlock_t *ptl; |
137 | unsigned long addr; | 142 | pte_t *ptep; |
138 | int err = -EFAULT; | 143 | int err; |
139 | |||
140 | addr = page_address_in_vma(page, vma); | ||
141 | if (addr == -EFAULT) | ||
142 | goto out; | ||
143 | |||
144 | pgd = pgd_offset(mm, addr); | ||
145 | if (!pgd_present(*pgd)) | ||
146 | goto out; | ||
147 | |||
148 | pud = pud_offset(pgd, addr); | ||
149 | if (!pud_present(*pud)) | ||
150 | goto out; | ||
151 | 144 | ||
152 | pmd = pmd_offset(pud, addr); | 145 | /* For try_to_free_swap() and munlock_vma_page() below */ |
153 | if (!pmd_present(*pmd)) | 146 | lock_page(page); |
154 | goto out; | ||
155 | 147 | ||
156 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | 148 | err = -EAGAIN; |
149 | ptep = page_check_address(page, mm, addr, &ptl, 0); | ||
157 | if (!ptep) | 150 | if (!ptep) |
158 | goto out; | 151 | goto unlock; |
159 | 152 | ||
160 | get_page(kpage); | 153 | get_page(kpage); |
161 | page_add_new_anon_rmap(kpage, vma, addr); | 154 | page_add_new_anon_rmap(kpage, vma, addr); |
@@ -172,11 +165,15 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct | |||
172 | page_remove_rmap(page); | 165 | page_remove_rmap(page); |
173 | if (!page_mapped(page)) | 166 | if (!page_mapped(page)) |
174 | try_to_free_swap(page); | 167 | try_to_free_swap(page); |
175 | put_page(page); | ||
176 | pte_unmap_unlock(ptep, ptl); | 168 | pte_unmap_unlock(ptep, ptl); |
177 | err = 0; | ||
178 | 169 | ||
179 | out: | 170 | if (vma->vm_flags & VM_LOCKED) |
171 | munlock_vma_page(page); | ||
172 | put_page(page); | ||
173 | |||
174 | err = 0; | ||
175 | unlock: | ||
176 | unlock_page(page); | ||
180 | return err; | 177 | return err; |
181 | } | 178 | } |
182 | 179 | ||
@@ -218,79 +215,46 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | |||
218 | unsigned long vaddr, uprobe_opcode_t opcode) | 215 | unsigned long vaddr, uprobe_opcode_t opcode) |
219 | { | 216 | { |
220 | struct page *old_page, *new_page; | 217 | struct page *old_page, *new_page; |
221 | struct address_space *mapping; | ||
222 | void *vaddr_old, *vaddr_new; | 218 | void *vaddr_old, *vaddr_new; |
223 | struct vm_area_struct *vma; | 219 | struct vm_area_struct *vma; |
224 | struct uprobe *uprobe; | ||
225 | loff_t addr; | ||
226 | int ret; | 220 | int ret; |
227 | 221 | ||
222 | retry: | ||
228 | /* Read the page with vaddr into memory */ | 223 | /* Read the page with vaddr into memory */ |
229 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); | 224 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); |
230 | if (ret <= 0) | 225 | if (ret <= 0) |
231 | return ret; | 226 | return ret; |
232 | 227 | ||
233 | ret = -EINVAL; | ||
234 | |||
235 | /* | ||
236 | * We are interested in text pages only. Our pages of interest | ||
237 | * should be mapped for read and execute only. We desist from | ||
238 | * adding probes in write mapped pages since the breakpoints | ||
239 | * might end up in the file copy. | ||
240 | */ | ||
241 | if (!valid_vma(vma, is_swbp_insn(&opcode))) | ||
242 | goto put_out; | ||
243 | |||
244 | uprobe = container_of(auprobe, struct uprobe, arch); | ||
245 | mapping = uprobe->inode->i_mapping; | ||
246 | if (mapping != vma->vm_file->f_mapping) | ||
247 | goto put_out; | ||
248 | |||
249 | addr = vma_address(vma, uprobe->offset); | ||
250 | if (vaddr != (unsigned long)addr) | ||
251 | goto put_out; | ||
252 | |||
253 | ret = -ENOMEM; | 228 | ret = -ENOMEM; |
254 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); | 229 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); |
255 | if (!new_page) | 230 | if (!new_page) |
256 | goto put_out; | 231 | goto put_old; |
257 | 232 | ||
258 | __SetPageUptodate(new_page); | 233 | __SetPageUptodate(new_page); |
259 | 234 | ||
260 | /* | ||
261 | * lock page will serialize against do_wp_page()'s | ||
262 | * PageAnon() handling | ||
263 | */ | ||
264 | lock_page(old_page); | ||
265 | /* copy the page now that we've got it stable */ | 235 | /* copy the page now that we've got it stable */ |
266 | vaddr_old = kmap_atomic(old_page); | 236 | vaddr_old = kmap_atomic(old_page); |
267 | vaddr_new = kmap_atomic(new_page); | 237 | vaddr_new = kmap_atomic(new_page); |
268 | 238 | ||
269 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); | 239 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); |
270 | 240 | memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); | |
271 | /* poke the new insn in, ASSUMES we don't cross page boundary */ | ||
272 | vaddr &= ~PAGE_MASK; | ||
273 | BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
274 | memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | ||
275 | 241 | ||
276 | kunmap_atomic(vaddr_new); | 242 | kunmap_atomic(vaddr_new); |
277 | kunmap_atomic(vaddr_old); | 243 | kunmap_atomic(vaddr_old); |
278 | 244 | ||
279 | ret = anon_vma_prepare(vma); | 245 | ret = anon_vma_prepare(vma); |
280 | if (ret) | 246 | if (ret) |
281 | goto unlock_out; | 247 | goto put_new; |
282 | 248 | ||
283 | lock_page(new_page); | 249 | ret = __replace_page(vma, vaddr, old_page, new_page); |
284 | ret = __replace_page(vma, old_page, new_page); | ||
285 | unlock_page(new_page); | ||
286 | 250 | ||
287 | unlock_out: | 251 | put_new: |
288 | unlock_page(old_page); | ||
289 | page_cache_release(new_page); | 252 | page_cache_release(new_page); |
290 | 253 | put_old: | |
291 | put_out: | ||
292 | put_page(old_page); | 254 | put_page(old_page); |
293 | 255 | ||
256 | if (unlikely(ret == -EAGAIN)) | ||
257 | goto retry; | ||
294 | return ret; | 258 | return ret; |
295 | } | 259 | } |
296 | 260 | ||
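Editor's note: write_opcode() no longer re-validates the vma itself; it simply attempts the page replacement and, when __replace_page() returns -EAGAIN because the PTE check found the page had moved underneath it, drops its references and retries from the top. A generic userspace sketch of that retry-on-EAGAIN shape, with the racy step simulated:

    #include <errno.h>
    #include <stdio.h>

    /* Simulated racy step: fails with -EAGAIN the first two times. */
    static int replace_page_once(int *attempts)
    {
        if (++(*attempts) < 3)
            return -EAGAIN;
        return 0;
    }

    static int write_opcode_sketch(void)
    {
        int attempts = 0;
        int ret;

    retry:
        /* ... take page references, build the modified copy ... */
        ret = replace_page_once(&attempts);
        /* ... drop references regardless of the outcome ... */

        if (ret == -EAGAIN)
            goto retry;       /* the snapshot went stale, start over */
        return ret;
    }

    int main(void)
    {
        printf("write_opcode_sketch() = %d\n", write_opcode_sketch());
        return 0;
    }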
@@ -312,7 +276,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ | |||
312 | void *vaddr_new; | 276 | void *vaddr_new; |
313 | int ret; | 277 | int ret; |
314 | 278 | ||
315 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); | 279 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); |
316 | if (ret <= 0) | 280 | if (ret <= 0) |
317 | return ret; | 281 | return ret; |
318 | 282 | ||
@@ -333,10 +297,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
333 | uprobe_opcode_t opcode; | 297 | uprobe_opcode_t opcode; |
334 | int result; | 298 | int result; |
335 | 299 | ||
300 | if (current->mm == mm) { | ||
301 | pagefault_disable(); | ||
302 | result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, | ||
303 | sizeof(opcode)); | ||
304 | pagefault_enable(); | ||
305 | |||
306 | if (likely(result == 0)) | ||
307 | goto out; | ||
308 | } | ||
309 | |||
336 | result = read_opcode(mm, vaddr, &opcode); | 310 | result = read_opcode(mm, vaddr, &opcode); |
337 | if (result) | 311 | if (result) |
338 | return result; | 312 | return result; |
339 | 313 | out: | |
340 | if (is_swbp_insn(&opcode)) | 314 | if (is_swbp_insn(&opcode)) |
341 | return 1; | 315 | return 1; |
342 | 316 | ||
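Editor's note: when the mm being probed is the caller's own, the opcode is now fetched with a cheap non-faulting inline copy; only if that fails, or the mm belongs to another process, does the code fall back to the slow get_user_pages()-based read_opcode(). A loose userspace analogue of the fast/slow split: read our own memory directly, and use process_vm_readv() for another process (error handling abbreviated; the remote path additionally needs ptrace permission):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/uio.h>
    #include <unistd.h>

    /* Read sizeof(int) bytes at addr inside process pid, preferring the
     * cheap direct copy when pid is the caller itself. */
    static int read_word(pid_t pid, void *addr, int *out)
    {
        if (pid == getpid()) {
            memcpy(out, addr, sizeof(*out));   /* fast path: our own memory */
            return 0;
        }

        /* Slow path: ask the kernel to copy from the other address space. */
        struct iovec local  = { out, sizeof(*out) };
        struct iovec remote = { addr, sizeof(*out) };
        ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);

        return n == (ssize_t)sizeof(*out) ? 0 : -1;
    }

    int main(void)
    {
        int opcode = 0xCC, copy = 0;

        read_word(getpid(), &opcode, &copy);
        printf("read opcode 0x%x\n", copy);
        return 0;
    }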
@@ -355,7 +329,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
355 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 329 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
356 | { | 330 | { |
357 | int result; | 331 | int result; |
358 | 332 | /* | |
333 | * See the comment near uprobes_hash(). | ||
334 | */ | ||
359 | result = is_swbp_at_addr(mm, vaddr); | 335 | result = is_swbp_at_addr(mm, vaddr); |
360 | if (result == 1) | 336 | if (result == 1) |
361 | return -EEXIST; | 337 | return -EEXIST; |
@@ -520,7 +496,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | |||
520 | uprobe->inode = igrab(inode); | 496 | uprobe->inode = igrab(inode); |
521 | uprobe->offset = offset; | 497 | uprobe->offset = offset; |
522 | init_rwsem(&uprobe->consumer_rwsem); | 498 | init_rwsem(&uprobe->consumer_rwsem); |
523 | INIT_LIST_HEAD(&uprobe->pending_list); | ||
524 | 499 | ||
525 | /* add to uprobes_tree, sorted on inode:offset */ | 500 | /* add to uprobes_tree, sorted on inode:offset */ |
526 | cur_uprobe = insert_uprobe(uprobe); | 501 | cur_uprobe = insert_uprobe(uprobe); |
@@ -588,20 +563,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | |||
588 | } | 563 | } |
589 | 564 | ||
590 | static int | 565 | static int |
591 | __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, | 566 | __copy_insn(struct address_space *mapping, struct file *filp, char *insn, |
592 | unsigned long nbytes, unsigned long offset) | 567 | unsigned long nbytes, loff_t offset) |
593 | { | 568 | { |
594 | struct file *filp = vma->vm_file; | ||
595 | struct page *page; | 569 | struct page *page; |
596 | void *vaddr; | 570 | void *vaddr; |
597 | unsigned long off1; | 571 | unsigned long off; |
598 | unsigned long idx; | 572 | pgoff_t idx; |
599 | 573 | ||
600 | if (!filp) | 574 | if (!filp) |
601 | return -EINVAL; | 575 | return -EINVAL; |
602 | 576 | ||
603 | idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); | 577 | if (!mapping->a_ops->readpage) |
604 | off1 = offset &= ~PAGE_MASK; | 578 | return -EIO; |
579 | |||
580 | idx = offset >> PAGE_CACHE_SHIFT; | ||
581 | off = offset & ~PAGE_MASK; | ||
605 | 582 | ||
606 | /* | 583 | /* |
607 | * Ensure that the page that has the original instruction is | 584 | * Ensure that the page that has the original instruction is |
@@ -612,22 +589,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins | |||
612 | return PTR_ERR(page); | 589 | return PTR_ERR(page); |
613 | 590 | ||
614 | vaddr = kmap_atomic(page); | 591 | vaddr = kmap_atomic(page); |
615 | memcpy(insn, vaddr + off1, nbytes); | 592 | memcpy(insn, vaddr + off, nbytes); |
616 | kunmap_atomic(vaddr); | 593 | kunmap_atomic(vaddr); |
617 | page_cache_release(page); | 594 | page_cache_release(page); |
618 | 595 | ||
619 | return 0; | 596 | return 0; |
620 | } | 597 | } |
621 | 598 | ||
622 | static int | 599 | static int copy_insn(struct uprobe *uprobe, struct file *filp) |
623 | copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | ||
624 | { | 600 | { |
625 | struct address_space *mapping; | 601 | struct address_space *mapping; |
626 | unsigned long nbytes; | 602 | unsigned long nbytes; |
627 | int bytes; | 603 | int bytes; |
628 | 604 | ||
629 | addr &= ~PAGE_MASK; | 605 | nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); |
630 | nbytes = PAGE_SIZE - addr; | ||
631 | mapping = uprobe->inode->i_mapping; | 606 | mapping = uprobe->inode->i_mapping; |
632 | 607 | ||
633 | /* Instruction at end of binary; copy only available bytes */ | 608 | /* Instruction at end of binary; copy only available bytes */ |
@@ -638,13 +613,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | |||
638 | 613 | ||
639 | /* Instruction at the page-boundary; copy bytes in second page */ | 614 | /* Instruction at the page-boundary; copy bytes in second page */ |
640 | if (nbytes < bytes) { | 615 | if (nbytes < bytes) { |
641 | if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, | 616 | int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, |
642 | bytes - nbytes, uprobe->offset + nbytes)) | 617 | bytes - nbytes, uprobe->offset + nbytes); |
643 | return -ENOMEM; | 618 | if (err) |
644 | 619 | return err; | |
645 | bytes = nbytes; | 620 | bytes = nbytes; |
646 | } | 621 | } |
647 | return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); | 622 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); |
648 | } | 623 | } |
649 | 624 | ||
650 | /* | 625 | /* |
@@ -672,9 +647,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | |||
672 | */ | 647 | */ |
673 | static int | 648 | static int |
674 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | 649 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, |
675 | struct vm_area_struct *vma, loff_t vaddr) | 650 | struct vm_area_struct *vma, unsigned long vaddr) |
676 | { | 651 | { |
677 | unsigned long addr; | ||
678 | int ret; | 652 | int ret; |
679 | 653 | ||
680 | /* | 654 | /* |
@@ -687,20 +661,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
687 | if (!uprobe->consumers) | 661 | if (!uprobe->consumers) |
688 | return -EEXIST; | 662 | return -EEXIST; |
689 | 663 | ||
690 | addr = (unsigned long)vaddr; | ||
691 | |||
692 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { | 664 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { |
693 | ret = copy_insn(uprobe, vma, addr); | 665 | ret = copy_insn(uprobe, vma->vm_file); |
694 | if (ret) | 666 | if (ret) |
695 | return ret; | 667 | return ret; |
696 | 668 | ||
697 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) | 669 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) |
698 | return -EEXIST; | 670 | return -ENOTSUPP; |
699 | 671 | ||
700 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); | 672 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); |
701 | if (ret) | 673 | if (ret) |
702 | return ret; | 674 | return ret; |
703 | 675 | ||
676 | /* write_opcode() assumes we don't cross page boundary */ | ||
677 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | ||
678 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
679 | |||
704 | uprobe->flags |= UPROBE_COPY_INSN; | 680 | uprobe->flags |= UPROBE_COPY_INSN; |
705 | } | 681 | } |
706 | 682 | ||
@@ -713,7 +689,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
713 | * Hence increment before and decrement on failure. | 689 | * Hence increment before and decrement on failure. |
714 | */ | 690 | */ |
715 | atomic_inc(&mm->uprobes_state.count); | 691 | atomic_inc(&mm->uprobes_state.count); |
716 | ret = set_swbp(&uprobe->arch, mm, addr); | 692 | ret = set_swbp(&uprobe->arch, mm, vaddr); |
717 | if (ret) | 693 | if (ret) |
718 | atomic_dec(&mm->uprobes_state.count); | 694 | atomic_dec(&mm->uprobes_state.count); |
719 | 695 | ||
@@ -721,27 +697,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
721 | } | 697 | } |
722 | 698 | ||
723 | static void | 699 | static void |
724 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) | 700 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) |
725 | { | 701 | { |
726 | if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) | 702 | if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) |
727 | atomic_dec(&mm->uprobes_state.count); | 703 | atomic_dec(&mm->uprobes_state.count); |
728 | } | 704 | } |
729 | 705 | ||
730 | /* | 706 | /* |
731 | * There could be threads that have hit the breakpoint and are entering the | 707 | * There could be threads that have already hit the breakpoint. They |
732 | * notifier code and trying to acquire the uprobes_treelock. The thread | 708 | * will recheck the current insn and restart if find_uprobe() fails. |
733 | * calling delete_uprobe() that is removing the uprobe from the rb_tree can | 709 | * See find_active_uprobe(). |
734 | * race with these threads and might acquire the uprobes_treelock compared | ||
735 | * to some of the breakpoint hit threads. In such a case, the breakpoint | ||
736 | * hit threads will not find the uprobe. The current unregistering thread | ||
737 | * waits till all other threads have hit a breakpoint, to acquire the | ||
738 | * uprobes_treelock before the uprobe is removed from the rbtree. | ||
739 | */ | 710 | */ |
740 | static void delete_uprobe(struct uprobe *uprobe) | 711 | static void delete_uprobe(struct uprobe *uprobe) |
741 | { | 712 | { |
742 | unsigned long flags; | 713 | unsigned long flags; |
743 | 714 | ||
744 | synchronize_srcu(&uprobes_srcu); | ||
745 | spin_lock_irqsave(&uprobes_treelock, flags); | 715 | spin_lock_irqsave(&uprobes_treelock, flags); |
746 | rb_erase(&uprobe->rb_node, &uprobes_tree); | 716 | rb_erase(&uprobe->rb_node, &uprobes_tree); |
747 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 717 | spin_unlock_irqrestore(&uprobes_treelock, flags); |
@@ -750,139 +720,136 @@ static void delete_uprobe(struct uprobe *uprobe) | |||
750 | atomic_dec(&uprobe_events); | 720 | atomic_dec(&uprobe_events); |
751 | } | 721 | } |
752 | 722 | ||
753 | static struct vma_info * | 723 | struct map_info { |
754 | __find_next_vma_info(struct address_space *mapping, struct list_head *head, | 724 | struct map_info *next; |
755 | struct vma_info *vi, loff_t offset, bool is_register) | 725 | struct mm_struct *mm; |
726 | unsigned long vaddr; | ||
727 | }; | ||
728 | |||
729 | static inline struct map_info *free_map_info(struct map_info *info) | ||
730 | { | ||
731 | struct map_info *next = info->next; | ||
732 | kfree(info); | ||
733 | return next; | ||
734 | } | ||
735 | |||
736 | static struct map_info * | ||
737 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | ||
756 | { | 738 | { |
739 | unsigned long pgoff = offset >> PAGE_SHIFT; | ||
757 | struct prio_tree_iter iter; | 740 | struct prio_tree_iter iter; |
758 | struct vm_area_struct *vma; | 741 | struct vm_area_struct *vma; |
759 | struct vma_info *tmpvi; | 742 | struct map_info *curr = NULL; |
760 | unsigned long pgoff; | 743 | struct map_info *prev = NULL; |
761 | int existing_vma; | 744 | struct map_info *info; |
762 | loff_t vaddr; | 745 | int more = 0; |
763 | |||
764 | pgoff = offset >> PAGE_SHIFT; | ||
765 | 746 | ||
747 | again: | ||
748 | mutex_lock(&mapping->i_mmap_mutex); | ||
766 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 749 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
767 | if (!valid_vma(vma, is_register)) | 750 | if (!valid_vma(vma, is_register)) |
768 | continue; | 751 | continue; |
769 | 752 | ||
770 | existing_vma = 0; | 753 | if (!prev && !more) { |
771 | vaddr = vma_address(vma, offset); | 754 | /* |
772 | 755 | * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through | |
773 | list_for_each_entry(tmpvi, head, probe_list) { | 756 | * reclaim. This is optimistic, no harm done if it fails. |
774 | if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { | 757 | */ |
775 | existing_vma = 1; | 758 | prev = kmalloc(sizeof(struct map_info), |
776 | break; | 759 | GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); |
777 | } | 760 | if (prev) |
761 | prev->next = NULL; | ||
778 | } | 762 | } |
779 | 763 | if (!prev) { | |
780 | /* | 764 | more++; |
781 | * Another vma needs a probe to be installed. However skip | 765 | continue; |
782 | * installing the probe if the vma is about to be unlinked. | ||
783 | */ | ||
784 | if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { | ||
785 | vi->mm = vma->vm_mm; | ||
786 | vi->vaddr = vaddr; | ||
787 | list_add(&vi->probe_list, head); | ||
788 | |||
789 | return vi; | ||
790 | } | 766 | } |
791 | } | ||
792 | |||
793 | return NULL; | ||
794 | } | ||
795 | 767 | ||
796 | /* | 768 | if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) |
797 | * Iterate in the rmap prio tree and find a vma where a probe has not | 769 | continue; |
798 | * yet been inserted. | ||
799 | */ | ||
800 | static struct vma_info * | ||
801 | find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
802 | loff_t offset, bool is_register) | ||
803 | { | ||
804 | struct vma_info *vi, *retvi; | ||
805 | 770 | ||
806 | vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); | 771 | info = prev; |
807 | if (!vi) | 772 | prev = prev->next; |
808 | return ERR_PTR(-ENOMEM); | 773 | info->next = curr; |
774 | curr = info; | ||
809 | 775 | ||
810 | mutex_lock(&mapping->i_mmap_mutex); | 776 | info->mm = vma->vm_mm; |
811 | retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); | 777 | info->vaddr = offset_to_vaddr(vma, offset); |
778 | } | ||
812 | mutex_unlock(&mapping->i_mmap_mutex); | 779 | mutex_unlock(&mapping->i_mmap_mutex); |
813 | 780 | ||
814 | if (!retvi) | 781 | if (!more) |
815 | kfree(vi); | 782 | goto out; |
783 | |||
784 | prev = curr; | ||
785 | while (curr) { | ||
786 | mmput(curr->mm); | ||
787 | curr = curr->next; | ||
788 | } | ||
816 | 789 | ||
817 | return retvi; | 790 | do { |
791 | info = kmalloc(sizeof(struct map_info), GFP_KERNEL); | ||
792 | if (!info) { | ||
793 | curr = ERR_PTR(-ENOMEM); | ||
794 | goto out; | ||
795 | } | ||
796 | info->next = prev; | ||
797 | prev = info; | ||
798 | } while (--more); | ||
799 | |||
800 | goto again; | ||
801 | out: | ||
802 | while (prev) | ||
803 | prev = free_map_info(prev); | ||
804 | return curr; | ||
818 | } | 805 | } |
819 | 806 | ||
820 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | 807 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) |
821 | { | 808 | { |
822 | struct list_head try_list; | 809 | struct map_info *info; |
823 | struct vm_area_struct *vma; | 810 | int err = 0; |
824 | struct address_space *mapping; | ||
825 | struct vma_info *vi, *tmpvi; | ||
826 | struct mm_struct *mm; | ||
827 | loff_t vaddr; | ||
828 | int ret; | ||
829 | |||
830 | mapping = uprobe->inode->i_mapping; | ||
831 | INIT_LIST_HEAD(&try_list); | ||
832 | 811 | ||
833 | ret = 0; | 812 | info = build_map_info(uprobe->inode->i_mapping, |
813 | uprobe->offset, is_register); | ||
814 | if (IS_ERR(info)) | ||
815 | return PTR_ERR(info); | ||
834 | 816 | ||
835 | for (;;) { | 817 | while (info) { |
836 | vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); | 818 | struct mm_struct *mm = info->mm; |
837 | if (!vi) | 819 | struct vm_area_struct *vma; |
838 | break; | ||
839 | 820 | ||
840 | if (IS_ERR(vi)) { | 821 | if (err) |
841 | ret = PTR_ERR(vi); | 822 | goto free; |
842 | break; | ||
843 | } | ||
844 | 823 | ||
845 | mm = vi->mm; | 824 | down_write(&mm->mmap_sem); |
846 | down_read(&mm->mmap_sem); | 825 | vma = find_vma(mm, info->vaddr); |
847 | vma = find_vma(mm, (unsigned long)vi->vaddr); | 826 | if (!vma || !valid_vma(vma, is_register) || |
848 | if (!vma || !valid_vma(vma, is_register)) { | 827 | vma->vm_file->f_mapping->host != uprobe->inode) |
849 | list_del(&vi->probe_list); | 828 | goto unlock; |
850 | kfree(vi); | ||
851 | up_read(&mm->mmap_sem); | ||
852 | mmput(mm); | ||
853 | continue; | ||
854 | } | ||
855 | vaddr = vma_address(vma, uprobe->offset); | ||
856 | if (vma->vm_file->f_mapping->host != uprobe->inode || | ||
857 | vaddr != vi->vaddr) { | ||
858 | list_del(&vi->probe_list); | ||
859 | kfree(vi); | ||
860 | up_read(&mm->mmap_sem); | ||
861 | mmput(mm); | ||
862 | continue; | ||
863 | } | ||
864 | 829 | ||
865 | if (is_register) | 830 | if (vma->vm_start > info->vaddr || |
866 | ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); | 831 | vaddr_to_offset(vma, info->vaddr) != uprobe->offset) |
867 | else | 832 | goto unlock; |
868 | remove_breakpoint(uprobe, mm, vi->vaddr); | ||
869 | 833 | ||
870 | up_read(&mm->mmap_sem); | ||
871 | mmput(mm); | ||
872 | if (is_register) { | 834 | if (is_register) { |
873 | if (ret && ret == -EEXIST) | 835 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); |
874 | ret = 0; | 836 | /* |
875 | if (ret) | 837 | * We can race against uprobe_mmap(), see the |
876 | break; | 838 | * comment near uprobe_hash(). |
839 | */ | ||
840 | if (err == -EEXIST) | ||
841 | err = 0; | ||
842 | } else { | ||
843 | remove_breakpoint(uprobe, mm, info->vaddr); | ||
877 | } | 844 | } |
845 | unlock: | ||
846 | up_write(&mm->mmap_sem); | ||
847 | free: | ||
848 | mmput(mm); | ||
849 | info = free_map_info(info); | ||
878 | } | 850 | } |
879 | 851 | ||
880 | list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { | 852 | return err; |
881 | list_del(&vi->probe_list); | ||
882 | kfree(vi); | ||
883 | } | ||
884 | |||
885 | return ret; | ||
886 | } | 853 | } |
887 | 854 | ||
888 | static int __uprobe_register(struct uprobe *uprobe) | 855 | static int __uprobe_register(struct uprobe *uprobe) |
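Editor's note: build_map_info() snapshots every (mm, vaddr) pair while holding i_mmap_mutex, where it must not sleep, so it allocates nodes opportunistically with GFP_NOWAIT; if it runs short it counts the misses, drops the lock, allocates the missing nodes with GFP_KERNEL, and walks again. A simplified userspace sketch of that "preallocate, retry if short" loop, with a fixed array standing in for the prio-tree walk and all names invented:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct map_info { struct map_info *next; int mm_id; };

    static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
    static const int vmas[] = { 10, 11, 12 };   /* stand-in for the vma walk */
    #define NVMA (int)(sizeof(vmas) / sizeof(vmas[0]))

    static struct map_info *build_map_info(void)
    {
        struct map_info *prev = NULL, *curr = NULL, *info;
        int more = 0;

    again:
        pthread_mutex_lock(&tree_lock);
        for (int i = 0; i < NVMA; i++) {
            if (!prev) {         /* no preallocated node left */
                more++;          /* remember how many we were short */
                continue;
            }
            info = prev;         /* consume one preallocated node */
            prev = prev->next;
            info->next = curr;
            info->mm_id = vmas[i];
            curr = info;
        }
        pthread_mutex_unlock(&tree_lock);

        if (!more) {             /* got everything in one pass */
            while (prev) { info = prev->next; free(prev); prev = info; }
            return curr;
        }

        /* Discard the partial snapshot, allocate the missing nodes outside
         * the lock (where sleeping is fine), and walk again. */
        while (curr) { info = curr->next; free(curr); curr = info; }
        do {
            info = malloc(sizeof(*info));
            if (!info)
                exit(1);         /* sketch: no ENOMEM recovery */
            info->next = prev;
            prev = info;
        } while (--more);
        goto again;
    }

    int main(void)
    {
        for (struct map_info *m = build_map_info(); m; ) {
            struct map_info *next = m->next;
            printf("snapshot mm %d\n", m->mm_id);
            free(m);
            m = next;
        }
        return 0;
    }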
@@ -977,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume | |||
977 | put_uprobe(uprobe); | 944 | put_uprobe(uprobe); |
978 | } | 945 | } |
979 | 946 | ||
980 | /* | 947 | static struct rb_node * |
981 | * Of all the nodes that correspond to the given inode, return the node | 948 | find_node_in_range(struct inode *inode, loff_t min, loff_t max) |
982 | * with the least offset. | ||
983 | */ | ||
984 | static struct rb_node *find_least_offset_node(struct inode *inode) | ||
985 | { | 949 | { |
986 | struct uprobe u = { .inode = inode, .offset = 0}; | ||
987 | struct rb_node *n = uprobes_tree.rb_node; | 950 | struct rb_node *n = uprobes_tree.rb_node; |
988 | struct rb_node *close_node = NULL; | ||
989 | struct uprobe *uprobe; | ||
990 | int match; | ||
991 | 951 | ||
992 | while (n) { | 952 | while (n) { |
993 | uprobe = rb_entry(n, struct uprobe, rb_node); | 953 | struct uprobe *u = rb_entry(n, struct uprobe, rb_node); |
994 | match = match_uprobe(&u, uprobe); | ||
995 | 954 | ||
996 | if (uprobe->inode == inode) | 955 | if (inode < u->inode) { |
997 | close_node = n; | ||
998 | |||
999 | if (!match) | ||
1000 | return close_node; | ||
1001 | |||
1002 | if (match < 0) | ||
1003 | n = n->rb_left; | 956 | n = n->rb_left; |
1004 | else | 957 | } else if (inode > u->inode) { |
1005 | n = n->rb_right; | 958 | n = n->rb_right; |
959 | } else { | ||
960 | if (max < u->offset) | ||
961 | n = n->rb_left; | ||
962 | else if (min > u->offset) | ||
963 | n = n->rb_right; | ||
964 | else | ||
965 | break; | ||
966 | } | ||
1006 | } | 967 | } |
1007 | 968 | ||
1008 | return close_node; | 969 | return n; |
1009 | } | 970 | } |
1010 | 971 | ||
1011 | /* | 972 | /* |
1012 | * For a given inode, build a list of probes that need to be inserted. | 973 | * For a given range in vma, build a list of probes that need to be inserted. |
1013 | */ | 974 | */ |
1014 | static void build_probe_list(struct inode *inode, struct list_head *head) | 975 | static void build_probe_list(struct inode *inode, |
976 | struct vm_area_struct *vma, | ||
977 | unsigned long start, unsigned long end, | ||
978 | struct list_head *head) | ||
1015 | { | 979 | { |
1016 | struct uprobe *uprobe; | 980 | loff_t min, max; |
1017 | unsigned long flags; | 981 | unsigned long flags; |
1018 | struct rb_node *n; | 982 | struct rb_node *n, *t; |
1019 | 983 | struct uprobe *u; | |
1020 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
1021 | |||
1022 | n = find_least_offset_node(inode); | ||
1023 | 984 | ||
1024 | for (; n; n = rb_next(n)) { | 985 | INIT_LIST_HEAD(head); |
1025 | uprobe = rb_entry(n, struct uprobe, rb_node); | 986 | min = vaddr_to_offset(vma, start); |
1026 | if (uprobe->inode != inode) | 987 | max = min + (end - start) - 1; |
1027 | break; | ||
1028 | 988 | ||
1029 | list_add(&uprobe->pending_list, head); | 989 | spin_lock_irqsave(&uprobes_treelock, flags); |
1030 | atomic_inc(&uprobe->ref); | 990 | n = find_node_in_range(inode, min, max); |
991 | if (n) { | ||
992 | for (t = n; t; t = rb_prev(t)) { | ||
993 | u = rb_entry(t, struct uprobe, rb_node); | ||
994 | if (u->inode != inode || u->offset < min) | ||
995 | break; | ||
996 | list_add(&u->pending_list, head); | ||
997 | atomic_inc(&u->ref); | ||
998 | } | ||
999 | for (t = n; (t = rb_next(t)); ) { | ||
1000 | u = rb_entry(t, struct uprobe, rb_node); | ||
1001 | if (u->inode != inode || u->offset > max) | ||
1002 | break; | ||
1003 | list_add(&u->pending_list, head); | ||
1004 | atomic_inc(&u->ref); | ||
1005 | } | ||
1031 | } | 1006 | } |
1032 | |||
1033 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 1007 | spin_unlock_irqrestore(&uprobes_treelock, flags); |
1034 | } | 1008 | } |
1035 | 1009 | ||
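Editor's note: build_probe_list() no longer walks every probe on the inode. It converts the vma's [start, end) into a file-offset range [min, max], asks find_node_in_range() for any rb-tree node inside that range, and then expands with rb_prev()/rb_next() to collect every probe whose offset falls in the range. The same "find one, expand both ways" walk is easy to see on a flat sorted array, with binary search standing in for the rb-tree descent (offsets below are made up):

    #include <stdio.h>

    /* Probe offsets for one inode, kept sorted (the rb-tree gives us this). */
    static const long offs[] = { 0x100, 0x180, 0x200, 0x240, 0x300, 0x900 };
    #define N (int)(sizeof(offs) / sizeof(offs[0]))

    /* Find the index of any element in [min, max], or -1 if none exists. */
    static int find_in_range(long min, long max)
    {
        int lo = 0, hi = N - 1;

        while (lo <= hi) {
            int mid = (lo + hi) / 2;

            if (offs[mid] > max)
                hi = mid - 1;
            else if (offs[mid] < min)
                lo = mid + 1;
            else
                return mid;
        }
        return -1;
    }

    int main(void)
    {
        long min = 0x180, max = 0x2ff;   /* file range covered by the vma */
        int n = find_in_range(min, max);

        if (n < 0)
            return 0;
        /* Expand left then right from the hit, mirroring the two
         * rb_prev()/rb_next() loops in build_probe_list(). */
        for (int t = n; t >= 0 && offs[t] >= min; t--)
            printf("collect 0x%lx\n", offs[t]);
        for (int t = n + 1; t < N && offs[t] <= max; t++)
            printf("collect 0x%lx\n", offs[t]);
        return 0;
    }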
@@ -1059,28 +1033,21 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
1059 | if (!inode) | 1033 | if (!inode) |
1060 | return 0; | 1034 | return 0; |
1061 | 1035 | ||
1062 | INIT_LIST_HEAD(&tmp_list); | ||
1063 | mutex_lock(uprobes_mmap_hash(inode)); | 1036 | mutex_lock(uprobes_mmap_hash(inode)); |
1064 | build_probe_list(inode, &tmp_list); | 1037 | build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); |
1065 | 1038 | ||
1066 | ret = 0; | 1039 | ret = 0; |
1067 | count = 0; | 1040 | count = 0; |
1068 | 1041 | ||
1069 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | 1042 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { |
1070 | loff_t vaddr; | ||
1071 | |||
1072 | list_del(&uprobe->pending_list); | ||
1073 | if (!ret) { | 1043 | if (!ret) { |
1074 | vaddr = vma_address(vma, uprobe->offset); | 1044 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); |
1075 | |||
1076 | if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { | ||
1077 | put_uprobe(uprobe); | ||
1078 | continue; | ||
1079 | } | ||
1080 | 1045 | ||
1081 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | 1046 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); |
1082 | 1047 | /* | |
1083 | /* Ignore double add: */ | 1048 | * We can race against uprobe_register(), see the |
1049 | * comment near uprobe_hash(). | ||
1050 | */ | ||
1084 | if (ret == -EEXIST) { | 1051 | if (ret == -EEXIST) { |
1085 | ret = 0; | 1052 | ret = 0; |
1086 | 1053 | ||
@@ -1121,6 +1088,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
1121 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | 1088 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) |
1122 | return; | 1089 | return; |
1123 | 1090 | ||
1091 | if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ | ||
1092 | return; | ||
1093 | |||
1124 | if (!atomic_read(&vma->vm_mm->uprobes_state.count)) | 1094 | if (!atomic_read(&vma->vm_mm->uprobes_state.count)) |
1125 | return; | 1095 | return; |
1126 | 1096 | ||
@@ -1128,24 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
1128 | if (!inode) | 1098 | if (!inode) |
1129 | return; | 1099 | return; |
1130 | 1100 | ||
1131 | INIT_LIST_HEAD(&tmp_list); | ||
1132 | mutex_lock(uprobes_mmap_hash(inode)); | 1101 | mutex_lock(uprobes_mmap_hash(inode)); |
1133 | build_probe_list(inode, &tmp_list); | 1102 | build_probe_list(inode, vma, start, end, &tmp_list); |
1134 | 1103 | ||
1135 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | 1104 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { |
1136 | loff_t vaddr; | 1105 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); |
1137 | 1106 | /* | |
1138 | list_del(&uprobe->pending_list); | 1107 | * An unregister could have removed the probe before |
1139 | vaddr = vma_address(vma, uprobe->offset); | 1108 | * unmap. So check before we decrement the count. |
1140 | 1109 | */ | |
1141 | if (vaddr >= start && vaddr < end) { | 1110 | if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) |
1142 | /* | 1111 | atomic_dec(&vma->vm_mm->uprobes_state.count); |
1143 | * An unregister could have removed the probe before | ||
1144 | * unmap. So check before we decrement the count. | ||
1145 | */ | ||
1146 | if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) | ||
1147 | atomic_dec(&vma->vm_mm->uprobes_state.count); | ||
1148 | } | ||
1149 | put_uprobe(uprobe); | 1112 | put_uprobe(uprobe); |
1150 | } | 1113 | } |
1151 | mutex_unlock(uprobes_mmap_hash(inode)); | 1114 | mutex_unlock(uprobes_mmap_hash(inode)); |
@@ -1378,9 +1341,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
1378 | { | 1341 | { |
1379 | struct uprobe_task *utask = t->utask; | 1342 | struct uprobe_task *utask = t->utask; |
1380 | 1343 | ||
1381 | if (t->uprobe_srcu_id != -1) | ||
1382 | srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); | ||
1383 | |||
1384 | if (!utask) | 1344 | if (!utask) |
1385 | return; | 1345 | return; |
1386 | 1346 | ||
@@ -1398,7 +1358,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
1398 | void uprobe_copy_process(struct task_struct *t) | 1358 | void uprobe_copy_process(struct task_struct *t) |
1399 | { | 1359 | { |
1400 | t->utask = NULL; | 1360 | t->utask = NULL; |
1401 | t->uprobe_srcu_id = -1; | ||
1402 | } | 1361 | } |
1403 | 1362 | ||
1404 | /* | 1363 | /* |
@@ -1417,7 +1376,6 @@ static struct uprobe_task *add_utask(void) | |||
1417 | if (unlikely(!utask)) | 1376 | if (unlikely(!utask)) |
1418 | return NULL; | 1377 | return NULL; |
1419 | 1378 | ||
1420 | utask->active_uprobe = NULL; | ||
1421 | current->utask = utask; | 1379 | current->utask = utask; |
1422 | return utask; | 1380 | return utask; |
1423 | } | 1381 | } |
@@ -1479,41 +1437,61 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | |||
1479 | return false; | 1437 | return false; |
1480 | } | 1438 | } |
1481 | 1439 | ||
1440 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | ||
1441 | { | ||
1442 | struct mm_struct *mm = current->mm; | ||
1443 | struct uprobe *uprobe = NULL; | ||
1444 | struct vm_area_struct *vma; | ||
1445 | |||
1446 | down_read(&mm->mmap_sem); | ||
1447 | vma = find_vma(mm, bp_vaddr); | ||
1448 | if (vma && vma->vm_start <= bp_vaddr) { | ||
1449 | if (valid_vma(vma, false)) { | ||
1450 | struct inode *inode = vma->vm_file->f_mapping->host; | ||
1451 | loff_t offset = vaddr_to_offset(vma, bp_vaddr); | ||
1452 | |||
1453 | uprobe = find_uprobe(inode, offset); | ||
1454 | } | ||
1455 | |||
1456 | if (!uprobe) | ||
1457 | *is_swbp = is_swbp_at_addr(mm, bp_vaddr); | ||
1458 | } else { | ||
1459 | *is_swbp = -EFAULT; | ||
1460 | } | ||
1461 | up_read(&mm->mmap_sem); | ||
1462 | |||
1463 | return uprobe; | ||
1464 | } | ||
1465 | |||
1482 | /* | 1466 | /* |
1483 | * Run handler and ask thread to singlestep. | 1467 | * Run handler and ask thread to singlestep. |
1484 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1468 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
1485 | */ | 1469 | */ |
1486 | static void handle_swbp(struct pt_regs *regs) | 1470 | static void handle_swbp(struct pt_regs *regs) |
1487 | { | 1471 | { |
1488 | struct vm_area_struct *vma; | ||
1489 | struct uprobe_task *utask; | 1472 | struct uprobe_task *utask; |
1490 | struct uprobe *uprobe; | 1473 | struct uprobe *uprobe; |
1491 | struct mm_struct *mm; | ||
1492 | unsigned long bp_vaddr; | 1474 | unsigned long bp_vaddr; |
1475 | int uninitialized_var(is_swbp); | ||
1493 | 1476 | ||
1494 | uprobe = NULL; | ||
1495 | bp_vaddr = uprobe_get_swbp_addr(regs); | 1477 | bp_vaddr = uprobe_get_swbp_addr(regs); |
1496 | mm = current->mm; | 1478 | uprobe = find_active_uprobe(bp_vaddr, &is_swbp); |
1497 | down_read(&mm->mmap_sem); | ||
1498 | vma = find_vma(mm, bp_vaddr); | ||
1499 | |||
1500 | if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { | ||
1501 | struct inode *inode; | ||
1502 | loff_t offset; | ||
1503 | |||
1504 | inode = vma->vm_file->f_mapping->host; | ||
1505 | offset = bp_vaddr - vma->vm_start; | ||
1506 | offset += (vma->vm_pgoff << PAGE_SHIFT); | ||
1507 | uprobe = find_uprobe(inode, offset); | ||
1508 | } | ||
1509 | |||
1510 | srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); | ||
1511 | current->uprobe_srcu_id = -1; | ||
1512 | up_read(&mm->mmap_sem); | ||
1513 | 1479 | ||
1514 | if (!uprobe) { | 1480 | if (!uprobe) { |
1515 | /* No matching uprobe; signal SIGTRAP. */ | 1481 | if (is_swbp > 0) { |
1516 | send_sig(SIGTRAP, current, 0); | 1482 | /* No matching uprobe; signal SIGTRAP. */ |
1483 | send_sig(SIGTRAP, current, 0); | ||
1484 | } else { | ||
1485 | /* | ||
1486 | * Either we raced with uprobe_unregister() or we can't | ||
1487 | * access this memory. The latter is only possible if | ||
1488 | * another thread plays with our ->mm. In both cases | ||
1489 | * we can simply restart. If this vma was unmapped we | ||
1490 | * can pretend this insn was not executed yet and get | ||
1491 | * the (correct) SIGSEGV after restart. | ||
1492 | */ | ||
1493 | instruction_pointer_set(regs, bp_vaddr); | ||
1494 | } | ||
1517 | return; | 1495 | return; |
1518 | } | 1496 | } |
1519 | 1497 | ||
@@ -1620,7 +1598,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) | |||
1620 | utask->state = UTASK_BP_HIT; | 1598 | utask->state = UTASK_BP_HIT; |
1621 | 1599 | ||
1622 | set_thread_flag(TIF_UPROBE); | 1600 | set_thread_flag(TIF_UPROBE); |
1623 | current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); | ||
1624 | 1601 | ||
1625 | return 1; | 1602 | return 1; |
1626 | } | 1603 | } |
@@ -1655,7 +1632,6 @@ static int __init init_uprobes(void) | |||
1655 | mutex_init(&uprobes_mutex[i]); | 1632 | mutex_init(&uprobes_mutex[i]); |
1656 | mutex_init(&uprobes_mmap_mutex[i]); | 1633 | mutex_init(&uprobes_mmap_mutex[i]); |
1657 | } | 1634 | } |
1658 | init_srcu_struct(&uprobes_srcu); | ||
1659 | 1635 | ||
1660 | return register_die_notifier(&uprobe_exception_nb); | 1636 | return register_die_notifier(&uprobe_exception_nb); |
1661 | } | 1637 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..f65345f9e5bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | /* | ||
76 | * If we are the last child process in a pid namespace to be | ||
77 | * reaped, notify the reaper sleeping zap_pid_ns_processes(). | ||
78 | */ | ||
79 | if (IS_ENABLED(CONFIG_PID_NS)) { | ||
80 | struct task_struct *parent = p->real_parent; | ||
81 | |||
82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | ||
83 | list_empty(&parent->children) && | ||
84 | (parent->flags & PF_EXITING)) | ||
85 | wake_up_process(parent); | ||
86 | } | ||
75 | } | 87 | } |
76 | list_del_rcu(&p->thread_group); | 88 | list_del_rcu(&p->thread_group); |
77 | } | 89 | } |
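Editor's note: the block added to __unhash_process() wakes the pid-namespace init once its last child has been reaped while it is exiting, so the reaper sleeping in zap_pid_ns_processes() can continue. A userspace condition-variable sketch of the same "last child wakes the waiting reaper" handshake, with threads standing in for tasks and invented names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t all_reaped = PTHREAD_COND_INITIALIZER;
    static int nr_children = 3;

    /* Each "child" is unhashed; the one that empties the list wakes the reaper. */
    static void *child_exit(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        if (--nr_children == 0)
            pthread_cond_signal(&all_reaped);   /* wake_up_process() analogue */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[3];

        for (int i = 0; i < 3; i++)
            pthread_create(&t[i], NULL, child_exit, NULL);

        /* The "reaper" sleeps until every child has been reaped. */
        pthread_mutex_lock(&lock);
        while (nr_children > 0)
            pthread_cond_wait(&all_reaped, &lock);
        pthread_mutex_unlock(&lock);
        printf("namespace empty, reaper continues\n");

        for (int i = 0; i < 3; i++)
            pthread_join(t[i], NULL);
        return 0;
    }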
@@ -471,7 +483,7 @@ static void close_files(struct files_struct * files) | |||
471 | rcu_read_unlock(); | 483 | rcu_read_unlock(); |
472 | for (;;) { | 484 | for (;;) { |
473 | unsigned long set; | 485 | unsigned long set; |
474 | i = j * __NFDBITS; | 486 | i = j * BITS_PER_LONG; |
475 | if (i >= fdt->max_fds) | 487 | if (i >= fdt->max_fds) |
476 | break; | 488 | break; |
477 | set = fdt->open_fds[j++]; | 489 | set = fdt->open_fds[j++]; |
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk) | |||
643 | mm_release(tsk, mm); | 655 | mm_release(tsk, mm); |
644 | if (!mm) | 656 | if (!mm) |
645 | return; | 657 | return; |
658 | sync_mm_rss(mm); | ||
646 | /* | 659 | /* |
647 | * Serialize with any possible pending coredump. | 660 | * Serialize with any possible pending coredump. |
648 | * We must hold mmap_sem around checking core_state | 661 | * We must hold mmap_sem around checking core_state |
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
719 | 732 | ||
720 | zap_pid_ns_processes(pid_ns); | 733 | zap_pid_ns_processes(pid_ns); |
721 | write_lock_irq(&tasklist_lock); | 734 | write_lock_irq(&tasklist_lock); |
722 | /* | ||
723 | * We can not clear ->child_reaper or leave it alone. | ||
724 | * There may by stealth EXIT_DEAD tasks on ->children, | ||
725 | * forget_original_parent() must move them somewhere. | ||
726 | */ | ||
727 | pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
728 | } else if (father->signal->has_child_subreaper) { | 735 | } else if (father->signal->has_child_subreaper) { |
729 | struct task_struct *reaper; | 736 | struct task_struct *reaper; |
730 | 737 | ||
@@ -946,14 +953,11 @@ void do_exit(long code) | |||
946 | exit_signals(tsk); /* sets PF_EXITING */ | 953 | exit_signals(tsk); /* sets PF_EXITING */ |
947 | /* | 954 | /* |
948 | * tsk->flags are checked in the futex code to protect against | 955 | * tsk->flags are checked in the futex code to protect against |
949 | * an exiting task cleaning up the robust pi futexes, and in | 956 | * an exiting task cleaning up the robust pi futexes. |
950 | * task_work_add() to avoid the race with exit_task_work(). | ||
951 | */ | 957 | */ |
952 | smp_mb(); | 958 | smp_mb(); |
953 | raw_spin_unlock_wait(&tsk->pi_lock); | 959 | raw_spin_unlock_wait(&tsk->pi_lock); |
954 | 960 | ||
955 | exit_task_work(tsk); | ||
956 | |||
957 | if (unlikely(in_atomic())) | 961 | if (unlikely(in_atomic())) |
958 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 962 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
959 | current->comm, task_pid_nr(current), | 963 | current->comm, task_pid_nr(current), |
@@ -988,6 +992,7 @@ void do_exit(long code) | |||
988 | exit_shm(tsk); | 992 | exit_shm(tsk); |
989 | exit_files(tsk); | 993 | exit_files(tsk); |
990 | exit_fs(tsk); | 994 | exit_fs(tsk); |
995 | exit_task_work(tsk); | ||
991 | check_stack_usage(); | 996 | check_stack_usage(); |
992 | exit_thread(); | 997 | exit_thread(); |
993 | 998 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..3bd2280d79f6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -114,6 +114,10 @@ int nr_processes(void) | |||
114 | return total; | 114 | return total; |
115 | } | 115 | } |
116 | 116 | ||
117 | void __weak arch_release_task_struct(struct task_struct *tsk) | ||
118 | { | ||
119 | } | ||
120 | |||
117 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR | 121 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
118 | static struct kmem_cache *task_struct_cachep; | 122 | static struct kmem_cache *task_struct_cachep; |
119 | 123 | ||
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node) | |||
122 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); | 126 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); |
123 | } | 127 | } |
124 | 128 | ||
125 | void __weak arch_release_task_struct(struct task_struct *tsk) { } | ||
126 | |||
127 | static inline void free_task_struct(struct task_struct *tsk) | 129 | static inline void free_task_struct(struct task_struct *tsk) |
128 | { | 130 | { |
129 | arch_release_task_struct(tsk); | ||
130 | kmem_cache_free(task_struct_cachep, tsk); | 131 | kmem_cache_free(task_struct_cachep, tsk); |
131 | } | 132 | } |
132 | #endif | 133 | #endif |
133 | 134 | ||
135 | void __weak arch_release_thread_info(struct thread_info *ti) | ||
136 | { | ||
137 | } | ||
138 | |||
134 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR | 139 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR |
135 | void __weak arch_release_thread_info(struct thread_info *ti) { } | ||
136 | 140 | ||
137 | /* | 141 | /* |
138 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | 142 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a |
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | |||
150 | 154 | ||
151 | static inline void free_thread_info(struct thread_info *ti) | 155 | static inline void free_thread_info(struct thread_info *ti) |
152 | { | 156 | { |
153 | arch_release_thread_info(ti); | ||
154 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 157 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
155 | } | 158 | } |
156 | # else | 159 | # else |
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | |||
164 | 167 | ||
165 | static void free_thread_info(struct thread_info *ti) | 168 | static void free_thread_info(struct thread_info *ti) |
166 | { | 169 | { |
167 | arch_release_thread_info(ti); | ||
168 | kmem_cache_free(thread_info_cache, ti); | 170 | kmem_cache_free(thread_info_cache, ti); |
169 | } | 171 | } |
170 | 172 | ||
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account) | |||
205 | void free_task(struct task_struct *tsk) | 207 | void free_task(struct task_struct *tsk) |
206 | { | 208 | { |
207 | account_kernel_stack(tsk->stack, -1); | 209 | account_kernel_stack(tsk->stack, -1); |
210 | arch_release_thread_info(tsk->stack); | ||
208 | free_thread_info(tsk->stack); | 211 | free_thread_info(tsk->stack); |
209 | rt_mutex_debug_task_free(tsk); | 212 | rt_mutex_debug_task_free(tsk); |
210 | ftrace_graph_exit_task(tsk); | 213 | ftrace_graph_exit_task(tsk); |
211 | put_seccomp_filter(tsk); | 214 | put_seccomp_filter(tsk); |
215 | arch_release_task_struct(tsk); | ||
212 | free_task_struct(tsk); | 216 | free_task_struct(tsk); |
213 | } | 217 | } |
214 | EXPORT_SYMBOL(free_task); | 218 | EXPORT_SYMBOL(free_task); |
@@ -298,14 +302,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
298 | return NULL; | 302 | return NULL; |
299 | 303 | ||
300 | ti = alloc_thread_info_node(tsk, node); | 304 | ti = alloc_thread_info_node(tsk, node); |
301 | if (!ti) { | 305 | if (!ti) |
302 | free_task_struct(tsk); | 306 | goto free_tsk; |
303 | return NULL; | ||
304 | } | ||
305 | 307 | ||
306 | err = arch_dup_task_struct(tsk, orig); | 308 | err = arch_dup_task_struct(tsk, orig); |
307 | if (err) | 309 | if (err) |
308 | goto out; | 310 | goto free_ti; |
309 | 311 | ||
310 | tsk->stack = ti; | 312 | tsk->stack = ti; |
311 | 313 | ||
@@ -333,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
333 | 335 | ||
334 | return tsk; | 336 | return tsk; |
335 | 337 | ||
336 | out: | 338 | free_ti: |
337 | free_thread_info(ti); | 339 | free_thread_info(ti); |
340 | free_tsk: | ||
338 | free_task_struct(tsk); | 341 | free_task_struct(tsk); |
339 | return NULL; | 342 | return NULL; |
340 | } | 343 | } |
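The dup_task_struct() hunk above swaps the open-coded failure branch for the kernel's usual goto-unwind idiom (free_ti/free_tsk labels releasing resources in reverse order). A minimal standalone sketch of that idiom, with hypothetical foo/bar objects that are not part of this patch:

#include <linux/slab.h>

struct foo { int x; };
struct bar { int y; };

/* Illustrative only: allocate two objects, unwind in reverse on failure,
 * the same shape dup_task_struct() now uses. */
static int setup_pair(struct foo **pa, struct bar **pb)
{
        struct foo *a;
        struct bar *b;

        a = kzalloc(sizeof(*a), GFP_KERNEL);
        if (!a)
                return -ENOMEM;

        b = kzalloc(sizeof(*b), GFP_KERNEL);
        if (!b)
                goto free_a;            /* undo only what already succeeded */

        *pa = a;
        *pb = b;
        return 0;

free_a:
        kfree(a);
        return -ENOMEM;
}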
@@ -378,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
378 | struct file *file; | 381 | struct file *file; |
379 | 382 | ||
380 | if (mpnt->vm_flags & VM_DONTCOPY) { | 383 | if (mpnt->vm_flags & VM_DONTCOPY) { |
381 | long pages = vma_pages(mpnt); | ||
382 | mm->total_vm -= pages; | ||
383 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | 384 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, |
384 | -pages); | 385 | -vma_pages(mpnt)); |
385 | continue; | 386 | continue; |
386 | } | 387 | } |
387 | charge = 0; | 388 | charge = 0; |
388 | if (mpnt->vm_flags & VM_ACCOUNT) { | 389 | if (mpnt->vm_flags & VM_ACCOUNT) { |
389 | unsigned long len; | 390 | unsigned long len = vma_pages(mpnt); |
390 | len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | 391 | |
391 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ | 392 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ |
392 | goto fail_nomem; | 393 | goto fail_nomem; |
393 | charge = len; | 394 | charge = len; |
@@ -1305,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1305 | #ifdef CONFIG_DEBUG_MUTEXES | 1306 | #ifdef CONFIG_DEBUG_MUTEXES |
1306 | p->blocked_on = NULL; /* not blocked yet */ | 1307 | p->blocked_on = NULL; /* not blocked yet */ |
1307 | #endif | 1308 | #endif |
1308 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1309 | #ifdef CONFIG_MEMCG |
1309 | p->memcg_batch.do_batch = 0; | 1310 | p->memcg_batch.do_batch = 0; |
1310 | p->memcg_batch.memcg = NULL; | 1311 | p->memcg_batch.memcg = NULL; |
1311 | #endif | 1312 | #endif |
@@ -1415,7 +1416,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1415 | */ | 1416 | */ |
1416 | p->group_leader = p; | 1417 | p->group_leader = p; |
1417 | INIT_LIST_HEAD(&p->thread_group); | 1418 | INIT_LIST_HEAD(&p->thread_group); |
1418 | INIT_HLIST_HEAD(&p->task_works); | 1419 | p->task_works = NULL; |
1419 | 1420 | ||
1420 | /* Now that the task is set up, run cgroup callbacks if | 1421 | /* Now that the task is set up, run cgroup callbacks if |
1421 | * necessary. We need to run them before the task is visible | 1422 | * necessary. We need to run them before the task is visible |
diff --git a/kernel/futex.c b/kernel/futex.c index e2b0fb9a0b3b..3717e7b306e0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2231 | * @uaddr2: the pi futex we will take prior to returning to user-space | 2231 | * @uaddr2: the pi futex we will take prior to returning to user-space |
2232 | * | 2232 | * |
2233 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | 2233 | * The caller will wait on uaddr and will be requeued by futex_requeue() to |
2234 | * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and | 2234 | * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake |
2235 | * complete the acquisition of the rt_mutex prior to returning to userspace. | 2235 | * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to |
2236 | * This ensures the rt_mutex maintains an owner when it has waiters; without | 2236 | * userspace. This ensures the rt_mutex maintains an owner when it has waiters; |
2237 | * one, the pi logic wouldn't know which task to boost/deboost, if there was a | 2237 | * without one, the pi logic would not know which task to boost/deboost, if |
2238 | * need to. | 2238 | * there was a need to. |
2239 | * | 2239 | * |
2240 | * We call schedule in futex_wait_queue_me() when we enqueue and return there | 2240 | * We call schedule in futex_wait_queue_me() when we enqueue and return there |
2241 | * via the following: | 2241 | * via the following: |
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2272 | struct futex_q q = futex_q_init; | 2272 | struct futex_q q = futex_q_init; |
2273 | int res, ret; | 2273 | int res, ret; |
2274 | 2274 | ||
2275 | if (uaddr == uaddr2) | ||
2276 | return -EINVAL; | ||
2277 | |||
2275 | if (!bitset) | 2278 | if (!bitset) |
2276 | return -EINVAL; | 2279 | return -EINVAL; |
2277 | 2280 | ||
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2343 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor | 2346 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor |
2344 | * the pi_state. | 2347 | * the pi_state. |
2345 | */ | 2348 | */ |
2346 | WARN_ON(!&q.pi_state); | 2349 | WARN_ON(!q.pi_state); |
2347 | pi_mutex = &q.pi_state->pi_mutex; | 2350 | pi_mutex = &q.pi_state->pi_mutex; |
2348 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | 2351 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); |
2349 | debug_rt_mutex_free_waiter(&rt_waiter); | 2352 | debug_rt_mutex_free_waiter(&rt_waiter); |
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2370 | * fault, unlock the rt_mutex and return the fault to userspace. | 2373 | * fault, unlock the rt_mutex and return the fault to userspace. |
2371 | */ | 2374 | */ |
2372 | if (ret == -EFAULT) { | 2375 | if (ret == -EFAULT) { |
2373 | if (rt_mutex_owner(pi_mutex) == current) | 2376 | if (pi_mutex && rt_mutex_owner(pi_mutex) == current) |
2374 | rt_mutex_unlock(pi_mutex); | 2377 | rt_mutex_unlock(pi_mutex); |
2375 | } else if (ret == -EINTR) { | 2378 | } else if (ret == -EINTR) { |
2376 | /* | 2379 | /* |
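For userspace callers, the new uaddr == uaddr2 check above means a FUTEX_WAIT_REQUEUE_PI request naming the same futex word twice now fails up front with EINVAL instead of reaching the requeue logic. A rough userspace illustration (error handling trimmed, values invented, assumes a kernel with this patch):

#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex_wait_requeue_pi(int *uaddr, int val, int *uaddr2)
{
        /* args: uaddr, op, val, timeout, uaddr2, val3 */
        return syscall(SYS_futex, uaddr, FUTEX_WAIT_REQUEUE_PI, val,
                       NULL, uaddr2, 0);
}

int main(void)
{
        int f = 0;

        /* Same word for both futexes: rejected early after this patch. */
        if (futex_wait_requeue_pi(&f, 0, &f) < 0 && errno == EINVAL)
                printf("kernel rejected uaddr == uaddr2\n");
        return 0;
}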
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf51682b..6db7a5ed52b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
657 | return 0; | 657 | return 0; |
658 | } | 658 | } |
659 | 659 | ||
660 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | ||
661 | { | ||
662 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | ||
663 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | ||
664 | |||
665 | return ktime_get_update_offsets(offs_real, offs_boot); | ||
666 | } | ||
667 | |||
660 | /* | 668 | /* |
661 | * Retrigger next event is called after clock was set | 669 | * Retrigger next event is called after clock was set |
662 | * | 670 | * |
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
665 | static void retrigger_next_event(void *arg) | 673 | static void retrigger_next_event(void *arg) |
666 | { | 674 | { |
667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 675 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); |
668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
669 | 676 | ||
670 | if (!hrtimer_hres_active()) | 677 | if (!hrtimer_hres_active()) |
671 | return; | 678 | return; |
672 | 679 | ||
673 | /* Optimized out for !HIGH_RES */ | ||
674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
676 | |||
677 | /* Adjust CLOCK_REALTIME offset */ | ||
678 | raw_spin_lock(&base->lock); | 680 | raw_spin_lock(&base->lock); |
679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | 681 | hrtimer_update_base(base); |
680 | timespec_to_ktime(realtime_offset); | ||
681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
682 | timespec_to_ktime(sleep); | ||
683 | |||
684 | hrtimer_force_reprogram(base, 0); | 682 | hrtimer_force_reprogram(base, 0); |
685 | raw_spin_unlock(&base->lock); | 683 | raw_spin_unlock(&base->lock); |
686 | } | 684 | } |
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) | |||
710 | base->clock_base[i].resolution = KTIME_HIGH_RES; | 708 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
711 | 709 | ||
712 | tick_setup_sched_timer(); | 710 | tick_setup_sched_timer(); |
713 | |||
714 | /* "Retrigger" the interrupt to get things going */ | 711 | /* "Retrigger" the interrupt to get things going */ |
715 | retrigger_next_event(NULL); | 712 | retrigger_next_event(NULL); |
716 | local_irq_restore(flags); | 713 | local_irq_restore(flags); |
717 | return 1; | 714 | return 1; |
718 | } | 715 | } |
719 | 716 | ||
717 | /* | ||
718 | * Called from timekeeping code to reprogram the hrtimer interrupt | ||
719 | * device. If called from the timer interrupt context we defer it to | ||
720 | * softirq context. | ||
721 | */ | ||
722 | void clock_was_set_delayed(void) | ||
723 | { | ||
724 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
725 | |||
726 | cpu_base->clock_was_set = 1; | ||
727 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
728 | } | ||
729 | |||
720 | #else | 730 | #else |
721 | 731 | ||
722 | static inline int hrtimer_hres_active(void) { return 0; } | 732 | static inline int hrtimer_hres_active(void) { return 0; } |
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1250 | cpu_base->nr_events++; | 1260 | cpu_base->nr_events++; |
1251 | dev->next_event.tv64 = KTIME_MAX; | 1261 | dev->next_event.tv64 = KTIME_MAX; |
1252 | 1262 | ||
1253 | entry_time = now = ktime_get(); | 1263 | raw_spin_lock(&cpu_base->lock); |
1264 | entry_time = now = hrtimer_update_base(cpu_base); | ||
1254 | retry: | 1265 | retry: |
1255 | expires_next.tv64 = KTIME_MAX; | 1266 | expires_next.tv64 = KTIME_MAX; |
1256 | |||
1257 | raw_spin_lock(&cpu_base->lock); | ||
1258 | /* | 1267 | /* |
1259 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1268 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
1260 | * held to prevent that a timer is enqueued in our queue via | 1269 | * held to prevent that a timer is enqueued in our queue via |
@@ -1330,8 +1339,12 @@ retry: | |||
1330 | * We need to prevent that we loop forever in the hrtimer | 1339 | * We need to prevent that we loop forever in the hrtimer |
1331 | * interrupt routine. We give it 3 attempts to avoid | 1340 | * interrupt routine. We give it 3 attempts to avoid |
1332 | * overreacting on some spurious event. | 1341 | * overreacting on some spurious event. |
1342 | * | ||
1343 | * Acquire base lock for updating the offsets and retrieving | ||
1344 | * the current time. | ||
1333 | */ | 1345 | */ |
1334 | now = ktime_get(); | 1346 | raw_spin_lock(&cpu_base->lock); |
1347 | now = hrtimer_update_base(cpu_base); | ||
1335 | cpu_base->nr_retries++; | 1348 | cpu_base->nr_retries++; |
1336 | if (++retries < 3) | 1349 | if (++retries < 3) |
1337 | goto retry; | 1350 | goto retry; |
@@ -1343,6 +1356,7 @@ retry: | |||
1343 | */ | 1356 | */ |
1344 | cpu_base->nr_hangs++; | 1357 | cpu_base->nr_hangs++; |
1345 | cpu_base->hang_detected = 1; | 1358 | cpu_base->hang_detected = 1; |
1359 | raw_spin_unlock(&cpu_base->lock); | ||
1346 | delta = ktime_sub(now, entry_time); | 1360 | delta = ktime_sub(now, entry_time); |
1347 | if (delta.tv64 > cpu_base->max_hang_time.tv64) | 1361 | if (delta.tv64 > cpu_base->max_hang_time.tv64) |
1348 | cpu_base->max_hang_time = delta; | 1362 | cpu_base->max_hang_time = delta; |
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) | |||
1395 | 1409 | ||
1396 | static void run_hrtimer_softirq(struct softirq_action *h) | 1410 | static void run_hrtimer_softirq(struct softirq_action *h) |
1397 | { | 1411 | { |
1412 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1413 | |||
1414 | if (cpu_base->clock_was_set) { | ||
1415 | cpu_base->clock_was_set = 0; | ||
1416 | clock_was_set(); | ||
1417 | } | ||
1418 | |||
1398 | hrtimer_peek_ahead_timers(); | 1419 | hrtimer_peek_ahead_timers(); |
1399 | } | 1420 | } |
1400 | 1421 | ||
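The clock_was_set_delayed() addition above is a small deferral pattern: code that may be running in hard interrupt context only sets a per-cpu flag and raises HRTIMER_SOFTIRQ, and the softirq handler does the expensive clock_was_set() walk later. The same flag-and-raise shape, reduced to a hypothetical driver-side sketch (my_* names are illustrative, not kernel symbols):

#include <linux/interrupt.h>
#include <linux/atomic.h>
#include <linux/printk.h>

static atomic_t my_pending;
static void my_deferred_fn(unsigned long data);
static DECLARE_TASKLET(my_tasklet, my_deferred_fn, 0);

static irqreturn_t my_hardirq(int irq, void *dev_id)
{
        atomic_set(&my_pending, 1);     /* cheap: just note the event ... */
        tasklet_schedule(&my_tasklet);  /* ... and defer the real work */
        return IRQ_HANDLED;
}

static void my_deferred_fn(unsigned long data)
{
        if (atomic_xchg(&my_pending, 0))
                pr_info("handling deferred clock update\n");
}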
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index bdb180325551..131ca176b497 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -133,7 +133,7 @@ irqreturn_t | |||
133 | handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | 133 | handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) |
134 | { | 134 | { |
135 | irqreturn_t retval = IRQ_NONE; | 135 | irqreturn_t retval = IRQ_NONE; |
136 | unsigned int random = 0, irq = desc->irq_data.irq; | 136 | unsigned int flags = 0, irq = desc->irq_data.irq; |
137 | 137 | ||
138 | do { | 138 | do { |
139 | irqreturn_t res; | 139 | irqreturn_t res; |
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | |||
161 | 161 | ||
162 | /* Fall through to add to randomness */ | 162 | /* Fall through to add to randomness */ |
163 | case IRQ_HANDLED: | 163 | case IRQ_HANDLED: |
164 | random |= action->flags; | 164 | flags |= action->flags; |
165 | break; | 165 | break; |
166 | 166 | ||
167 | default: | 167 | default: |
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | |||
172 | action = action->next; | 172 | action = action->next; |
173 | } while (action); | 173 | } while (action); |
174 | 174 | ||
175 | if (random & IRQF_SAMPLE_RANDOM) | 175 | add_interrupt_randomness(irq, flags); |
176 | add_interrupt_randomness(irq); | ||
177 | 176 | ||
178 | if (!noirqdebug) | 177 | if (!noirqdebug) |
179 | note_interrupt(irq, desc, retval); | 178 | note_interrupt(irq, desc, retval); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 41c1564103f1..49a77727db42 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/mutex.h> | 10 | #include <linux/mutex.h> |
11 | #include <linux/of.h> | 11 | #include <linux/of.h> |
12 | #include <linux/of_address.h> | 12 | #include <linux/of_address.h> |
13 | #include <linux/topology.h> | ||
13 | #include <linux/seq_file.h> | 14 | #include <linux/seq_file.h> |
14 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
15 | #include <linux/smp.h> | 16 | #include <linux/smp.h> |
@@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, | |||
45 | { | 46 | { |
46 | struct irq_domain *domain; | 47 | struct irq_domain *domain; |
47 | 48 | ||
48 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); | 49 | domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, |
50 | of_node_to_nid(of_node)); | ||
49 | if (WARN_ON(!domain)) | 51 | if (WARN_ON(!domain)) |
50 | return NULL; | 52 | return NULL; |
51 | 53 | ||
@@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | |||
138 | } | 140 | } |
139 | 141 | ||
140 | /** | 142 | /** |
143 | * irq_domain_add_simple() - Allocate and register a simple irq_domain. | ||
144 | * @of_node: pointer to interrupt controller's device tree node. | ||
145 | * @size: total number of irqs in mapping | ||
146 | * @first_irq: first number of irq block assigned to the domain | ||
147 | * @ops: map/unmap domain callbacks | ||
148 | * @host_data: Controller private data pointer | ||
149 | * | ||
150 | * Allocates a legacy irq_domain if first_irq is positive or a linear | ||
151 | * domain otherwise. | ||
152 | * | ||
153 | * This is intended to implement the expected behaviour for most | ||
154 | * interrupt controllers which is that a linear mapping should | ||
155 | * normally be used unless the system requires a legacy mapping in | ||
156 | * order to support supplying interrupt numbers during non-DT | ||
157 | * registration of devices. | ||
158 | */ | ||
159 | struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | ||
160 | unsigned int size, | ||
161 | unsigned int first_irq, | ||
162 | const struct irq_domain_ops *ops, | ||
163 | void *host_data) | ||
164 | { | ||
165 | if (first_irq > 0) | ||
166 | return irq_domain_add_legacy(of_node, size, first_irq, 0, | ||
167 | ops, host_data); | ||
168 | else | ||
169 | return irq_domain_add_linear(of_node, size, ops, host_data); | ||
170 | } | ||
171 | |||
172 | /** | ||
141 | * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. | 173 | * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. |
142 | * @of_node: pointer to interrupt controller's device tree node. | 174 | * @of_node: pointer to interrupt controller's device tree node. |
143 | * @size: total number of irqs in legacy mapping | 175 | * @size: total number of irqs in legacy mapping |
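A hedged sketch of how an interrupt controller driver might use the new irq_domain_add_simple() helper from the hunk above; the ops table, the domain size, and the chip choice are placeholders, not taken from this patch:

static int my_intc_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
        return 0;
}

static const struct irq_domain_ops my_intc_ops = {
        .map   = my_intc_map,
        .xlate = irq_domain_xlate_onecell,
};

/* 32 hwirqs; first_irq == 0 selects a linear domain, while a positive
 * first_irq would create a legacy domain instead. */
static struct irq_domain *my_intc_probe(struct device_node *np)
{
        return irq_domain_add_simple(np, 32, 0, &my_intc_ops, NULL);
}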
@@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
203 | * one can then use irq_create_mapping() to | 235 | * one can then use irq_create_mapping() to |
204 | * explicitly change them | 236 | * explicitly change them |
205 | */ | 237 | */ |
206 | ops->map(domain, irq, hwirq); | 238 | if (ops->map) |
239 | ops->map(domain, irq, hwirq); | ||
207 | 240 | ||
208 | /* Clear norequest flags */ | 241 | /* Clear norequest flags */ |
209 | irq_clear_status_flags(irq, IRQ_NOREQUEST); | 242 | irq_clear_status_flags(irq, IRQ_NOREQUEST); |
@@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
215 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); | 248 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); |
216 | 249 | ||
217 | /** | 250 | /** |
218 | * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. | 251 | * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. |
219 | * @of_node: pointer to interrupt controller's device tree node. | 252 | * @of_node: pointer to interrupt controller's device tree node. |
220 | * @size: Number of interrupts in the domain. | 253 | * @size: Number of interrupts in the domain. |
221 | * @ops: map/unmap domain callbacks | 254 | * @ops: map/unmap domain callbacks |
@@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, | |||
229 | struct irq_domain *domain; | 262 | struct irq_domain *domain; |
230 | unsigned int *revmap; | 263 | unsigned int *revmap; |
231 | 264 | ||
232 | revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); | 265 | revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, |
266 | of_node_to_nid(of_node)); | ||
233 | if (WARN_ON(!revmap)) | 267 | if (WARN_ON(!revmap)) |
234 | return NULL; | 268 | return NULL; |
235 | 269 | ||
@@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain) | |||
330 | } | 364 | } |
331 | EXPORT_SYMBOL_GPL(irq_set_default_host); | 365 | EXPORT_SYMBOL_GPL(irq_set_default_host); |
332 | 366 | ||
333 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | 367 | static void irq_domain_disassociate_many(struct irq_domain *domain, |
334 | irq_hw_number_t hwirq) | 368 | unsigned int irq_base, int count) |
335 | { | 369 | { |
336 | struct irq_data *irq_data = irq_get_irq_data(virq); | 370 | /* |
371 | * disassociate in reverse order; | ||
372 | * not strictly necessary, but nice for unwinding | ||
373 | */ | ||
374 | while (count--) { | ||
375 | int irq = irq_base + count; | ||
376 | struct irq_data *irq_data = irq_get_irq_data(irq); | ||
377 | irq_hw_number_t hwirq = irq_data->hwirq; | ||
378 | |||
379 | if (WARN_ON(!irq_data || irq_data->domain != domain)) | ||
380 | continue; | ||
381 | |||
382 | irq_set_status_flags(irq, IRQ_NOREQUEST); | ||
383 | |||
384 | /* remove chip and handler */ | ||
385 | irq_set_chip_and_handler(irq, NULL, NULL); | ||
386 | |||
387 | /* Make sure it's completed */ | ||
388 | synchronize_irq(irq); | ||
389 | |||
390 | /* Tell the PIC about it */ | ||
391 | if (domain->ops->unmap) | ||
392 | domain->ops->unmap(domain, irq); | ||
393 | smp_mb(); | ||
337 | 394 | ||
338 | irq_data->hwirq = hwirq; | ||
339 | irq_data->domain = domain; | ||
340 | if (domain->ops->map(domain, virq, hwirq)) { | ||
341 | pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq); | ||
342 | irq_data->domain = NULL; | 395 | irq_data->domain = NULL; |
343 | irq_data->hwirq = 0; | 396 | irq_data->hwirq = 0; |
344 | return -1; | 397 | |
398 | /* Clear reverse map */ | ||
399 | switch(domain->revmap_type) { | ||
400 | case IRQ_DOMAIN_MAP_LINEAR: | ||
401 | if (hwirq < domain->revmap_data.linear.size) | ||
402 | domain->revmap_data.linear.revmap[hwirq] = 0; | ||
403 | break; | ||
404 | case IRQ_DOMAIN_MAP_TREE: | ||
405 | mutex_lock(&revmap_trees_mutex); | ||
406 | radix_tree_delete(&domain->revmap_data.tree, hwirq); | ||
407 | mutex_unlock(&revmap_trees_mutex); | ||
408 | break; | ||
409 | } | ||
345 | } | 410 | } |
411 | } | ||
412 | |||
413 | int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, | ||
414 | irq_hw_number_t hwirq_base, int count) | ||
415 | { | ||
416 | unsigned int virq = irq_base; | ||
417 | irq_hw_number_t hwirq = hwirq_base; | ||
418 | int i, ret; | ||
419 | |||
420 | pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, | ||
421 | of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); | ||
422 | |||
423 | for (i = 0; i < count; i++) { | ||
424 | struct irq_data *irq_data = irq_get_irq_data(virq + i); | ||
425 | |||
426 | if (WARN(!irq_data, "error: irq_desc not allocated; " | ||
427 | "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) | ||
428 | return -EINVAL; | ||
429 | if (WARN(irq_data->domain, "error: irq_desc already associated; " | ||
430 | "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) | ||
431 | return -EINVAL; | ||
432 | }; | ||
433 | |||
434 | for (i = 0; i < count; i++, virq++, hwirq++) { | ||
435 | struct irq_data *irq_data = irq_get_irq_data(virq); | ||
436 | |||
437 | irq_data->hwirq = hwirq; | ||
438 | irq_data->domain = domain; | ||
439 | if (domain->ops->map) { | ||
440 | ret = domain->ops->map(domain, virq, hwirq); | ||
441 | if (ret != 0) { | ||
442 | pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", | ||
443 | virq, hwirq, ret); | ||
444 | WARN_ON(1); | ||
445 | irq_data->domain = NULL; | ||
446 | irq_data->hwirq = 0; | ||
447 | goto err_unmap; | ||
448 | } | ||
449 | } | ||
346 | 450 | ||
347 | irq_clear_status_flags(virq, IRQ_NOREQUEST); | 451 | switch (domain->revmap_type) { |
452 | case IRQ_DOMAIN_MAP_LINEAR: | ||
453 | if (hwirq < domain->revmap_data.linear.size) | ||
454 | domain->revmap_data.linear.revmap[hwirq] = virq; | ||
455 | break; | ||
456 | case IRQ_DOMAIN_MAP_TREE: | ||
457 | mutex_lock(&revmap_trees_mutex); | ||
458 | radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); | ||
459 | mutex_unlock(&revmap_trees_mutex); | ||
460 | break; | ||
461 | } | ||
462 | |||
463 | irq_clear_status_flags(virq, IRQ_NOREQUEST); | ||
464 | } | ||
348 | 465 | ||
349 | return 0; | 466 | return 0; |
467 | |||
468 | err_unmap: | ||
469 | irq_domain_disassociate_many(domain, irq_base, i); | ||
470 | return -EINVAL; | ||
350 | } | 471 | } |
472 | EXPORT_SYMBOL_GPL(irq_domain_associate_many); | ||
351 | 473 | ||
352 | /** | 474 | /** |
353 | * irq_create_direct_mapping() - Allocate an irq for direct mapping | 475 | * irq_create_direct_mapping() - Allocate an irq for direct mapping |
@@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
364 | if (domain == NULL) | 486 | if (domain == NULL) |
365 | domain = irq_default_domain; | 487 | domain = irq_default_domain; |
366 | 488 | ||
367 | BUG_ON(domain == NULL); | 489 | if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) |
368 | WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); | 490 | return 0; |
369 | 491 | ||
370 | virq = irq_alloc_desc_from(1, 0); | 492 | virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); |
371 | if (!virq) { | 493 | if (!virq) { |
372 | pr_debug("create_direct virq allocation failed\n"); | 494 | pr_debug("create_direct virq allocation failed\n"); |
373 | return 0; | 495 | return 0; |
@@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
380 | } | 502 | } |
381 | pr_debug("create_direct obtained virq %d\n", virq); | 503 | pr_debug("create_direct obtained virq %d\n", virq); |
382 | 504 | ||
383 | if (irq_setup_virq(domain, virq, virq)) { | 505 | if (irq_domain_associate(domain, virq, virq)) { |
384 | irq_free_desc(virq); | 506 | irq_free_desc(virq); |
385 | return 0; | 507 | return 0; |
386 | } | 508 | } |
@@ -433,27 +555,64 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
433 | hint = hwirq % nr_irqs; | 555 | hint = hwirq % nr_irqs; |
434 | if (hint == 0) | 556 | if (hint == 0) |
435 | hint++; | 557 | hint++; |
436 | virq = irq_alloc_desc_from(hint, 0); | 558 | virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); |
437 | if (virq <= 0) | 559 | if (virq <= 0) |
438 | virq = irq_alloc_desc_from(1, 0); | 560 | virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); |
439 | if (virq <= 0) { | 561 | if (virq <= 0) { |
440 | pr_debug("-> virq allocation failed\n"); | 562 | pr_debug("-> virq allocation failed\n"); |
441 | return 0; | 563 | return 0; |
442 | } | 564 | } |
443 | 565 | ||
444 | if (irq_setup_virq(domain, virq, hwirq)) { | 566 | if (irq_domain_associate(domain, virq, hwirq)) { |
445 | if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) | 567 | irq_free_desc(virq); |
446 | irq_free_desc(virq); | ||
447 | return 0; | 568 | return 0; |
448 | } | 569 | } |
449 | 570 | ||
450 | pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", | 571 | pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", |
451 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); | 572 | hwirq, of_node_full_name(domain->of_node), virq); |
452 | 573 | ||
453 | return virq; | 574 | return virq; |
454 | } | 575 | } |
455 | EXPORT_SYMBOL_GPL(irq_create_mapping); | 576 | EXPORT_SYMBOL_GPL(irq_create_mapping); |
456 | 577 | ||
578 | /** | ||
579 | * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs | ||
580 | * @domain: domain owning the interrupt range | ||
581 | * @irq_base: beginning of linux IRQ range | ||
582 | * @hwirq_base: beginning of hardware IRQ range | ||
583 | * @count: Number of interrupts to map | ||
584 | * | ||
585 | * This routine is used for allocating and mapping a range of hardware | ||
586 | * irqs to linux irqs where the linux irq numbers are at pre-defined | ||
587 | * locations. For use by controllers that already have static mappings | ||
588 | * to insert in to the domain. | ||
589 | * | ||
590 | * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time | ||
591 | * domain insertion. | ||
592 | * | ||
593 | * 0 is returned upon success, while any failure to establish a static | ||
594 | * mapping is treated as an error. | ||
595 | */ | ||
596 | int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, | ||
597 | irq_hw_number_t hwirq_base, int count) | ||
598 | { | ||
599 | int ret; | ||
600 | |||
601 | ret = irq_alloc_descs(irq_base, irq_base, count, | ||
602 | of_node_to_nid(domain->of_node)); | ||
603 | if (unlikely(ret < 0)) | ||
604 | return ret; | ||
605 | |||
606 | ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); | ||
607 | if (unlikely(ret < 0)) { | ||
608 | irq_free_descs(irq_base, count); | ||
609 | return ret; | ||
610 | } | ||
611 | |||
612 | return 0; | ||
613 | } | ||
614 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); | ||
615 | |||
457 | unsigned int irq_create_of_mapping(struct device_node *controller, | 616 | unsigned int irq_create_of_mapping(struct device_node *controller, |
458 | const u32 *intspec, unsigned int intsize) | 617 | const u32 *intspec, unsigned int intsize) |
459 | { | 618 | { |
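And a short sketch of the second new entry point, irq_create_strict_mappings(), for a controller whose linux irq numbers are fixed ahead of time; the irq and hwirq ranges below are invented for illustration:

/* Map hwirqs 16..31 onto the pre-reserved linux irqs 64..79. */
static int my_intc_static_setup(struct irq_domain *domain)
{
        int ret;

        ret = irq_create_strict_mappings(domain, 64, 16, 16);
        if (ret)
                pr_err("static irq mapping failed: %d\n", ret);
        return ret;
}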
@@ -477,7 +636,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, | |||
477 | return intspec[0]; | 636 | return intspec[0]; |
478 | #endif | 637 | #endif |
479 | pr_warning("no irq domain found for %s !\n", | 638 | pr_warning("no irq domain found for %s !\n", |
480 | controller->full_name); | 639 | of_node_full_name(controller)); |
481 | return 0; | 640 | return 0; |
482 | } | 641 | } |
483 | 642 | ||
@@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq) | |||
511 | { | 670 | { |
512 | struct irq_data *irq_data = irq_get_irq_data(virq); | 671 | struct irq_data *irq_data = irq_get_irq_data(virq); |
513 | struct irq_domain *domain; | 672 | struct irq_domain *domain; |
514 | irq_hw_number_t hwirq; | ||
515 | 673 | ||
516 | if (!virq || !irq_data) | 674 | if (!virq || !irq_data) |
517 | return; | 675 | return; |
@@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq) | |||
524 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) | 682 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) |
525 | return; | 683 | return; |
526 | 684 | ||
527 | irq_set_status_flags(virq, IRQ_NOREQUEST); | 685 | irq_domain_disassociate_many(domain, virq, 1); |
528 | |||
529 | /* remove chip and handler */ | ||
530 | irq_set_chip_and_handler(virq, NULL, NULL); | ||
531 | |||
532 | /* Make sure it's completed */ | ||
533 | synchronize_irq(virq); | ||
534 | |||
535 | /* Tell the PIC about it */ | ||
536 | if (domain->ops->unmap) | ||
537 | domain->ops->unmap(domain, virq); | ||
538 | smp_mb(); | ||
539 | |||
540 | /* Clear reverse map */ | ||
541 | hwirq = irq_data->hwirq; | ||
542 | switch(domain->revmap_type) { | ||
543 | case IRQ_DOMAIN_MAP_LINEAR: | ||
544 | if (hwirq < domain->revmap_data.linear.size) | ||
545 | domain->revmap_data.linear.revmap[hwirq] = 0; | ||
546 | break; | ||
547 | case IRQ_DOMAIN_MAP_TREE: | ||
548 | mutex_lock(&revmap_trees_mutex); | ||
549 | radix_tree_delete(&domain->revmap_data.tree, hwirq); | ||
550 | mutex_unlock(&revmap_trees_mutex); | ||
551 | break; | ||
552 | } | ||
553 | |||
554 | irq_free_desc(virq); | 686 | irq_free_desc(virq); |
555 | } | 687 | } |
556 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | 688 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); |
@@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping); | |||
559 | * irq_find_mapping() - Find a linux irq from an hw irq number. | 691 | * irq_find_mapping() - Find a linux irq from an hw irq number. |
560 | * @domain: domain owning this hardware interrupt | 692 | * @domain: domain owning this hardware interrupt |
561 | * @hwirq: hardware irq number in that domain space | 693 | * @hwirq: hardware irq number in that domain space |
562 | * | ||
563 | * This is a slow path, for use by generic code. It's expected that an | ||
564 | * irq controller implementation directly calls the appropriate low level | ||
565 | * mapping function. | ||
566 | */ | 694 | */ |
567 | unsigned int irq_find_mapping(struct irq_domain *domain, | 695 | unsigned int irq_find_mapping(struct irq_domain *domain, |
568 | irq_hw_number_t hwirq) | 696 | irq_hw_number_t hwirq) |
569 | { | 697 | { |
570 | unsigned int i; | 698 | struct irq_data *data; |
571 | unsigned int hint = hwirq % nr_irqs; | ||
572 | 699 | ||
573 | /* Look for default domain if necessary */ | 700 | /* Look for default domain if necessary */ |
574 | if (domain == NULL) | 701 | if (domain == NULL) |
@@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain, | |||
576 | if (domain == NULL) | 703 | if (domain == NULL) |
577 | return 0; | 704 | return 0; |
578 | 705 | ||
579 | /* legacy -> bail early */ | 706 | switch (domain->revmap_type) { |
580 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) | 707 | case IRQ_DOMAIN_MAP_LEGACY: |
581 | return irq_domain_legacy_revmap(domain, hwirq); | 708 | return irq_domain_legacy_revmap(domain, hwirq); |
582 | 709 | case IRQ_DOMAIN_MAP_LINEAR: | |
583 | /* Slow path does a linear search of the map */ | 710 | return irq_linear_revmap(domain, hwirq); |
584 | if (hint == 0) | 711 | case IRQ_DOMAIN_MAP_TREE: |
585 | hint = 1; | 712 | rcu_read_lock(); |
586 | i = hint; | 713 | data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); |
587 | do { | 714 | rcu_read_unlock(); |
588 | struct irq_data *data = irq_get_irq_data(i); | 715 | if (data) |
716 | return data->irq; | ||
717 | break; | ||
718 | case IRQ_DOMAIN_MAP_NOMAP: | ||
719 | data = irq_get_irq_data(hwirq); | ||
589 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) | 720 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) |
590 | return i; | 721 | return hwirq; |
591 | i++; | 722 | break; |
592 | if (i >= nr_irqs) | 723 | } |
593 | i = 1; | 724 | |
594 | } while(i != hint); | ||
595 | return 0; | 725 | return 0; |
596 | } | 726 | } |
597 | EXPORT_SYMBOL_GPL(irq_find_mapping); | 727 | EXPORT_SYMBOL_GPL(irq_find_mapping); |
598 | 728 | ||
599 | /** | 729 | /** |
600 | * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. | ||
601 | * @domain: domain owning this hardware interrupt | ||
602 | * @hwirq: hardware irq number in that domain space | ||
603 | * | ||
604 | * This is a fast path, for use by irq controller code that uses radix tree | ||
605 | * revmaps | ||
606 | */ | ||
607 | unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, | ||
608 | irq_hw_number_t hwirq) | ||
609 | { | ||
610 | struct irq_data *irq_data; | ||
611 | |||
612 | if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) | ||
613 | return irq_find_mapping(domain, hwirq); | ||
614 | |||
615 | /* | ||
616 | * Freeing an irq can delete nodes along the path to | ||
617 | * do the lookup via call_rcu. | ||
618 | */ | ||
619 | rcu_read_lock(); | ||
620 | irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); | ||
621 | rcu_read_unlock(); | ||
622 | |||
623 | /* | ||
624 | * If found in radix tree, then fine. | ||
625 | * Else fallback to linear lookup - this should not happen in practice | ||
626 | * as it means that we failed to insert the node in the radix tree. | ||
627 | */ | ||
628 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); | ||
629 | } | ||
630 | EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup); | ||
631 | |||
632 | /** | ||
633 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. | ||
634 | * @domain: domain owning this hardware interrupt | ||
635 | * @virq: linux irq number | ||
636 | * @hwirq: hardware irq number in that domain space | ||
637 | * | ||
638 | * This is for use by irq controllers that use a radix tree reverse | ||
639 | * mapping for fast lookup. | ||
640 | */ | ||
641 | void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, | ||
642 | irq_hw_number_t hwirq) | ||
643 | { | ||
644 | struct irq_data *irq_data = irq_get_irq_data(virq); | ||
645 | |||
646 | if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) | ||
647 | return; | ||
648 | |||
649 | if (virq) { | ||
650 | mutex_lock(&revmap_trees_mutex); | ||
651 | radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); | ||
652 | mutex_unlock(&revmap_trees_mutex); | ||
653 | } | ||
654 | } | ||
655 | EXPORT_SYMBOL_GPL(irq_radix_revmap_insert); | ||
656 | |||
657 | /** | ||
658 | * irq_linear_revmap() - Find a linux irq from a hw irq number. | 730 | * irq_linear_revmap() - Find a linux irq from a hw irq number. |
659 | * @domain: domain owning this hardware interrupt | 731 | * @domain: domain owning this hardware interrupt |
660 | * @hwirq: hardware irq number in that domain space | 732 | * @hwirq: hardware irq number in that domain space |
661 | * | 733 | * |
662 | * This is a fast path, for use by irq controller code that uses linear | 734 | * This is a fast path that can be called directly by irq controller code to |
663 | * revmaps. It does fallback to the slow path if the revmap doesn't exist | 735 | * save a handful of instructions. |
664 | * yet and will create the revmap entry with appropriate locking | ||
665 | */ | 736 | */ |
666 | unsigned int irq_linear_revmap(struct irq_domain *domain, | 737 | unsigned int irq_linear_revmap(struct irq_domain *domain, |
667 | irq_hw_number_t hwirq) | 738 | irq_hw_number_t hwirq) |
668 | { | 739 | { |
669 | unsigned int *revmap; | 740 | BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); |
670 | |||
671 | if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) | ||
672 | return irq_find_mapping(domain, hwirq); | ||
673 | 741 | ||
674 | /* Check revmap bounds */ | 742 | /* Check revmap bounds; complain if exceeded */ |
675 | if (unlikely(hwirq >= domain->revmap_data.linear.size)) | 743 | if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) |
676 | return irq_find_mapping(domain, hwirq); | 744 | return 0; |
677 | |||
678 | /* Check if revmap was allocated */ | ||
679 | revmap = domain->revmap_data.linear.revmap; | ||
680 | if (unlikely(revmap == NULL)) | ||
681 | return irq_find_mapping(domain, hwirq); | ||
682 | |||
683 | /* Fill up revmap with slow path if no mapping found */ | ||
684 | if (unlikely(!revmap[hwirq])) | ||
685 | revmap[hwirq] = irq_find_mapping(domain, hwirq); | ||
686 | 745 | ||
687 | return revmap[hwirq]; | 746 | return domain->revmap_data.linear.revmap[hwirq]; |
688 | } | 747 | } |
689 | EXPORT_SYMBOL_GPL(irq_linear_revmap); | 748 | EXPORT_SYMBOL_GPL(irq_linear_revmap); |
690 | 749 | ||
@@ -725,8 +784,8 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
725 | data = irq_desc_get_chip_data(desc); | 784 | data = irq_desc_get_chip_data(desc); |
726 | seq_printf(m, data ? "0x%p " : " %p ", data); | 785 | seq_printf(m, data ? "0x%p " : " %p ", data); |
727 | 786 | ||
728 | if (desc->irq_data.domain && desc->irq_data.domain->of_node) | 787 | if (desc->irq_data.domain) |
729 | p = desc->irq_data.domain->of_node->full_name; | 788 | p = of_node_full_name(desc->irq_data.domain->of_node); |
730 | else | 789 | else |
731 | p = none; | 790 | p = none; |
732 | seq_printf(m, "%s\n", p); | 791 | seq_printf(m, "%s\n", p); |
@@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void) | |||
761 | __initcall(irq_debugfs_init); | 820 | __initcall(irq_debugfs_init); |
762 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ | 821 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ |
763 | 822 | ||
764 | static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, | ||
765 | irq_hw_number_t hwirq) | ||
766 | { | ||
767 | return 0; | ||
768 | } | ||
769 | |||
770 | /** | 823 | /** |
771 | * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings | 824 | * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings |
772 | * | 825 | * |
@@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, | |||
829 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); | 882 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); |
830 | 883 | ||
831 | const struct irq_domain_ops irq_domain_simple_ops = { | 884 | const struct irq_domain_ops irq_domain_simple_ops = { |
832 | .map = irq_domain_simple_map, | ||
833 | .xlate = irq_domain_xlate_onetwocell, | 885 | .xlate = irq_domain_xlate_onetwocell, |
834 | }; | 886 | }; |
835 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | 887 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba39..4c69326aa773 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc) | |||
781 | wake_up(&desc->wait_for_threads); | 781 | wake_up(&desc->wait_for_threads); |
782 | } | 782 | } |
783 | 783 | ||
784 | static void irq_thread_dtor(struct task_work *unused) | 784 | static void irq_thread_dtor(struct callback_head *unused) |
785 | { | 785 | { |
786 | struct task_struct *tsk = current; | 786 | struct task_struct *tsk = current; |
787 | struct irq_desc *desc; | 787 | struct irq_desc *desc; |
@@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused) | |||
813 | */ | 813 | */ |
814 | static int irq_thread(void *data) | 814 | static int irq_thread(void *data) |
815 | { | 815 | { |
816 | struct task_work on_exit_work; | 816 | struct callback_head on_exit_work; |
817 | static const struct sched_param param = { | 817 | static const struct sched_param param = { |
818 | .sched_priority = MAX_USER_RT_PRIO/2, | 818 | .sched_priority = MAX_USER_RT_PRIO/2, |
819 | }; | 819 | }; |
@@ -830,7 +830,7 @@ static int irq_thread(void *data) | |||
830 | 830 | ||
831 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 831 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
832 | 832 | ||
833 | init_task_work(&on_exit_work, irq_thread_dtor, NULL); | 833 | init_task_work(&on_exit_work, irq_thread_dtor); |
834 | task_work_add(current, &on_exit_work, false); | 834 | task_work_add(current, &on_exit_work, false); |
835 | 835 | ||
836 | while (!irq_wait_for_interrupt(action)) { | 836 | while (!irq_wait_for_interrupt(action)) { |
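The manage.c hunk above tracks the task_work API change from struct task_work to struct callback_head and the two-argument init_task_work(); callers now supply only the callback. A hedged sketch of that usage, with a hypothetical exit-time callback that is not part of this patch:

#include <linux/task_work.h>
#include <linux/printk.h>

static void my_exit_cb(struct callback_head *head)
{
        pr_info("%s: task %d is exiting\n", __func__, task_pid_nr(current));
}

static struct callback_head my_exit_work;

static void my_register_exit_work(void)
{
        init_task_work(&my_exit_work, my_exit_cb);       /* data argument is gone */
        if (task_work_add(current, &my_exit_work, false))  /* false: no signal notify */
                pr_warn("task is already exiting\n");
}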
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
893 | return -ENOSYS; | 893 | return -ENOSYS; |
894 | if (!try_module_get(desc->owner)) | 894 | if (!try_module_get(desc->owner)) |
895 | return -ENODEV; | 895 | return -ENODEV; |
896 | /* | ||
897 | * Some drivers like serial.c use request_irq() heavily, | ||
898 | * so we have to be careful not to interfere with a | ||
899 | * running system. | ||
900 | */ | ||
901 | if (new->flags & IRQF_SAMPLE_RANDOM) { | ||
902 | /* | ||
903 | * This function might sleep, we want to call it first, | ||
904 | * outside of the atomic block. | ||
905 | * Yes, this might clear the entropy pool if the wrong | ||
906 | * driver is attempted to be loaded, without actually | ||
907 | * installing a new handler, but is this really a problem, | ||
908 | * only the sysadmin is able to do this. | ||
909 | */ | ||
910 | rand_initialize_irq(irq); | ||
911 | } | ||
912 | 896 | ||
913 | /* | 897 | /* |
914 | * Check whether the interrupt nests into another interrupt | 898 | * Check whether the interrupt nests into another interrupt |
@@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
960 | } | 944 | } |
961 | 945 | ||
962 | /* | 946 | /* |
947 | * Drivers are often written to work w/o knowledge about the | ||
948 | * underlying irq chip implementation, so a request for a | ||
949 | * threaded irq without a primary hard irq context handler | ||
950 | * requires the ONESHOT flag to be set. Some irq chips like | ||
951 | * MSI based interrupts are per se one shot safe. Check the | ||
952 | * chip flags, so we can avoid the unmask dance at the end of | ||
953 | * the threaded handler for those. | ||
954 | */ | ||
955 | if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) | ||
956 | new->flags &= ~IRQF_ONESHOT; | ||
957 | |||
958 | /* | ||
963 | * The following block of code has to be executed atomically | 959 | * The following block of code has to be executed atomically |
964 | */ | 960 | */ |
965 | raw_spin_lock_irqsave(&desc->lock, flags); | 961 | raw_spin_lock_irqsave(&desc->lock, flags); |
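The ONESHOT handling above concerns drivers that request a threaded handler with no primary handler: such a request must pass IRQF_ONESHOT unless the chip is marked IRQCHIP_ONESHOT_SAFE, in which case the flag is now stripped to skip the unmask dance. A minimal hypothetical request (irq number, trigger type, and names invented):

#include <linux/interrupt.h>

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
        /* runs in process context and may sleep, e.g. for I2C transfers */
        return IRQ_HANDLED;
}

static int my_probe_irq(unsigned int irq, void *dev)
{
        /* NULL primary handler => IRQF_ONESHOT is mandatory here,
         * unless the underlying chip sets IRQCHIP_ONESHOT_SAFE. */
        return request_threaded_irq(irq, NULL, my_thread_fn,
                                    IRQF_ONESHOT | IRQF_TRIGGER_LOW,
                                    "my-device", dev);
}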
@@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1033 | */ | 1029 | */ |
1034 | new->thread_mask = 1 << ffz(thread_mask); | 1030 | new->thread_mask = 1 << ffz(thread_mask); |
1035 | 1031 | ||
1036 | } else if (new->handler == irq_default_primary_handler) { | 1032 | } else if (new->handler == irq_default_primary_handler && |
1033 | !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { | ||
1037 | /* | 1034 | /* |
1038 | * The interrupt was requested with handler = NULL, so | 1035 | * The interrupt was requested with handler = NULL, so |
1039 | * we use the default primary handler for it. But it | 1036 | * we use the default primary handler for it. But it |
@@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq); | |||
1354 | * Flags: | 1351 | * Flags: |
1355 | * | 1352 | * |
1356 | * IRQF_SHARED Interrupt is shared | 1353 | * IRQF_SHARED Interrupt is shared |
1357 | * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy | ||
1358 | * IRQF_TRIGGER_* Specify active edge(s) or level | 1354 | * IRQF_TRIGGER_* Specify active edge(s) or level |
1359 | * | 1355 | * |
1360 | */ | 1356 | */ |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 4e2e472f6aeb..0668d58d6413 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void) | |||
1424 | 1424 | ||
1425 | void crash_save_vmcoreinfo(void) | 1425 | void crash_save_vmcoreinfo(void) |
1426 | { | 1426 | { |
1427 | vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); | 1427 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); |
1428 | update_vmcoreinfo_note(); | 1428 | update_vmcoreinfo_note(); |
1429 | } | 1429 | } |
1430 | 1430 | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index ff2c7cb86d77..6f99aead66c6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -45,6 +45,13 @@ extern int max_threads; | |||
45 | 45 | ||
46 | static struct workqueue_struct *khelper_wq; | 46 | static struct workqueue_struct *khelper_wq; |
47 | 47 | ||
48 | /* | ||
49 | * kmod_thread_locker is used for deadlock avoidance. There is no explicit | ||
50 | * locking to protect this global - it is private to the singleton khelper | ||
51 | * thread and should only ever be modified by that thread. | ||
52 | */ | ||
53 | static const struct task_struct *kmod_thread_locker; | ||
54 | |||
48 | #define CAP_BSET (void *)1 | 55 | #define CAP_BSET (void *)1 |
49 | #define CAP_PI (void *)2 | 56 | #define CAP_PI (void *)2 |
50 | 57 | ||
@@ -221,6 +228,13 @@ fail: | |||
221 | return 0; | 228 | return 0; |
222 | } | 229 | } |
223 | 230 | ||
231 | static int call_helper(void *data) | ||
232 | { | ||
233 | /* Worker thread started blocking khelper thread. */ | ||
234 | kmod_thread_locker = current; | ||
235 | return ____call_usermodehelper(data); | ||
236 | } | ||
237 | |||
224 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | 238 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) |
225 | { | 239 | { |
226 | if (info->cleanup) | 240 | if (info->cleanup) |
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work) | |||
295 | if (wait == UMH_WAIT_PROC) | 309 | if (wait == UMH_WAIT_PROC) |
296 | pid = kernel_thread(wait_for_helper, sub_info, | 310 | pid = kernel_thread(wait_for_helper, sub_info, |
297 | CLONE_FS | CLONE_FILES | SIGCHLD); | 311 | CLONE_FS | CLONE_FILES | SIGCHLD); |
298 | else | 312 | else { |
299 | pid = kernel_thread(____call_usermodehelper, sub_info, | 313 | pid = kernel_thread(call_helper, sub_info, |
300 | CLONE_VFORK | SIGCHLD); | 314 | CLONE_VFORK | SIGCHLD); |
315 | /* Worker thread stopped blocking khelper thread. */ | ||
316 | kmod_thread_locker = NULL; | ||
317 | } | ||
301 | 318 | ||
302 | switch (wait) { | 319 | switch (wait) { |
303 | case UMH_NO_WAIT: | 320 | case UMH_NO_WAIT: |
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
548 | retval = -EBUSY; | 565 | retval = -EBUSY; |
549 | goto out; | 566 | goto out; |
550 | } | 567 | } |
568 | /* | ||
569 | * The worker thread must not wait for the khelper thread at the | ||
570 | * wait_for_completion() below if it was created with CLONE_VFORK, | ||
571 | * because the khelper thread is already waiting for that thread at | ||
572 | * wait_for_completion() in do_fork(). | ||
573 | */ | ||
574 | if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { | ||
575 | retval = -EBUSY; | ||
576 | goto out; | ||
577 | } | ||
551 | 578 | ||
552 | sub_info->complete = &done; | 579 | sub_info->complete = &done; |
553 | sub_info->wait = wait; | 580 | sub_info->wait = wait; |
@@ -577,6 +604,12 @@ unlock: | |||
577 | return retval; | 604 | return retval; |
578 | } | 605 | } |
579 | 606 | ||
607 | /* | ||
608 | * call_usermodehelper_fns() will not run the caller-provided cleanup function | ||
609 | * if a memory allocation failure is experienced. So the caller might need to | ||
610 | * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform | ||
611 | * the necessaary cleanup within the caller. | ||
612 | */ | ||
580 | int call_usermodehelper_fns( | 613 | int call_usermodehelper_fns( |
581 | char *path, char **argv, char **envp, int wait, | 614 | char *path, char **argv, char **envp, int wait, |
582 | int (*init)(struct subprocess_info *info, struct cred *new), | 615 | int (*init)(struct subprocess_info *info, struct cred *new), |
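Given the new note that call_usermodehelper_fns() skips the caller-provided cleanup callback on allocation failure, a caller sketch that releases its own context on -ENOMEM; the ctx pointer, argv/envp, and helper names here are illustrative only:

#include <linux/kmod.h>
#include <linux/slab.h>

static void my_umh_cleanup(struct subprocess_info *info)
{
        kfree(info->data);
}

static int my_run_helper(char *path, char **argv, char **envp, void *ctx)
{
        int ret;

        ret = call_usermodehelper_fns(path, argv, envp, UMH_WAIT_PROC,
                                      NULL, my_umh_cleanup, ctx);
        if (ret == -ENOMEM)
                kfree(ctx);     /* cleanup callback was never registered */
        return ret;
}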
diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633702e..b579af57ea10 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -360,16 +360,12 @@ repeat: | |||
360 | struct kthread_work, node); | 360 | struct kthread_work, node); |
361 | list_del_init(&work->node); | 361 | list_del_init(&work->node); |
362 | } | 362 | } |
363 | worker->current_work = work; | ||
363 | spin_unlock_irq(&worker->lock); | 364 | spin_unlock_irq(&worker->lock); |
364 | 365 | ||
365 | if (work) { | 366 | if (work) { |
366 | __set_current_state(TASK_RUNNING); | 367 | __set_current_state(TASK_RUNNING); |
367 | work->func(work); | 368 | work->func(work); |
368 | smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ | ||
369 | work->done_seq = work->queue_seq; | ||
370 | smp_mb(); /* mb worker-b1 paired with flush-b0 */ | ||
371 | if (atomic_read(&work->flushing)) | ||
372 | wake_up_all(&work->done); | ||
373 | } else if (!freezing(current)) | 369 | } else if (!freezing(current)) |
374 | schedule(); | 370 | schedule(); |
375 | 371 | ||
@@ -378,6 +374,19 @@ repeat: | |||
378 | } | 374 | } |
379 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | 375 | EXPORT_SYMBOL_GPL(kthread_worker_fn); |
380 | 376 | ||
377 | /* insert @work before @pos in @worker */ | ||
378 | static void insert_kthread_work(struct kthread_worker *worker, | ||
379 | struct kthread_work *work, | ||
380 | struct list_head *pos) | ||
381 | { | ||
382 | lockdep_assert_held(&worker->lock); | ||
383 | |||
384 | list_add_tail(&work->node, pos); | ||
385 | work->worker = worker; | ||
386 | if (likely(worker->task)) | ||
387 | wake_up_process(worker->task); | ||
388 | } | ||
389 | |||
381 | /** | 390 | /** |
382 | * queue_kthread_work - queue a kthread_work | 391 | * queue_kthread_work - queue a kthread_work |
383 | * @worker: target kthread_worker | 392 | * @worker: target kthread_worker |
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker, | |||
395 | 404 | ||
396 | spin_lock_irqsave(&worker->lock, flags); | 405 | spin_lock_irqsave(&worker->lock, flags); |
397 | if (list_empty(&work->node)) { | 406 | if (list_empty(&work->node)) { |
398 | list_add_tail(&work->node, &worker->work_list); | 407 | insert_kthread_work(worker, work, &worker->work_list); |
399 | work->queue_seq++; | ||
400 | if (likely(worker->task)) | ||
401 | wake_up_process(worker->task); | ||
402 | ret = true; | 408 | ret = true; |
403 | } | 409 | } |
404 | spin_unlock_irqrestore(&worker->lock, flags); | 410 | spin_unlock_irqrestore(&worker->lock, flags); |
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker, | |||
406 | } | 412 | } |
407 | EXPORT_SYMBOL_GPL(queue_kthread_work); | 413 | EXPORT_SYMBOL_GPL(queue_kthread_work); |
408 | 414 | ||
415 | struct kthread_flush_work { | ||
416 | struct kthread_work work; | ||
417 | struct completion done; | ||
418 | }; | ||
419 | |||
420 | static void kthread_flush_work_fn(struct kthread_work *work) | ||
421 | { | ||
422 | struct kthread_flush_work *fwork = | ||
423 | container_of(work, struct kthread_flush_work, work); | ||
424 | complete(&fwork->done); | ||
425 | } | ||
426 | |||
409 | /** | 427 | /** |
410 | * flush_kthread_work - flush a kthread_work | 428 | * flush_kthread_work - flush a kthread_work |
411 | * @work: work to flush | 429 | * @work: work to flush |
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work); | |||
414 | */ | 432 | */ |
415 | void flush_kthread_work(struct kthread_work *work) | 433 | void flush_kthread_work(struct kthread_work *work) |
416 | { | 434 | { |
417 | int seq = work->queue_seq; | 435 | struct kthread_flush_work fwork = { |
418 | 436 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | |
419 | atomic_inc(&work->flushing); | 437 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), |
438 | }; | ||
439 | struct kthread_worker *worker; | ||
440 | bool noop = false; | ||
420 | 441 | ||
421 | /* | 442 | retry: |
422 | * mb flush-b0 paired with worker-b1, to make sure either | 443 | worker = work->worker; |
423 | * worker sees the above increment or we see done_seq update. | 444 | if (!worker) |
424 | */ | 445 | return; |
425 | smp_mb__after_atomic_inc(); | ||
426 | 446 | ||
427 | /* A - B <= 0 tests whether B is in front of A regardless of overflow */ | 447 | spin_lock_irq(&worker->lock); |
428 | wait_event(work->done, seq - work->done_seq <= 0); | 448 | if (work->worker != worker) { |
429 | atomic_dec(&work->flushing); | 449 | spin_unlock_irq(&worker->lock); |
450 | goto retry; | ||
451 | } | ||
430 | 452 | ||
431 | /* | 453 | if (!list_empty(&work->node)) |
432 | * rmb flush-b1 paired with worker-b0, to make sure our caller | 454 | insert_kthread_work(worker, &fwork.work, work->node.next); |
433 | * sees every change made by work->func(). | 455 | else if (worker->current_work == work) |
434 | */ | 456 | insert_kthread_work(worker, &fwork.work, worker->work_list.next); |
435 | smp_mb__after_atomic_dec(); | 457 | else |
436 | } | 458 | noop = true; |
437 | EXPORT_SYMBOL_GPL(flush_kthread_work); | ||
438 | 459 | ||
439 | struct kthread_flush_work { | 460 | spin_unlock_irq(&worker->lock); |
440 | struct kthread_work work; | ||
441 | struct completion done; | ||
442 | }; | ||
443 | 461 | ||
444 | static void kthread_flush_work_fn(struct kthread_work *work) | 462 | if (!noop) |
445 | { | 463 | wait_for_completion(&fwork.done); |
446 | struct kthread_flush_work *fwork = | ||
447 | container_of(work, struct kthread_flush_work, work); | ||
448 | complete(&fwork->done); | ||
449 | } | 464 | } |
465 | EXPORT_SYMBOL_GPL(flush_kthread_work); | ||
450 | 466 | ||
451 | /** | 467 | /** |
452 | * flush_kthread_worker - flush all current works on a kthread_worker | 468 | * flush_kthread_worker - flush all current works on a kthread_worker |
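For context on the reworked flush path above, a typical kthread_worker user queues a kthread_work and may later flush it; with this patch the flush injects a completion-based flush work behind the target instead of using queue_seq/done_seq counters. A hedged usage sketch (names invented, error handling abbreviated):

#include <linux/kthread.h>
#include <linux/printk.h>

static DEFINE_KTHREAD_WORKER(my_worker);

static void my_work_fn(struct kthread_work *work)
{
        pr_info("kthread work ran\n");
}

static DEFINE_KTHREAD_WORK(my_work, my_work_fn);

static void my_start(void)
{
        struct task_struct *t;

        t = kthread_run(kthread_worker_fn, &my_worker, "my_worker");
        if (IS_ERR(t))
                return;

        queue_kthread_work(&my_worker, &my_work);
        flush_kthread_work(&my_work);   /* waits via an injected flush work */
}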
diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a175d79..e1b2822fff97 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #define PANIC_TIMER_STEP 100 | 27 | #define PANIC_TIMER_STEP 100 |
28 | #define PANIC_BLINK_SPD 18 | 28 | #define PANIC_BLINK_SPD 18 |
29 | 29 | ||
30 | int panic_on_oops; | 30 | int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; |
31 | static unsigned long tainted_mask; | 31 | static unsigned long tainted_mask; |
32 | static int pause_on_oops; | 32 | static int pause_on_oops; |
33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...) | |||
75 | int state = 0; | 75 | int state = 0; |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * Disable local interrupts. This will prevent panic_smp_self_stop | ||
79 | * from deadlocking the first cpu that invokes the panic, since | ||
80 | * there is nothing to prevent an interrupt handler (that runs | ||
81 | * after the panic_lock is acquired) from invoking panic again. | ||
82 | */ | ||
83 | local_irq_disable(); | ||
84 | |||
85 | /* | ||
78 | * It's possible to come here directly from a panic-assertion and | 86 | * It's possible to come here directly from a panic-assertion and |
79 | * not have preempt disabled. Some functions called from here want | 87 | * not have preempt disabled. Some functions called from here want |
80 | * preempt to be disabled. No point enabling it later though... | 88 | * preempt to be disabled. No point enabling it later though... |
@@ -108,8 +116,6 @@ void panic(const char *fmt, ...) | |||
108 | */ | 116 | */ |
109 | crash_kexec(NULL); | 117 | crash_kexec(NULL); |
110 | 118 | ||
111 | kmsg_dump(KMSG_DUMP_PANIC); | ||
112 | |||
113 | /* | 119 | /* |
114 | * Note smp_send_stop is the usual smp shutdown function, which | 120 | * Note smp_send_stop is the usual smp shutdown function, which |
115 | * unfortunately means it may not be hardened to work in a panic | 121 | * unfortunately means it may not be hardened to work in a panic |
@@ -117,6 +123,8 @@ void panic(const char *fmt, ...) | |||
117 | */ | 123 | */ |
118 | smp_send_stop(); | 124 | smp_send_stop(); |
119 | 125 | ||
126 | kmsg_dump(KMSG_DUMP_PANIC); | ||
127 | |||
120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); | 128 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
121 | 129 | ||
122 | bust_spinlocks(0); | 130 | bust_spinlocks(0); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 16b20e38c4a1..b3c7fd554250 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
184 | } | 184 | } |
185 | read_unlock(&tasklist_lock); | 185 | read_unlock(&tasklist_lock); |
186 | 186 | ||
187 | /* Firstly reap the EXIT_ZOMBIE children we may have. */ | ||
187 | do { | 188 | do { |
188 | clear_thread_flag(TIF_SIGPENDING); | 189 | clear_thread_flag(TIF_SIGPENDING); |
189 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 190 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
190 | } while (rc != -ECHILD); | 191 | } while (rc != -ECHILD); |
191 | 192 | ||
193 | /* | ||
194 | * sys_wait4() above can't reap the TASK_DEAD children. | ||
195 | * Make sure they all go away, see __unhash_process(). | ||
196 | */ | ||
197 | for (;;) { | ||
198 | bool need_wait = false; | ||
199 | |||
200 | read_lock(&tasklist_lock); | ||
201 | if (!list_empty(¤t->children)) { | ||
202 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
203 | need_wait = true; | ||
204 | } | ||
205 | read_unlock(&tasklist_lock); | ||
206 | |||
207 | if (!need_wait) | ||
208 | break; | ||
209 | schedule(); | ||
210 | } | ||
211 | |||
192 | if (pid_ns->reboot) | 212 | if (pid_ns->reboot) |
193 | current->signal->group_exit_code = pid_ns->reboot; | 213 | current->signal->group_exit_code = pid_ns->reboot; |
194 | 214 | ||
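
Illustration (not part of the patch): the existing loop in zap_pid_ns_processes() reaps zombies by calling sys_wait4() until it returns -ECHILD; the new second loop then waits for the remaining TASK_DEAD children to detach, which has no direct userspace equivalent. A hedged userspace analogue of the reap-until-ECHILD part:

#include <errno.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        for (int i = 0; i < 3; i++)
                if (fork() == 0)
                        _exit(0);               /* child exits immediately */

        pid_t pid;
        do {
                pid = wait(NULL);
                if (pid > 0)
                        printf("reaped %d\n", (int)pid);
        } while (pid > 0 || errno == EINTR);

        if (errno == ECHILD)
                puts("no children left");
        return 0;
}
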
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 8f9b4eb974e0..a70518c9d82f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -175,7 +175,7 @@ config PM_TEST_SUSPEND | |||
175 | You probably want to have your system's RTC driver statically | 175 | You probably want to have your system's RTC driver statically |
176 | linked, ensuring that it's available when this test runs. | 176 | linked, ensuring that it's available when this test runs. |
177 | 177 | ||
178 | config CAN_PM_TRACE | 178 | config PM_SLEEP_DEBUG |
179 | def_bool y | 179 | def_bool y |
180 | depends on PM_DEBUG && PM_SLEEP | 180 | depends on PM_DEBUG && PM_SLEEP |
181 | 181 | ||
@@ -196,7 +196,7 @@ config PM_TRACE | |||
196 | 196 | ||
197 | config PM_TRACE_RTC | 197 | config PM_TRACE_RTC |
198 | bool "Suspend/resume event tracing" | 198 | bool "Suspend/resume event tracing" |
199 | depends on CAN_PM_TRACE | 199 | depends on PM_SLEEP_DEBUG |
200 | depends on X86 | 200 | depends on X86 |
201 | select PM_TRACE | 201 | select PM_TRACE |
202 | ---help--- | 202 | ---help--- |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db38a279..b26f5f1e773e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * Copyright (c) 2003 Open Source Development Lab | 5 | * Copyright (c) 2003 Open Source Development Lab |
6 | * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> | 6 | * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> |
7 | * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. | 7 | * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. |
8 | * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> | ||
8 | * | 9 | * |
9 | * This file is released under the GPLv2. | 10 | * This file is released under the GPLv2. |
10 | */ | 11 | */ |
@@ -27,7 +28,6 @@ | |||
27 | #include <linux/syscore_ops.h> | 28 | #include <linux/syscore_ops.h> |
28 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
29 | #include <linux/genhd.h> | 30 | #include <linux/genhd.h> |
30 | #include <scsi/scsi_scan.h> | ||
31 | 31 | ||
32 | #include "power.h" | 32 | #include "power.h" |
33 | 33 | ||
@@ -46,6 +46,9 @@ enum { | |||
46 | HIBERNATION_PLATFORM, | 46 | HIBERNATION_PLATFORM, |
47 | HIBERNATION_SHUTDOWN, | 47 | HIBERNATION_SHUTDOWN, |
48 | HIBERNATION_REBOOT, | 48 | HIBERNATION_REBOOT, |
49 | #ifdef CONFIG_SUSPEND | ||
50 | HIBERNATION_SUSPEND, | ||
51 | #endif | ||
49 | /* keep last */ | 52 | /* keep last */ |
50 | __HIBERNATION_AFTER_LAST | 53 | __HIBERNATION_AFTER_LAST |
51 | }; | 54 | }; |
@@ -354,6 +357,7 @@ int hibernation_snapshot(int platform_mode) | |||
354 | } | 357 | } |
355 | 358 | ||
356 | suspend_console(); | 359 | suspend_console(); |
360 | ftrace_stop(); | ||
357 | pm_restrict_gfp_mask(); | 361 | pm_restrict_gfp_mask(); |
358 | 362 | ||
359 | error = dpm_suspend(PMSG_FREEZE); | 363 | error = dpm_suspend(PMSG_FREEZE); |
@@ -379,6 +383,7 @@ int hibernation_snapshot(int platform_mode) | |||
379 | if (error || !in_suspend) | 383 | if (error || !in_suspend) |
380 | pm_restore_gfp_mask(); | 384 | pm_restore_gfp_mask(); |
381 | 385 | ||
386 | ftrace_start(); | ||
382 | resume_console(); | 387 | resume_console(); |
383 | dpm_complete(msg); | 388 | dpm_complete(msg); |
384 | 389 | ||
@@ -481,6 +486,7 @@ int hibernation_restore(int platform_mode) | |||
481 | 486 | ||
482 | pm_prepare_console(); | 487 | pm_prepare_console(); |
483 | suspend_console(); | 488 | suspend_console(); |
489 | ftrace_stop(); | ||
484 | pm_restrict_gfp_mask(); | 490 | pm_restrict_gfp_mask(); |
485 | error = dpm_suspend_start(PMSG_QUIESCE); | 491 | error = dpm_suspend_start(PMSG_QUIESCE); |
486 | if (!error) { | 492 | if (!error) { |
@@ -488,6 +494,7 @@ int hibernation_restore(int platform_mode) | |||
488 | dpm_resume_end(PMSG_RECOVER); | 494 | dpm_resume_end(PMSG_RECOVER); |
489 | } | 495 | } |
490 | pm_restore_gfp_mask(); | 496 | pm_restore_gfp_mask(); |
497 | ftrace_start(); | ||
491 | resume_console(); | 498 | resume_console(); |
492 | pm_restore_console(); | 499 | pm_restore_console(); |
493 | return error; | 500 | return error; |
@@ -514,6 +521,7 @@ int hibernation_platform_enter(void) | |||
514 | 521 | ||
515 | entering_platform_hibernation = true; | 522 | entering_platform_hibernation = true; |
516 | suspend_console(); | 523 | suspend_console(); |
524 | ftrace_stop(); | ||
517 | error = dpm_suspend_start(PMSG_HIBERNATE); | 525 | error = dpm_suspend_start(PMSG_HIBERNATE); |
518 | if (error) { | 526 | if (error) { |
519 | if (hibernation_ops->recover) | 527 | if (hibernation_ops->recover) |
@@ -557,6 +565,7 @@ int hibernation_platform_enter(void) | |||
557 | Resume_devices: | 565 | Resume_devices: |
558 | entering_platform_hibernation = false; | 566 | entering_platform_hibernation = false; |
559 | dpm_resume_end(PMSG_RESTORE); | 567 | dpm_resume_end(PMSG_RESTORE); |
568 | ftrace_start(); | ||
560 | resume_console(); | 569 | resume_console(); |
561 | 570 | ||
562 | Close: | 571 | Close: |
@@ -574,6 +583,10 @@ int hibernation_platform_enter(void) | |||
574 | */ | 583 | */ |
575 | static void power_down(void) | 584 | static void power_down(void) |
576 | { | 585 | { |
586 | #ifdef CONFIG_SUSPEND | ||
587 | int error; | ||
588 | #endif | ||
589 | |||
577 | switch (hibernation_mode) { | 590 | switch (hibernation_mode) { |
578 | case HIBERNATION_REBOOT: | 591 | case HIBERNATION_REBOOT: |
579 | kernel_restart(NULL); | 592 | kernel_restart(NULL); |
@@ -583,6 +596,25 @@ static void power_down(void) | |||
583 | case HIBERNATION_SHUTDOWN: | 596 | case HIBERNATION_SHUTDOWN: |
584 | kernel_power_off(); | 597 | kernel_power_off(); |
585 | break; | 598 | break; |
599 | #ifdef CONFIG_SUSPEND | ||
600 | case HIBERNATION_SUSPEND: | ||
601 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); | ||
602 | if (error) { | ||
603 | if (hibernation_ops) | ||
604 | hibernation_mode = HIBERNATION_PLATFORM; | ||
605 | else | ||
606 | hibernation_mode = HIBERNATION_SHUTDOWN; | ||
607 | power_down(); | ||
608 | } | ||
609 | /* | ||
610 | * Restore swap signature. | ||
611 | */ | ||
612 | error = swsusp_unmark(); | ||
613 | if (error) | ||
614 | printk(KERN_ERR "PM: Swap will be unusable! " | ||
615 | "Try swapon -a.\n"); | ||
616 | return; | ||
617 | #endif | ||
586 | } | 618 | } |
587 | kernel_halt(); | 619 | kernel_halt(); |
588 | /* | 620 | /* |
@@ -748,13 +780,6 @@ static int software_resume(void) | |||
748 | async_synchronize_full(); | 780 | async_synchronize_full(); |
749 | } | 781 | } |
750 | 782 | ||
751 | /* | ||
752 | * We can't depend on SCSI devices being available after loading | ||
753 | * one of their modules until scsi_complete_async_scans() is | ||
754 | * called and the resume device usually is a SCSI one. | ||
755 | */ | ||
756 | scsi_complete_async_scans(); | ||
757 | |||
758 | swsusp_resume_device = name_to_dev_t(resume_file); | 783 | swsusp_resume_device = name_to_dev_t(resume_file); |
759 | if (!swsusp_resume_device) { | 784 | if (!swsusp_resume_device) { |
760 | error = -ENODEV; | 785 | error = -ENODEV; |
@@ -827,6 +852,9 @@ static const char * const hibernation_modes[] = { | |||
827 | [HIBERNATION_PLATFORM] = "platform", | 852 | [HIBERNATION_PLATFORM] = "platform", |
828 | [HIBERNATION_SHUTDOWN] = "shutdown", | 853 | [HIBERNATION_SHUTDOWN] = "shutdown", |
829 | [HIBERNATION_REBOOT] = "reboot", | 854 | [HIBERNATION_REBOOT] = "reboot", |
855 | #ifdef CONFIG_SUSPEND | ||
856 | [HIBERNATION_SUSPEND] = "suspend", | ||
857 | #endif | ||
830 | }; | 858 | }; |
831 | 859 | ||
832 | /* | 860 | /* |
@@ -867,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
867 | switch (i) { | 895 | switch (i) { |
868 | case HIBERNATION_SHUTDOWN: | 896 | case HIBERNATION_SHUTDOWN: |
869 | case HIBERNATION_REBOOT: | 897 | case HIBERNATION_REBOOT: |
898 | #ifdef CONFIG_SUSPEND | ||
899 | case HIBERNATION_SUSPEND: | ||
900 | #endif | ||
870 | break; | 901 | break; |
871 | case HIBERNATION_PLATFORM: | 902 | case HIBERNATION_PLATFORM: |
872 | if (hibernation_ops) | 903 | if (hibernation_ops) |
@@ -907,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
907 | switch (mode) { | 938 | switch (mode) { |
908 | case HIBERNATION_SHUTDOWN: | 939 | case HIBERNATION_SHUTDOWN: |
909 | case HIBERNATION_REBOOT: | 940 | case HIBERNATION_REBOOT: |
941 | #ifdef CONFIG_SUSPEND | ||
942 | case HIBERNATION_SUSPEND: | ||
943 | #endif | ||
910 | hibernation_mode = mode; | 944 | hibernation_mode = mode; |
911 | break; | 945 | break; |
912 | case HIBERNATION_PLATFORM: | 946 | case HIBERNATION_PLATFORM: |
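
Illustration (not part of the patch): the new HIBERNATION_SUSPEND case in power_down() tries suspend-to-RAM after the image has been written; if that fails it re-enters power_down() with the next-best mode, and on a successful resume it restores the swap signature so the image is discarded. A hedged sketch of just that fallback chain, with suspend_to_mem() and have_platform_ops() as invented stand-in stubs:

#include <stdbool.h>
#include <stdio.h>

enum mode { MODE_SUSPEND, MODE_PLATFORM, MODE_SHUTDOWN, MODE_REBOOT };

static int suspend_to_mem(void)     { return -1; /* pretend it failed */ }
static bool have_platform_ops(void) { return false; }

static void power_down(enum mode mode)
{
        switch (mode) {
        case MODE_SUSPEND:
                if (suspend_to_mem() == 0) {
                        puts("resumed from suspend, discard hibernation image");
                        return;
                }
                /* suspend failed: retry with the next-best mode */
                power_down(have_platform_ops() ? MODE_PLATFORM
                                               : MODE_SHUTDOWN);
                return;
        case MODE_PLATFORM:
                puts("platform hibernate");
                return;
        case MODE_SHUTDOWN:
                puts("power off");
                return;
        case MODE_REBOOT:
                puts("reboot");
                return;
        }
}

int main(void)
{
        power_down(MODE_SUSPEND);       /* falls back to "power off" here */
        return 0;
}
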
diff --git a/kernel/power/main.c b/kernel/power/main.c index 428f8a034e96..f458238109cc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init); | |||
235 | 235 | ||
236 | #endif /* CONFIG_PM_SLEEP */ | 236 | #endif /* CONFIG_PM_SLEEP */ |
237 | 237 | ||
238 | #ifdef CONFIG_PM_SLEEP_DEBUG | ||
239 | /* | ||
240 | * pm_print_times: print time taken by devices to suspend and resume. | ||
241 | * | ||
242 | * show() returns whether printing of suspend and resume times is enabled. | ||
243 | * store() accepts 0 or 1. 0 disables printing and 1 enables it. | ||
244 | */ | ||
245 | bool pm_print_times_enabled; | ||
246 | |||
247 | static ssize_t pm_print_times_show(struct kobject *kobj, | ||
248 | struct kobj_attribute *attr, char *buf) | ||
249 | { | ||
250 | return sprintf(buf, "%d\n", pm_print_times_enabled); | ||
251 | } | ||
252 | |||
253 | static ssize_t pm_print_times_store(struct kobject *kobj, | ||
254 | struct kobj_attribute *attr, | ||
255 | const char *buf, size_t n) | ||
256 | { | ||
257 | unsigned long val; | ||
258 | |||
259 | if (kstrtoul(buf, 10, &val)) | ||
260 | return -EINVAL; | ||
261 | |||
262 | if (val > 1) | ||
263 | return -EINVAL; | ||
264 | |||
265 | pm_print_times_enabled = !!val; | ||
266 | return n; | ||
267 | } | ||
268 | |||
269 | power_attr(pm_print_times); | ||
270 | |||
271 | static inline void pm_print_times_init(void) | ||
272 | { | ||
273 | pm_print_times_enabled = !!initcall_debug; | ||
274 | } | ||
275 | #else /* !CONFIG_PM_SLEEP_DEBUG */ ||
276 | static inline void pm_print_times_init(void) {} | ||
277 | #endif /* CONFIG_PM_SLEEP_DEBUG */ | ||
278 | |||
238 | struct kobject *power_kobj; | 279 | struct kobject *power_kobj; |
239 | 280 | ||
240 | /** | 281 | /** |
@@ -531,6 +572,9 @@ static struct attribute * g[] = { | |||
531 | #ifdef CONFIG_PM_DEBUG | 572 | #ifdef CONFIG_PM_DEBUG |
532 | &pm_test_attr.attr, | 573 | &pm_test_attr.attr, |
533 | #endif | 574 | #endif |
575 | #ifdef CONFIG_PM_SLEEP_DEBUG | ||
576 | &pm_print_times_attr.attr, | ||
577 | #endif | ||
534 | #endif | 578 | #endif |
535 | NULL, | 579 | NULL, |
536 | }; | 580 | }; |
@@ -566,6 +610,7 @@ static int __init pm_init(void) | |||
566 | error = sysfs_create_group(power_kobj, &attr_group); | 610 | error = sysfs_create_group(power_kobj, &attr_group); |
567 | if (error) | 611 | if (error) |
568 | return error; | 612 | return error; |
613 | pm_print_times_init(); | ||
569 | return pm_autosleep_init(); | 614 | return pm_autosleep_init(); |
570 | } | 615 | } |
571 | 616 | ||
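
Illustration (not part of the patch): pm_print_times_store() accepts only "0" or "1" and rejects everything else with -EINVAL. A hedged userspace version of that parsing, with parse_bool() as an invented helper name standing in for the kstrtoul()-based check:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_bool(const char *buf, bool *out)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(buf, &end, 10);
        if (errno || end == buf || val > 1)
                return -EINVAL;
        *out = (val != 0);
        return 0;
}

int main(void)
{
        bool enabled;

        printf("\"1\"  -> %d\n", parse_bool("1", &enabled));   /* 0, enabled = true */
        printf("\"42\" -> %d\n", parse_bool("42", &enabled));  /* -EINVAL */
        return 0;
}
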
diff --git a/kernel/power/power.h b/kernel/power/power.h index b0bd4beaebfe..7d4b7ffb3c1d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -156,6 +156,9 @@ extern void swsusp_free(void); | |||
156 | extern int swsusp_read(unsigned int *flags_p); | 156 | extern int swsusp_read(unsigned int *flags_p); |
157 | extern int swsusp_write(unsigned int flags); | 157 | extern int swsusp_write(unsigned int flags); |
158 | extern void swsusp_close(fmode_t); | 158 | extern void swsusp_close(fmode_t); |
159 | #ifdef CONFIG_SUSPEND | ||
160 | extern int swsusp_unmark(void); | ||
161 | #endif | ||
159 | 162 | ||
160 | /* kernel/power/block_io.c */ | 163 | /* kernel/power/block_io.c */ |
161 | extern struct block_device *hib_resume_bdev; | 164 | extern struct block_device *hib_resume_bdev; |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 396d262b8fd0..c8b7446b27df 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/export.h> | 24 | #include <linux/export.h> |
25 | #include <linux/suspend.h> | 25 | #include <linux/suspend.h> |
26 | #include <linux/syscore_ops.h> | 26 | #include <linux/syscore_ops.h> |
27 | #include <linux/ftrace.h> | ||
27 | #include <trace/events/power.h> | 28 | #include <trace/events/power.h> |
28 | 29 | ||
29 | #include "power.h" | 30 | #include "power.h" |
@@ -212,6 +213,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
212 | goto Close; | 213 | goto Close; |
213 | } | 214 | } |
214 | suspend_console(); | 215 | suspend_console(); |
216 | ftrace_stop(); | ||
215 | suspend_test_start(); | 217 | suspend_test_start(); |
216 | error = dpm_suspend_start(PMSG_SUSPEND); | 218 | error = dpm_suspend_start(PMSG_SUSPEND); |
217 | if (error) { | 219 | if (error) { |
@@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
231 | suspend_test_start(); | 233 | suspend_test_start(); |
232 | dpm_resume_end(PMSG_RESUME); | 234 | dpm_resume_end(PMSG_RESUME); |
233 | suspend_test_finish("resume devices"); | 235 | suspend_test_finish("resume devices"); |
236 | ftrace_start(); | ||
234 | resume_console(); | 237 | resume_console(); |
235 | Close: | 238 | Close: |
236 | if (suspend_ops->end) | 239 | if (suspend_ops->end) |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11e22c068e8b..3c9d764eb0d8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle, | |||
448 | struct timeval start; | 448 | struct timeval start; |
449 | struct timeval stop; | 449 | struct timeval stop; |
450 | 450 | ||
451 | printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", | 451 | printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", |
452 | nr_to_write); | 452 | nr_to_write); |
453 | m = nr_to_write / 100; | 453 | m = nr_to_write / 10; |
454 | if (!m) | 454 | if (!m) |
455 | m = 1; | 455 | m = 1; |
456 | nr_pages = 0; | 456 | nr_pages = 0; |
@@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle, | |||
464 | if (ret) | 464 | if (ret) |
465 | break; | 465 | break; |
466 | if (!(nr_pages % m)) | 466 | if (!(nr_pages % m)) |
467 | printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); | 467 | printk(KERN_INFO "PM: Image saving progress: %3d%%\n", |
468 | nr_pages / m * 10); | ||
468 | nr_pages++; | 469 | nr_pages++; |
469 | } | 470 | } |
470 | err2 = hib_wait_on_bio_chain(&bio); | 471 | err2 = hib_wait_on_bio_chain(&bio); |
@@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle, | |||
472 | if (!ret) | 473 | if (!ret) |
473 | ret = err2; | 474 | ret = err2; |
474 | if (!ret) | 475 | if (!ret) |
475 | printk(KERN_CONT "\b\b\b\bdone\n"); | 476 | printk(KERN_INFO "PM: Image saving done.\n"); |
476 | else | ||
477 | printk(KERN_CONT "\n"); | ||
478 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 477 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); |
479 | return ret; | 478 | return ret; |
480 | } | 479 | } |
@@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
668 | 667 | ||
669 | printk(KERN_INFO | 668 | printk(KERN_INFO |
670 | "PM: Using %u thread(s) for compression.\n" | 669 | "PM: Using %u thread(s) for compression.\n" |
671 | "PM: Compressing and saving image data (%u pages) ... ", | 670 | "PM: Compressing and saving image data (%u pages)...\n", |
672 | nr_threads, nr_to_write); | 671 | nr_threads, nr_to_write); |
673 | m = nr_to_write / 100; | 672 | m = nr_to_write / 10; |
674 | if (!m) | 673 | if (!m) |
675 | m = 1; | 674 | m = 1; |
676 | nr_pages = 0; | 675 | nr_pages = 0; |
@@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
690 | data_of(*snapshot), PAGE_SIZE); | 689 | data_of(*snapshot), PAGE_SIZE); |
691 | 690 | ||
692 | if (!(nr_pages % m)) | 691 | if (!(nr_pages % m)) |
693 | printk(KERN_CONT "\b\b\b\b%3d%%", | 692 | printk(KERN_INFO |
694 | nr_pages / m); | 693 | "PM: Image saving progress: " |
694 | "%3d%%\n", | ||
695 | nr_pages / m * 10); | ||
695 | nr_pages++; | 696 | nr_pages++; |
696 | } | 697 | } |
697 | if (!off) | 698 | if (!off) |
@@ -761,11 +762,8 @@ out_finish: | |||
761 | do_gettimeofday(&stop); | 762 | do_gettimeofday(&stop); |
762 | if (!ret) | 763 | if (!ret) |
763 | ret = err2; | 764 | ret = err2; |
764 | if (!ret) { | 765 | if (!ret) |
765 | printk(KERN_CONT "\b\b\b\bdone\n"); | 766 | printk(KERN_INFO "PM: Image saving done.\n"); |
766 | } else { | ||
767 | printk(KERN_CONT "\n"); | ||
768 | } | ||
769 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 767 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); |
770 | out_clean: | 768 | out_clean: |
771 | if (crc) { | 769 | if (crc) { |
@@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle, | |||
973 | int err2; | 971 | int err2; |
974 | unsigned nr_pages; | 972 | unsigned nr_pages; |
975 | 973 | ||
976 | printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", | 974 | printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", |
977 | nr_to_read); | 975 | nr_to_read); |
978 | m = nr_to_read / 100; | 976 | m = nr_to_read / 10; |
979 | if (!m) | 977 | if (!m) |
980 | m = 1; | 978 | m = 1; |
981 | nr_pages = 0; | 979 | nr_pages = 0; |
@@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle, | |||
993 | if (ret) | 991 | if (ret) |
994 | break; | 992 | break; |
995 | if (!(nr_pages % m)) | 993 | if (!(nr_pages % m)) |
996 | printk("\b\b\b\b%3d%%", nr_pages / m); | 994 | printk(KERN_INFO "PM: Image loading progress: %3d%%\n", |
995 | nr_pages / m * 10); | ||
997 | nr_pages++; | 996 | nr_pages++; |
998 | } | 997 | } |
999 | err2 = hib_wait_on_bio_chain(&bio); | 998 | err2 = hib_wait_on_bio_chain(&bio); |
@@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle, | |||
1001 | if (!ret) | 1000 | if (!ret) |
1002 | ret = err2; | 1001 | ret = err2; |
1003 | if (!ret) { | 1002 | if (!ret) { |
1004 | printk("\b\b\b\bdone\n"); | 1003 | printk(KERN_INFO "PM: Image loading done.\n"); |
1005 | snapshot_write_finalize(snapshot); | 1004 | snapshot_write_finalize(snapshot); |
1006 | if (!snapshot_image_loaded(snapshot)) | 1005 | if (!snapshot_image_loaded(snapshot)) |
1007 | ret = -ENODATA; | 1006 | ret = -ENODATA; |
1008 | } else | 1007 | } |
1009 | printk("\n"); | ||
1010 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1008 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
1011 | return ret; | 1009 | return ret; |
1012 | } | 1010 | } |
@@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1185 | 1183 | ||
1186 | printk(KERN_INFO | 1184 | printk(KERN_INFO |
1187 | "PM: Using %u thread(s) for decompression.\n" | 1185 | "PM: Using %u thread(s) for decompression.\n" |
1188 | "PM: Loading and decompressing image data (%u pages) ... ", | 1186 | "PM: Loading and decompressing image data (%u pages)...\n", |
1189 | nr_threads, nr_to_read); | 1187 | nr_threads, nr_to_read); |
1190 | m = nr_to_read / 100; | 1188 | m = nr_to_read / 10; |
1191 | if (!m) | 1189 | if (!m) |
1192 | m = 1; | 1190 | m = 1; |
1193 | nr_pages = 0; | 1191 | nr_pages = 0; |
@@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1319 | data[thr].unc + off, PAGE_SIZE); | 1317 | data[thr].unc + off, PAGE_SIZE); |
1320 | 1318 | ||
1321 | if (!(nr_pages % m)) | 1319 | if (!(nr_pages % m)) |
1322 | printk("\b\b\b\b%3d%%", nr_pages / m); | 1320 | printk(KERN_INFO |
1321 | "PM: Image loading progress: " | ||
1322 | "%3d%%\n", | ||
1323 | nr_pages / m * 10); | ||
1323 | nr_pages++; | 1324 | nr_pages++; |
1324 | 1325 | ||
1325 | ret = snapshot_write_next(snapshot); | 1326 | ret = snapshot_write_next(snapshot); |
@@ -1344,7 +1345,7 @@ out_finish: | |||
1344 | } | 1345 | } |
1345 | do_gettimeofday(&stop); | 1346 | do_gettimeofday(&stop); |
1346 | if (!ret) { | 1347 | if (!ret) { |
1347 | printk("\b\b\b\bdone\n"); | 1348 | printk(KERN_INFO "PM: Image loading done.\n"); |
1348 | snapshot_write_finalize(snapshot); | 1349 | snapshot_write_finalize(snapshot); |
1349 | if (!snapshot_image_loaded(snapshot)) | 1350 | if (!snapshot_image_loaded(snapshot)) |
1350 | ret = -ENODATA; | 1351 | ret = -ENODATA; |
@@ -1357,8 +1358,7 @@ out_finish: | |||
1357 | } | 1358 | } |
1358 | } | 1359 | } |
1359 | } | 1360 | } |
1360 | } else | 1361 | } |
1361 | printk("\n"); | ||
1362 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1362 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
1363 | out_clean: | 1363 | out_clean: |
1364 | for (i = 0; i < ring_size; i++) | 1364 | for (i = 0; i < ring_size; i++) |
@@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode) | |||
1472 | blkdev_put(hib_resume_bdev, mode); | 1472 | blkdev_put(hib_resume_bdev, mode); |
1473 | } | 1473 | } |
1474 | 1474 | ||
1475 | /** | ||
1476 | * swsusp_unmark - Unmark swsusp signature in the resume device | ||
1477 | */ | ||
1478 | |||
1479 | #ifdef CONFIG_SUSPEND | ||
1480 | int swsusp_unmark(void) | ||
1481 | { | ||
1482 | int error; | ||
1483 | |||
1484 | hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); | ||
1485 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { ||
1486 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); ||
1487 | error = hib_bio_write_page(swsusp_resume_block, | ||
1488 | swsusp_header, NULL); | ||
1489 | } else { | ||
1490 | printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); | ||
1491 | error = -ENODEV; | ||
1492 | } | ||
1493 | |||
1494 | /* | ||
1495 | * We just returned from suspend, we don't need the image any more. | ||
1496 | */ | ||
1497 | free_all_swap_pages(root_swap); | ||
1498 | |||
1499 | return error; | ||
1500 | } | ||
1501 | #endif | ||
1502 | |||
1475 | static int swsusp_header_init(void) | 1503 | static int swsusp_header_init(void) |
1476 | { | 1504 | { |
1477 | swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); | 1505 | swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); |
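
Illustration (not part of the patch): the progress output changes from an in-place "\b\b\b\b%3d%%" counter updated every 1% to a full log line every 10%, so the messages survive in the kernel log. A hedged userspace demo of the new step calculation (m = total / 10, report when the page count hits a multiple of m):

#include <stdio.h>

int main(void)
{
        unsigned int nr_to_write = 1234;        /* pages in the image */
        unsigned int m = nr_to_write / 10;      /* one message per 10% */

        if (!m)
                m = 1;

        printf("Saving image data pages (%u pages)...\n", nr_to_write);
        for (unsigned int nr_pages = 0; nr_pages < nr_to_write; nr_pages++)
                if (!(nr_pages % m))
                        printf("Image saving progress: %3u%%\n",
                               nr_pages / m * 10);
        printf("Image saving done.\n");
        return 0;
}
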
diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
25 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
26 | #include <linux/freezer.h> | 26 | #include <linux/freezer.h> |
27 | #include <scsi/scsi_scan.h> | ||
28 | 27 | ||
29 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
30 | 29 | ||
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
84 | * appear. | 83 | * appear. |
85 | */ | 84 | */ |
86 | wait_for_device_probe(); | 85 | wait_for_device_probe(); |
87 | scsi_complete_async_scans(); | ||
88 | 86 | ||
89 | data->swap = -1; | 87 | data->swap = -1; |
90 | data->mode = O_WRONLY; | 88 | data->mode = O_WRONLY; |
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index c8fba3380076..8f50de394d22 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c | |||
@@ -9,6 +9,7 @@ | |||
9 | * manipulate wakelocks on Android. | 9 | * manipulate wakelocks on Android. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/capability.h> | ||
12 | #include <linux/ctype.h> | 13 | #include <linux/ctype.h> |
13 | #include <linux/device.h> | 14 | #include <linux/device.h> |
14 | #include <linux/err.h> | 15 | #include <linux/err.h> |
@@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf) | |||
188 | size_t len; | 189 | size_t len; |
189 | int ret = 0; | 190 | int ret = 0; |
190 | 191 | ||
192 | if (!capable(CAP_BLOCK_SUSPEND)) | ||
193 | return -EPERM; | ||
194 | |||
191 | while (*str && !isspace(*str)) | 195 | while (*str && !isspace(*str)) |
192 | str++; | 196 | str++; |
193 | 197 | ||
@@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf) | |||
231 | size_t len; | 235 | size_t len; |
232 | int ret = 0; | 236 | int ret = 0; |
233 | 237 | ||
238 | if (!capable(CAP_BLOCK_SUSPEND)) | ||
239 | return -EPERM; | ||
240 | |||
234 | len = strlen(buf); | 241 | len = strlen(buf); |
235 | if (!len) | 242 | if (!len) |
236 | return -EINVAL; | 243 | return -EINVAL; |
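
Illustration (not part of the patch): pm_wake_lock()/pm_wake_unlock() now refuse the request up front when the caller lacks CAP_BLOCK_SUSPEND. A hedged userspace analogue of that early permission gate, with plain root (euid 0) standing in for the capability check and pm_wake_lock_demo() as an invented name:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int pm_wake_lock_demo(const char *buf)
{
        if (geteuid() != 0)     /* stand-in for capable(CAP_BLOCK_SUSPEND) */
                return -EPERM;

        printf("would create wakelock '%s'\n", buf);
        return 0;
}

int main(void)
{
        int ret = pm_wake_lock_demo("mylock");

        printf("pm_wake_lock_demo() -> %d\n", ret);
        return 0;
}
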
diff --git a/kernel/printk.c b/kernel/printk.c index 32462d2b364a..66a2ea37b576 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -193,12 +193,21 @@ static int console_may_schedule; | |||
193 | * separated by ',', and find the message after the ';' character. | 193 | * separated by ',', and find the message after the ';' character. |
194 | */ | 194 | */ |
195 | 195 | ||
196 | enum log_flags { | ||
197 | LOG_NOCONS = 1, /* already flushed, do not print to console */ | ||
198 | LOG_NEWLINE = 2, /* text ended with a newline */ | ||
199 | LOG_PREFIX = 4, /* text started with a prefix */ | ||
200 | LOG_CONT = 8, /* text is a fragment of a continuation line */ | ||
201 | }; | ||
202 | |||
196 | struct log { | 203 | struct log { |
197 | u64 ts_nsec; /* timestamp in nanoseconds */ | 204 | u64 ts_nsec; /* timestamp in nanoseconds */ |
198 | u16 len; /* length of entire record */ | 205 | u16 len; /* length of entire record */ |
199 | u16 text_len; /* length of text buffer */ | 206 | u16 text_len; /* length of text buffer */ |
200 | u16 dict_len; /* length of dictionary buffer */ | 207 | u16 dict_len; /* length of dictionary buffer */ |
201 | u16 level; /* syslog level + facility */ | 208 | u8 facility; /* syslog facility */ |
209 | u8 flags:5; /* internal record flags */ | ||
210 | u8 level:3; /* syslog level */ | ||
202 | }; | 211 | }; |
203 | 212 | ||
204 | /* | 213 | /* |
@@ -207,9 +216,12 @@ struct log { | |||
207 | */ | 216 | */ |
208 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | 217 | static DEFINE_RAW_SPINLOCK(logbuf_lock); |
209 | 218 | ||
219 | #ifdef CONFIG_PRINTK | ||
210 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | 220 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ |
211 | static u64 syslog_seq; | 221 | static u64 syslog_seq; |
212 | static u32 syslog_idx; | 222 | static u32 syslog_idx; |
223 | static enum log_flags syslog_prev; | ||
224 | static size_t syslog_partial; | ||
213 | 225 | ||
214 | /* index and sequence number of the first record stored in the buffer */ | 226 | /* index and sequence number of the first record stored in the buffer */ |
215 | static u64 log_first_seq; | 227 | static u64 log_first_seq; |
@@ -217,20 +229,25 @@ static u32 log_first_idx; | |||
217 | 229 | ||
218 | /* index and sequence number of the next record to store in the buffer */ | 230 | /* index and sequence number of the next record to store in the buffer */ |
219 | static u64 log_next_seq; | 231 | static u64 log_next_seq; |
220 | #ifdef CONFIG_PRINTK | ||
221 | static u32 log_next_idx; | 232 | static u32 log_next_idx; |
222 | 233 | ||
234 | /* the next printk record to write to the console */ | ||
235 | static u64 console_seq; | ||
236 | static u32 console_idx; | ||
237 | static enum log_flags console_prev; | ||
238 | |||
223 | /* the next printk record to read after the last 'clear' command */ | 239 | /* the next printk record to read after the last 'clear' command */ |
224 | static u64 clear_seq; | 240 | static u64 clear_seq; |
225 | static u32 clear_idx; | 241 | static u32 clear_idx; |
226 | 242 | ||
227 | #define LOG_LINE_MAX 1024 | 243 | #define PREFIX_MAX 32 |
244 | #define LOG_LINE_MAX 1024 - PREFIX_MAX | ||
228 | 245 | ||
229 | /* record buffer */ | 246 | /* record buffer */ |
230 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 247 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
231 | #define LOG_ALIGN 4 | 248 | #define LOG_ALIGN 4 |
232 | #else | 249 | #else |
233 | #define LOG_ALIGN 8 | 250 | #define LOG_ALIGN __alignof__(struct log) |
234 | #endif | 251 | #endif |
235 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 252 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
236 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | 253 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); |
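
Illustration (not part of the patch): the record header no longer stores "syslog level + facility" in one u16; facility, a 3-bit level and 5-bit flags are kept separately, and the classic <N> syslog prefix is rebuilt as (facility << 3) | level when a record is exported. A hedged demo of that packing and unpacking (the kernel packs level/flags into a u8; plain unsigned bit-fields are used here for portability):

#include <stdio.h>

struct rec {
        unsigned int facility;  /* kernel: u8 facility */
        unsigned int flags:5;   /* kernel: u8 flags:5 */
        unsigned int level:3;   /* kernel: u8 level:3 */
};

int main(void)
{
        struct rec msg = { .facility = 3, .level = 6, .flags = 0x02 };
        unsigned int prefix = (msg.facility << 3) | msg.level;

        printf("<%u> == facility %u, level %u\n",
               prefix, prefix >> 3, prefix & 7);
        return 0;
}
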
@@ -286,6 +303,7 @@ static u32 log_next(u32 idx) | |||
286 | 303 | ||
287 | /* insert record into the buffer, discard old ones, update heads */ | 304 | /* insert record into the buffer, discard old ones, update heads */ |
288 | static void log_store(int facility, int level, | 305 | static void log_store(int facility, int level, |
306 | enum log_flags flags, u64 ts_nsec, | ||
289 | const char *dict, u16 dict_len, | 307 | const char *dict, u16 dict_len, |
290 | const char *text, u16 text_len) | 308 | const char *text, u16 text_len) |
291 | { | 309 | { |
@@ -329,8 +347,13 @@ static void log_store(int facility, int level, | |||
329 | msg->text_len = text_len; | 347 | msg->text_len = text_len; |
330 | memcpy(log_dict(msg), dict, dict_len); | 348 | memcpy(log_dict(msg), dict, dict_len); |
331 | msg->dict_len = dict_len; | 349 | msg->dict_len = dict_len; |
332 | msg->level = (facility << 3) | (level & 7); | 350 | msg->facility = facility; |
333 | msg->ts_nsec = local_clock(); | 351 | msg->level = level & 7; |
352 | msg->flags = flags & 0x1f; | ||
353 | if (ts_nsec > 0) | ||
354 | msg->ts_nsec = ts_nsec; | ||
355 | else | ||
356 | msg->ts_nsec = local_clock(); | ||
334 | memset(log_dict(msg) + dict_len, 0, pad_len); | 357 | memset(log_dict(msg) + dict_len, 0, pad_len); |
335 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | 358 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; |
336 | 359 | ||
@@ -343,6 +366,7 @@ static void log_store(int facility, int level, | |||
343 | struct devkmsg_user { | 366 | struct devkmsg_user { |
344 | u64 seq; | 367 | u64 seq; |
345 | u32 idx; | 368 | u32 idx; |
369 | enum log_flags prev; | ||
346 | struct mutex lock; | 370 | struct mutex lock; |
347 | char buf[8192]; | 371 | char buf[8192]; |
348 | }; | 372 | }; |
@@ -365,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | |||
365 | 389 | ||
366 | line = buf; | 390 | line = buf; |
367 | for (i = 0; i < count; i++) { | 391 | for (i = 0; i < count; i++) { |
368 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) | 392 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { |
393 | ret = -EFAULT; | ||
369 | goto out; | 394 | goto out; |
395 | } | ||
370 | line += iv[i].iov_len; | 396 | line += iv[i].iov_len; |
371 | } | 397 | } |
372 | 398 | ||
@@ -408,27 +434,30 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
408 | struct log *msg; | 434 | struct log *msg; |
409 | u64 ts_usec; | 435 | u64 ts_usec; |
410 | size_t i; | 436 | size_t i; |
437 | char cont = '-'; | ||
411 | size_t len; | 438 | size_t len; |
412 | ssize_t ret; | 439 | ssize_t ret; |
413 | 440 | ||
414 | if (!user) | 441 | if (!user) |
415 | return -EBADF; | 442 | return -EBADF; |
416 | 443 | ||
417 | mutex_lock(&user->lock); | 444 | ret = mutex_lock_interruptible(&user->lock); |
418 | raw_spin_lock(&logbuf_lock); | 445 | if (ret) |
446 | return ret; | ||
447 | raw_spin_lock_irq(&logbuf_lock); | ||
419 | while (user->seq == log_next_seq) { | 448 | while (user->seq == log_next_seq) { |
420 | if (file->f_flags & O_NONBLOCK) { | 449 | if (file->f_flags & O_NONBLOCK) { |
421 | ret = -EAGAIN; | 450 | ret = -EAGAIN; |
422 | raw_spin_unlock(&logbuf_lock); | 451 | raw_spin_unlock_irq(&logbuf_lock); |
423 | goto out; | 452 | goto out; |
424 | } | 453 | } |
425 | 454 | ||
426 | raw_spin_unlock(&logbuf_lock); | 455 | raw_spin_unlock_irq(&logbuf_lock); |
427 | ret = wait_event_interruptible(log_wait, | 456 | ret = wait_event_interruptible(log_wait, |
428 | user->seq != log_next_seq); | 457 | user->seq != log_next_seq); |
429 | if (ret) | 458 | if (ret) |
430 | goto out; | 459 | goto out; |
431 | raw_spin_lock(&logbuf_lock); | 460 | raw_spin_lock_irq(&logbuf_lock); |
432 | } | 461 | } |
433 | 462 | ||
434 | if (user->seq < log_first_seq) { | 463 | if (user->seq < log_first_seq) { |
@@ -436,21 +465,38 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
436 | user->idx = log_first_idx; | 465 | user->idx = log_first_idx; |
437 | user->seq = log_first_seq; | 466 | user->seq = log_first_seq; |
438 | ret = -EPIPE; | 467 | ret = -EPIPE; |
439 | raw_spin_unlock(&logbuf_lock); | 468 | raw_spin_unlock_irq(&logbuf_lock); |
440 | goto out; | 469 | goto out; |
441 | } | 470 | } |
442 | 471 | ||
443 | msg = log_from_idx(user->idx); | 472 | msg = log_from_idx(user->idx); |
444 | ts_usec = msg->ts_nsec; | 473 | ts_usec = msg->ts_nsec; |
445 | do_div(ts_usec, 1000); | 474 | do_div(ts_usec, 1000); |
446 | len = sprintf(user->buf, "%u,%llu,%llu;", | 475 | |
447 | msg->level, user->seq, ts_usec); | 476 | /* |
477 | * If we couldn't merge continuation line fragments during the print, | ||
478 | * export the stored flags to allow an optional external merge of the | ||
479 | * records. Merging the records isn't always neccessarily correct, like | ||
480 | * when we hit a race during printing. In most cases though, it produces | ||
481 | * better readable output. 'c' in the record flags mark the first | ||
482 | * fragment of a line, '+' the following. | ||
483 | */ | ||
484 | if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) | ||
485 | cont = 'c'; | ||
486 | else if ((msg->flags & LOG_CONT) || | ||
487 | ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) | ||
488 | cont = '+'; | ||
489 | |||
490 | len = sprintf(user->buf, "%u,%llu,%llu,%c;", | ||
491 | (msg->facility << 3) | msg->level, | ||
492 | user->seq, ts_usec, cont); | ||
493 | user->prev = msg->flags; | ||
448 | 494 | ||
449 | /* escape non-printable characters */ | 495 | /* escape non-printable characters */ |
450 | for (i = 0; i < msg->text_len; i++) { | 496 | for (i = 0; i < msg->text_len; i++) { |
451 | unsigned char c = log_text(msg)[i]; | 497 | unsigned char c = log_text(msg)[i]; |
452 | 498 | ||
453 | if (c < ' ' || c >= 128) | 499 | if (c < ' ' || c >= 127 || c == '\\') |
454 | len += sprintf(user->buf + len, "\\x%02x", c); | 500 | len += sprintf(user->buf + len, "\\x%02x", c); |
455 | else | 501 | else |
456 | user->buf[len++] = c; | 502 | user->buf[len++] = c; |
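
Illustration (not part of the patch): the escaping rule in devkmsg_read() is tightened from c < ' ' || c >= 128 to c < ' ' || c >= 127 || c == '\\', so DEL and the backslash itself are escaped and every "\xNN" sequence in the output is unambiguous. A hedged userspace demo of that rule:

#include <stdio.h>
#include <string.h>

static void emit_escaped(const unsigned char *s, size_t len)
{
        for (size_t i = 0; i < len; i++) {
                unsigned char c = s[i];

                if (c < ' ' || c >= 127 || c == '\\')
                        printf("\\x%02x", c);
                else
                        putchar(c);
        }
        putchar('\n');
}

int main(void)
{
        const char *sample = "tab\tand back\\slash and del\x7f";

        emit_escaped((const unsigned char *)sample, strlen(sample));
        return 0;
}
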
@@ -474,7 +520,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
474 | continue; | 520 | continue; |
475 | } | 521 | } |
476 | 522 | ||
477 | if (c < ' ' || c >= 128) { | 523 | if (c < ' ' || c >= 127 || c == '\\') { |
478 | len += sprintf(user->buf + len, "\\x%02x", c); | 524 | len += sprintf(user->buf + len, "\\x%02x", c); |
479 | continue; | 525 | continue; |
480 | } | 526 | } |
@@ -486,7 +532,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
486 | 532 | ||
487 | user->idx = log_next(user->idx); | 533 | user->idx = log_next(user->idx); |
488 | user->seq++; | 534 | user->seq++; |
489 | raw_spin_unlock(&logbuf_lock); | 535 | raw_spin_unlock_irq(&logbuf_lock); |
490 | 536 | ||
491 | if (len > count) { | 537 | if (len > count) { |
492 | ret = -EINVAL; | 538 | ret = -EINVAL; |
@@ -513,7 +559,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
513 | if (offset) | 559 | if (offset) |
514 | return -ESPIPE; | 560 | return -ESPIPE; |
515 | 561 | ||
516 | raw_spin_lock(&logbuf_lock); | 562 | raw_spin_lock_irq(&logbuf_lock); |
517 | switch (whence) { | 563 | switch (whence) { |
518 | case SEEK_SET: | 564 | case SEEK_SET: |
519 | /* the first record */ | 565 | /* the first record */ |
@@ -537,7 +583,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
537 | default: | 583 | default: |
538 | ret = -EINVAL; | 584 | ret = -EINVAL; |
539 | } | 585 | } |
540 | raw_spin_unlock(&logbuf_lock); | 586 | raw_spin_unlock_irq(&logbuf_lock); |
541 | return ret; | 587 | return ret; |
542 | } | 588 | } |
543 | 589 | ||
@@ -551,14 +597,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | |||
551 | 597 | ||
552 | poll_wait(file, &log_wait, wait); | 598 | poll_wait(file, &log_wait, wait); |
553 | 599 | ||
554 | raw_spin_lock(&logbuf_lock); | 600 | raw_spin_lock_irq(&logbuf_lock); |
555 | if (user->seq < log_next_seq) { | 601 | if (user->seq < log_next_seq) { |
556 | /* return error when data has vanished underneath us */ | 602 | /* return error when data has vanished underneath us */ |
557 | if (user->seq < log_first_seq) | 603 | if (user->seq < log_first_seq) |
558 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | 604 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; |
559 | ret = POLLIN|POLLRDNORM; | 605 | ret = POLLIN|POLLRDNORM; |
560 | } | 606 | } |
561 | raw_spin_unlock(&logbuf_lock); | 607 | raw_spin_unlock_irq(&logbuf_lock); |
562 | 608 | ||
563 | return ret; | 609 | return ret; |
564 | } | 610 | } |
@@ -582,10 +628,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) | |||
582 | 628 | ||
583 | mutex_init(&user->lock); | 629 | mutex_init(&user->lock); |
584 | 630 | ||
585 | raw_spin_lock(&logbuf_lock); | 631 | raw_spin_lock_irq(&logbuf_lock); |
586 | user->idx = log_first_idx; | 632 | user->idx = log_first_idx; |
587 | user->seq = log_first_seq; | 633 | user->seq = log_first_seq; |
588 | raw_spin_unlock(&logbuf_lock); | 634 | raw_spin_unlock_irq(&logbuf_lock); |
589 | 635 | ||
590 | file->private_data = user; | 636 | file->private_data = user; |
591 | return 0; | 637 | return 0; |
@@ -627,6 +673,15 @@ void log_buf_kexec_setup(void) | |||
627 | VMCOREINFO_SYMBOL(log_buf_len); | 673 | VMCOREINFO_SYMBOL(log_buf_len); |
628 | VMCOREINFO_SYMBOL(log_first_idx); | 674 | VMCOREINFO_SYMBOL(log_first_idx); |
629 | VMCOREINFO_SYMBOL(log_next_idx); | 675 | VMCOREINFO_SYMBOL(log_next_idx); |
676 | /* | ||
677 | * Export struct log size and field offsets. User space tools can | ||
678 | * parse it and detect any changes to structure down the line. | ||
679 | */ | ||
680 | VMCOREINFO_STRUCT_SIZE(log); | ||
681 | VMCOREINFO_OFFSET(log, ts_nsec); | ||
682 | VMCOREINFO_OFFSET(log, len); | ||
683 | VMCOREINFO_OFFSET(log, text_len); | ||
684 | VMCOREINFO_OFFSET(log, dict_len); | ||
630 | } | 685 | } |
631 | #endif | 686 | #endif |
632 | 687 | ||
@@ -785,44 +840,64 @@ static bool printk_time; | |||
785 | #endif | 840 | #endif |
786 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 841 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
787 | 842 | ||
843 | static size_t print_time(u64 ts, char *buf) | ||
844 | { | ||
845 | unsigned long rem_nsec; | ||
846 | |||
847 | if (!printk_time) | ||
848 | return 0; | ||
849 | |||
850 | if (!buf) | ||
851 | return 15; | ||
852 | |||
853 | rem_nsec = do_div(ts, 1000000000); | ||
854 | return sprintf(buf, "[%5lu.%06lu] ", | ||
855 | (unsigned long)ts, rem_nsec / 1000); | ||
856 | } | ||
857 | |||
788 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | 858 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) |
789 | { | 859 | { |
790 | size_t len = 0; | 860 | size_t len = 0; |
861 | unsigned int prefix = (msg->facility << 3) | msg->level; | ||
791 | 862 | ||
792 | if (syslog) { | 863 | if (syslog) { |
793 | if (buf) { | 864 | if (buf) { |
794 | len += sprintf(buf, "<%u>", msg->level); | 865 | len += sprintf(buf, "<%u>", prefix); |
795 | } else { | 866 | } else { |
796 | len += 3; | 867 | len += 3; |
797 | if (msg->level > 9) | 868 | if (prefix > 999) |
798 | len++; | 869 | len += 3; |
799 | if (msg->level > 99) | 870 | else if (prefix > 99) |
871 | len += 2; | ||
872 | else if (prefix > 9) | ||
800 | len++; | 873 | len++; |
801 | } | 874 | } |
802 | } | 875 | } |
803 | 876 | ||
804 | if (printk_time) { | 877 | len += print_time(msg->ts_nsec, buf ? buf + len : NULL); |
805 | if (buf) { | ||
806 | unsigned long long ts = msg->ts_nsec; | ||
807 | unsigned long rem_nsec = do_div(ts, 1000000000); | ||
808 | |||
809 | len += sprintf(buf + len, "[%5lu.%06lu] ", | ||
810 | (unsigned long) ts, rem_nsec / 1000); | ||
811 | } else { | ||
812 | len += 15; | ||
813 | } | ||
814 | } | ||
815 | |||
816 | return len; | 878 | return len; |
817 | } | 879 | } |
818 | 880 | ||
819 | static size_t msg_print_text(const struct log *msg, bool syslog, | 881 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
820 | char *buf, size_t size) | 882 | bool syslog, char *buf, size_t size) |
821 | { | 883 | { |
822 | const char *text = log_text(msg); | 884 | const char *text = log_text(msg); |
823 | size_t text_size = msg->text_len; | 885 | size_t text_size = msg->text_len; |
886 | bool prefix = true; | ||
887 | bool newline = true; | ||
824 | size_t len = 0; | 888 | size_t len = 0; |
825 | 889 | ||
890 | if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) | ||
891 | prefix = false; | ||
892 | |||
893 | if (msg->flags & LOG_CONT) { | ||
894 | if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) | ||
895 | prefix = false; | ||
896 | |||
897 | if (!(msg->flags & LOG_NEWLINE)) | ||
898 | newline = false; | ||
899 | } | ||
900 | |||
826 | do { | 901 | do { |
827 | const char *next = memchr(text, '\n', text_size); | 902 | const char *next = memchr(text, '\n', text_size); |
828 | size_t text_len; | 903 | size_t text_len; |
@@ -837,19 +912,25 @@ static size_t msg_print_text(const struct log *msg, bool syslog, | |||
837 | 912 | ||
838 | if (buf) { | 913 | if (buf) { |
839 | if (print_prefix(msg, syslog, NULL) + | 914 | if (print_prefix(msg, syslog, NULL) + |
840 | text_len + 1>= size - len) | 915 | text_len + 1 >= size - len) |
841 | break; | 916 | break; |
842 | 917 | ||
843 | len += print_prefix(msg, syslog, buf + len); | 918 | if (prefix) |
919 | len += print_prefix(msg, syslog, buf + len); | ||
844 | memcpy(buf + len, text, text_len); | 920 | memcpy(buf + len, text, text_len); |
845 | len += text_len; | 921 | len += text_len; |
846 | buf[len++] = '\n'; | 922 | if (next || newline) |
923 | buf[len++] = '\n'; | ||
847 | } else { | 924 | } else { |
848 | /* SYSLOG_ACTION_* buffer size only calculation */ | 925 | /* SYSLOG_ACTION_* buffer size only calculation */ |
849 | len += print_prefix(msg, syslog, NULL); | 926 | if (prefix) |
850 | len += text_len + 1; | 927 | len += print_prefix(msg, syslog, NULL); |
928 | len += text_len; | ||
929 | if (next || newline) | ||
930 | len++; | ||
851 | } | 931 | } |
852 | 932 | ||
933 | prefix = true; | ||
853 | text = next; | 934 | text = next; |
854 | } while (text); | 935 | } while (text); |
855 | 936 | ||
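
Illustration (not part of the patch): the timestamp formatting is factored into the new print_time() helper, which splits a nanosecond value into "[seconds.microseconds] " (and returns a fixed 15-byte estimate when called without a buffer for size calculations). A hedged userspace version of the formatting part:

#include <inttypes.h>
#include <stdio.h>

static int print_time(uint64_t ts_nsec, char *buf, size_t size)
{
        uint64_t sec = ts_nsec / 1000000000ULL;
        unsigned long rem_nsec = (unsigned long)(ts_nsec % 1000000000ULL);

        return snprintf(buf, size, "[%5" PRIu64 ".%06lu] ",
                        sec, rem_nsec / 1000);
}

int main(void)
{
        char buf[32];

        print_time(12345678901234ULL, buf, sizeof(buf));
        printf("%s<- 12345.678901 s\n", buf);
        return 0;
}
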
@@ -860,26 +941,61 @@ static int syslog_print(char __user *buf, int size) | |||
860 | { | 941 | { |
861 | char *text; | 942 | char *text; |
862 | struct log *msg; | 943 | struct log *msg; |
863 | int len; | 944 | int len = 0; |
864 | 945 | ||
865 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | 946 | text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); |
866 | if (!text) | 947 | if (!text) |
867 | return -ENOMEM; | 948 | return -ENOMEM; |
868 | 949 | ||
869 | raw_spin_lock_irq(&logbuf_lock); | 950 | while (size > 0) { |
870 | if (syslog_seq < log_first_seq) { | 951 | size_t n; |
871 | /* messages are gone, move to first one */ | 952 | size_t skip; |
872 | syslog_seq = log_first_seq; | 953 | |
873 | syslog_idx = log_first_idx; | 954 | raw_spin_lock_irq(&logbuf_lock); |
874 | } | 955 | if (syslog_seq < log_first_seq) { |
875 | msg = log_from_idx(syslog_idx); | 956 | /* messages are gone, move to first one */ |
876 | len = msg_print_text(msg, true, text, LOG_LINE_MAX); | 957 | syslog_seq = log_first_seq; |
877 | syslog_idx = log_next(syslog_idx); | 958 | syslog_idx = log_first_idx; |
878 | syslog_seq++; | 959 | syslog_prev = 0; |
879 | raw_spin_unlock_irq(&logbuf_lock); | 960 | syslog_partial = 0; |
961 | } | ||
962 | if (syslog_seq == log_next_seq) { | ||
963 | raw_spin_unlock_irq(&logbuf_lock); | ||
964 | break; | ||
965 | } | ||
880 | 966 | ||
881 | if (len > 0 && copy_to_user(buf, text, len)) | 967 | skip = syslog_partial; |
882 | len = -EFAULT; | 968 | msg = log_from_idx(syslog_idx); |
969 | n = msg_print_text(msg, syslog_prev, true, text, | ||
970 | LOG_LINE_MAX + PREFIX_MAX); | ||
971 | if (n - syslog_partial <= size) { | ||
972 | /* message fits into buffer, move forward */ | ||
973 | syslog_idx = log_next(syslog_idx); | ||
974 | syslog_seq++; | ||
975 | syslog_prev = msg->flags; | ||
976 | n -= syslog_partial; | ||
977 | syslog_partial = 0; | ||
978 | } else if (!len) { | ||
979 | /* partial read(), remember position */ | ||
980 | n = size; | ||
981 | syslog_partial += n; | ||
982 | } else | ||
983 | n = 0; | ||
984 | raw_spin_unlock_irq(&logbuf_lock); | ||
985 | |||
986 | if (!n) | ||
987 | break; | ||
988 | |||
989 | if (copy_to_user(buf, text + skip, n)) { | ||
990 | if (!len) | ||
991 | len = -EFAULT; | ||
992 | break; | ||
993 | } | ||
994 | |||
995 | len += n; | ||
996 | size -= n; | ||
997 | buf += n; | ||
998 | } | ||
883 | 999 | ||
884 | kfree(text); | 1000 | kfree(text); |
885 | return len; | 1001 | return len; |
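
Illustration (not part of the patch): syslog_print() now supports partial reads: when the formatted record is larger than the reader's buffer it hands out what fits and remembers the offset in syslog_partial, advancing syslog_seq only once the whole record has been consumed. A hedged single-record sketch of that bookkeeping:

#include <stdio.h>
#include <string.h>

static const char record[] = "<6>[   12.345678] a fairly long log line\n";
static size_t partial;          /* bytes of 'record' already handed out */

static size_t syslog_read(char *buf, size_t size)
{
        size_t left = strlen(record) - partial;
        size_t n = left < size ? left : size;

        if (!left)
                return 0;       /* nothing queued */
        memcpy(buf, record + partial, n);
        partial += n;           /* resume here on the next call */
        return n;
}

int main(void)
{
        char buf[16];
        size_t n;

        while ((n = syslog_read(buf, sizeof(buf))) > 0)
                printf("read %2zu bytes: %.*s\n", n, (int)n, buf);
        return 0;
}
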
@@ -890,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
890 | char *text; | 1006 | char *text; |
891 | int len = 0; | 1007 | int len = 0; |
892 | 1008 | ||
893 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | 1009 | text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); |
894 | if (!text) | 1010 | if (!text) |
895 | return -ENOMEM; | 1011 | return -ENOMEM; |
896 | 1012 | ||
@@ -899,6 +1015,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
899 | u64 next_seq; | 1015 | u64 next_seq; |
900 | u64 seq; | 1016 | u64 seq; |
901 | u32 idx; | 1017 | u32 idx; |
1018 | enum log_flags prev; | ||
902 | 1019 | ||
903 | if (clear_seq < log_first_seq) { | 1020 | if (clear_seq < log_first_seq) { |
904 | /* messages are gone, move to first available one */ | 1021 | /* messages are gone, move to first available one */ |
@@ -909,41 +1026,50 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
909 | /* | 1026 | /* |
910 | * Find first record that fits, including all following records, | 1027 | * Find first record that fits, including all following records, |
911 | * into the user-provided buffer for this dump. | 1028 | * into the user-provided buffer for this dump. |
912 | */ | 1029 | */ |
913 | seq = clear_seq; | 1030 | seq = clear_seq; |
914 | idx = clear_idx; | 1031 | idx = clear_idx; |
1032 | prev = 0; | ||
915 | while (seq < log_next_seq) { | 1033 | while (seq < log_next_seq) { |
916 | struct log *msg = log_from_idx(idx); | 1034 | struct log *msg = log_from_idx(idx); |
917 | 1035 | ||
918 | len += msg_print_text(msg, true, NULL, 0); | 1036 | len += msg_print_text(msg, prev, true, NULL, 0); |
1037 | prev = msg->flags; | ||
919 | idx = log_next(idx); | 1038 | idx = log_next(idx); |
920 | seq++; | 1039 | seq++; |
921 | } | 1040 | } |
1041 | |||
1042 | /* move first record forward until length fits into the buffer */ | ||
922 | seq = clear_seq; | 1043 | seq = clear_seq; |
923 | idx = clear_idx; | 1044 | idx = clear_idx; |
1045 | prev = 0; | ||
924 | while (len > size && seq < log_next_seq) { | 1046 | while (len > size && seq < log_next_seq) { |
925 | struct log *msg = log_from_idx(idx); | 1047 | struct log *msg = log_from_idx(idx); |
926 | 1048 | ||
927 | len -= msg_print_text(msg, true, NULL, 0); | 1049 | len -= msg_print_text(msg, prev, true, NULL, 0); |
1050 | prev = msg->flags; | ||
928 | idx = log_next(idx); | 1051 | idx = log_next(idx); |
929 | seq++; | 1052 | seq++; |
930 | } | 1053 | } |
931 | 1054 | ||
932 | /* last message in this dump */ | 1055 | /* last message fitting into this dump */ |
933 | next_seq = log_next_seq; | 1056 | next_seq = log_next_seq; |
934 | 1057 | ||
935 | len = 0; | 1058 | len = 0; |
1059 | prev = 0; | ||
936 | while (len >= 0 && seq < next_seq) { | 1060 | while (len >= 0 && seq < next_seq) { |
937 | struct log *msg = log_from_idx(idx); | 1061 | struct log *msg = log_from_idx(idx); |
938 | int textlen; | 1062 | int textlen; |
939 | 1063 | ||
940 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | 1064 | textlen = msg_print_text(msg, prev, true, text, |
1065 | LOG_LINE_MAX + PREFIX_MAX); | ||
941 | if (textlen < 0) { | 1066 | if (textlen < 0) { |
942 | len = textlen; | 1067 | len = textlen; |
943 | break; | 1068 | break; |
944 | } | 1069 | } |
945 | idx = log_next(idx); | 1070 | idx = log_next(idx); |
946 | seq++; | 1071 | seq++; |
1072 | prev = msg->flags; | ||
947 | 1073 | ||
948 | raw_spin_unlock_irq(&logbuf_lock); | 1074 | raw_spin_unlock_irq(&logbuf_lock); |
949 | if (copy_to_user(buf + len, text, textlen)) | 1075 | if (copy_to_user(buf + len, text, textlen)) |
@@ -956,6 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
956 | /* messages are gone, move to next one */ | 1082 | /* messages are gone, move to next one */ |
957 | seq = log_first_seq; | 1083 | seq = log_first_seq; |
958 | idx = log_first_idx; | 1084 | idx = log_first_idx; |
1085 | prev = 0; | ||
959 | } | 1086 | } |
960 | } | 1087 | } |
961 | } | 1088 | } |
@@ -1027,6 +1154,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1027 | /* Clear ring buffer */ | 1154 | /* Clear ring buffer */ |
1028 | case SYSLOG_ACTION_CLEAR: | 1155 | case SYSLOG_ACTION_CLEAR: |
1029 | syslog_print_all(NULL, 0, true); | 1156 | syslog_print_all(NULL, 0, true); |
1157 | break; | ||
1030 | /* Disable logging to console */ | 1158 | /* Disable logging to console */ |
1031 | case SYSLOG_ACTION_CONSOLE_OFF: | 1159 | case SYSLOG_ACTION_CONSOLE_OFF: |
1032 | if (saved_console_loglevel == -1) | 1160 | if (saved_console_loglevel == -1) |
@@ -1059,6 +1187,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1059 | /* messages are gone, move to first one */ | 1187 | /* messages are gone, move to first one */ |
1060 | syslog_seq = log_first_seq; | 1188 | syslog_seq = log_first_seq; |
1061 | syslog_idx = log_first_idx; | 1189 | syslog_idx = log_first_idx; |
1190 | syslog_prev = 0; | ||
1191 | syslog_partial = 0; | ||
1062 | } | 1192 | } |
1063 | if (from_file) { | 1193 | if (from_file) { |
1064 | /* | 1194 | /* |
@@ -1068,19 +1198,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1068 | */ | 1198 | */ |
1069 | error = log_next_idx - syslog_idx; | 1199 | error = log_next_idx - syslog_idx; |
1070 | } else { | 1200 | } else { |
1071 | u64 seq; | 1201 | u64 seq = syslog_seq; |
1072 | u32 idx; | 1202 | u32 idx = syslog_idx; |
1203 | enum log_flags prev = syslog_prev; | ||
1073 | 1204 | ||
1074 | error = 0; | 1205 | error = 0; |
1075 | seq = syslog_seq; | ||
1076 | idx = syslog_idx; | ||
1077 | while (seq < log_next_seq) { | 1206 | while (seq < log_next_seq) { |
1078 | struct log *msg = log_from_idx(idx); | 1207 | struct log *msg = log_from_idx(idx); |
1079 | 1208 | ||
1080 | error += msg_print_text(msg, true, NULL, 0); | 1209 | error += msg_print_text(msg, prev, true, NULL, 0); |
1081 | idx = log_next(idx); | 1210 | idx = log_next(idx); |
1082 | seq++; | 1211 | seq++; |
1212 | prev = msg->flags; | ||
1083 | } | 1213 | } |
1214 | error -= syslog_partial; | ||
1084 | } | 1215 | } |
1085 | raw_spin_unlock_irq(&logbuf_lock); | 1216 | raw_spin_unlock_irq(&logbuf_lock); |
1086 | break; | 1217 | break; |
@@ -1101,21 +1232,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
1101 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1232 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
1102 | } | 1233 | } |
1103 | 1234 | ||
1104 | #ifdef CONFIG_KGDB_KDB | ||
1105 | /* kdb dmesg command needs access to the syslog buffer. do_syslog() | ||
1106 | * uses locks so it cannot be used during debugging. Just tell kdb | ||
1107 | * where the start and end of the physical and logical logs are. This | ||
1108 | * is equivalent to do_syslog(3). | ||
1109 | */ | ||
1110 | void kdb_syslog_data(char *syslog_data[4]) | ||
1111 | { | ||
1112 | syslog_data[0] = log_buf; | ||
1113 | syslog_data[1] = log_buf + log_buf_len; | ||
1114 | syslog_data[2] = log_buf + log_first_idx; | ||
1115 | syslog_data[3] = log_buf + log_next_idx; | ||
1116 | } | ||
1117 | #endif /* CONFIG_KGDB_KDB */ | ||
1118 | |||
1119 | static bool __read_mostly ignore_loglevel; | 1235 | static bool __read_mostly ignore_loglevel; |
1120 | 1236 | ||
1121 | static int __init ignore_loglevel_setup(char *str) | 1237 | static int __init ignore_loglevel_setup(char *str) |
@@ -1259,22 +1375,121 @@ static inline void printk_delay(void) | |||
1259 | } | 1375 | } |
1260 | } | 1376 | } |
1261 | 1377 | ||
1378 | /* | ||
1379 | * Continuation lines are buffered, and not committed to the record buffer | ||
1380 | * until the line is complete, or a race forces it. The line fragments | ||
1381 | * though, are printed immediately to the consoles to ensure everything has | ||
1382 | * reached the console in case of a kernel crash. | ||
1383 | */ | ||
1384 | static struct cont { | ||
1385 | char buf[LOG_LINE_MAX]; | ||
1386 | size_t len; /* length == 0 means unused buffer */ | ||
1387 | size_t cons; /* bytes written to console */ | ||
1388 | struct task_struct *owner; /* task of first print */ ||
1389 | u64 ts_nsec; /* time of first print */ | ||
1390 | u8 level; /* log level of first message */ | ||
1391 | u8 facility; /* facility of first message */ ||
1392 | enum log_flags flags; /* prefix, newline flags */ | ||
1393 | bool flushed:1; /* buffer sealed and committed */ | ||
1394 | } cont; | ||
1395 | |||
1396 | static void cont_flush(enum log_flags flags) | ||
1397 | { | ||
1398 | if (cont.flushed) | ||
1399 | return; | ||
1400 | if (cont.len == 0) | ||
1401 | return; | ||
1402 | |||
1403 | if (cont.cons) { | ||
1404 | /* | ||
1405 | * If a fragment of this line was directly flushed to the | ||
1406 | * console, wait for the console to pick up the rest of the | ||
1407 | * line. LOG_NOCONS suppresses a duplicated output. | ||
1408 | */ | ||
1409 | log_store(cont.facility, cont.level, flags | LOG_NOCONS, | ||
1410 | cont.ts_nsec, NULL, 0, cont.buf, cont.len); | ||
1411 | cont.flags = flags; | ||
1412 | cont.flushed = true; | ||
1413 | } else { | ||
1414 | /* | ||
1415 | * If no fragment of this line ever reached the console, | ||
1416 | * just submit it to the store and free the buffer. | ||
1417 | */ | ||
1418 | log_store(cont.facility, cont.level, flags, 0, | ||
1419 | NULL, 0, cont.buf, cont.len); | ||
1420 | cont.len = 0; | ||
1421 | } | ||
1422 | } | ||
1423 | |||
1424 | static bool cont_add(int facility, int level, const char *text, size_t len) | ||
1425 | { | ||
1426 | if (cont.len && cont.flushed) | ||
1427 | return false; | ||
1428 | |||
1429 | if (cont.len + len > sizeof(cont.buf)) { | ||
1430 | /* the line gets too long, split it up into separate records */ | ||
1431 | cont_flush(LOG_CONT); | ||
1432 | return false; | ||
1433 | } | ||
1434 | |||
1435 | if (!cont.len) { | ||
1436 | cont.facility = facility; | ||
1437 | cont.level = level; | ||
1438 | cont.owner = current; | ||
1439 | cont.ts_nsec = local_clock(); | ||
1440 | cont.flags = 0; | ||
1441 | cont.cons = 0; | ||
1442 | cont.flushed = false; | ||
1443 | } | ||
1444 | |||
1445 | memcpy(cont.buf + cont.len, text, len); | ||
1446 | cont.len += len; | ||
1447 | |||
1448 | if (cont.len > (sizeof(cont.buf) * 80) / 100) | ||
1449 | cont_flush(LOG_CONT); | ||
1450 | |||
1451 | return true; | ||
1452 | } | ||
1453 | |||
1454 | static size_t cont_print_text(char *text, size_t size) | ||
1455 | { | ||
1456 | size_t textlen = 0; | ||
1457 | size_t len; | ||
1458 | |||
1459 | if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { | ||
1460 | textlen += print_time(cont.ts_nsec, text); | ||
1461 | size -= textlen; | ||
1462 | } | ||
1463 | |||
1464 | len = cont.len - cont.cons; | ||
1465 | if (len > 0) { | ||
1466 | if (len+1 > size) | ||
1467 | len = size-1; | ||
1468 | memcpy(text + textlen, cont.buf + cont.cons, len); | ||
1469 | textlen += len; | ||
1470 | cont.cons = cont.len; | ||
1471 | } | ||
1472 | |||
1473 | if (cont.flushed) { | ||
1474 | if (cont.flags & LOG_NEWLINE) | ||
1475 | text[textlen++] = '\n'; | ||
1476 | /* got everything, release buffer */ | ||
1477 | cont.len = 0; | ||
1478 | } | ||
1479 | return textlen; | ||
1480 | } | ||
1481 | |||
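Editor's note: a minimal usage sketch, not part of the patch, of how the cont buffer above merges line fragments from the same task into a single record; the driver name, message text, and irq variable are made up for illustration.

	/* two fragments, no trailing newline on the first one */
	printk(KERN_INFO "myserial: probing port...");	/* buffered in cont.buf, fragment also sent to console */
	printk(KERN_CONT " ok, irq %d\n", irq);		/* appended; trailing newline flushes it as one record */

The first call lands in cont.buf because it carries no newline; the second call appends to the same buffer (same owner, no new prefix) and the newline triggers cont_flush(), so the log ring sees one combined record while the consoles already received the fragments as they arrived.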
1262 | asmlinkage int vprintk_emit(int facility, int level, | 1482 | asmlinkage int vprintk_emit(int facility, int level, |
1263 | const char *dict, size_t dictlen, | 1483 | const char *dict, size_t dictlen, |
1264 | const char *fmt, va_list args) | 1484 | const char *fmt, va_list args) |
1265 | { | 1485 | { |
1266 | static int recursion_bug; | 1486 | static int recursion_bug; |
1267 | static char cont_buf[LOG_LINE_MAX]; | ||
1268 | static size_t cont_len; | ||
1269 | static int cont_level; | ||
1270 | static struct task_struct *cont_task; | ||
1271 | static char textbuf[LOG_LINE_MAX]; | 1487 | static char textbuf[LOG_LINE_MAX]; |
1272 | char *text = textbuf; | 1488 | char *text = textbuf; |
1273 | size_t text_len; | 1489 | size_t text_len; |
1490 | enum log_flags lflags = 0; | ||
1274 | unsigned long flags; | 1491 | unsigned long flags; |
1275 | int this_cpu; | 1492 | int this_cpu; |
1276 | bool newline = false; | ||
1277 | bool prefix = false; | ||
1278 | int printed_len = 0; | 1493 | int printed_len = 0; |
1279 | 1494 | ||
1280 | boot_delay_msec(); | 1495 | boot_delay_msec(); |
@@ -1313,7 +1528,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1313 | recursion_bug = 0; | 1528 | recursion_bug = 0; |
1314 | printed_len += strlen(recursion_msg); | 1529 | printed_len += strlen(recursion_msg); |
1315 | /* emit KERN_CRIT message */ | 1530 | /* emit KERN_CRIT message */ |
1316 | log_store(0, 2, NULL, 0, recursion_msg, printed_len); | 1531 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
1532 | NULL, 0, recursion_msg, printed_len); | ||
1317 | } | 1533 | } |
1318 | 1534 | ||
1319 | /* | 1535 | /* |
@@ -1325,81 +1541,67 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1325 | /* mark and strip a trailing newline */ | 1541 | /* mark and strip a trailing newline */ |
1326 | if (text_len && text[text_len-1] == '\n') { | 1542 | if (text_len && text[text_len-1] == '\n') { |
1327 | text_len--; | 1543 | text_len--; |
1328 | newline = true; | 1544 | lflags |= LOG_NEWLINE; |
1329 | } | 1545 | } |
1330 | 1546 | ||
1331 | /* strip syslog prefix and extract log level or control flags */ | 1547 | /* strip kernel syslog prefix and extract log level or control flags */ |
1332 | if (text[0] == '<' && text[1] && text[2] == '>') { | 1548 | if (facility == 0) { |
1333 | switch (text[1]) { | 1549 | int kern_level = printk_get_level(text); |
1334 | case '0' ... '7': | 1550 | |
1335 | if (level == -1) | 1551 | if (kern_level) { |
1336 | level = text[1] - '0'; | 1552 | const char *end_of_header = printk_skip_level(text); |
1337 | case 'd': /* KERN_DEFAULT */ | 1553 | switch (kern_level) { |
1338 | prefix = true; | 1554 | case '0' ... '7': |
1339 | case 'c': /* KERN_CONT */ | 1555 | if (level == -1) |
1340 | text += 3; | 1556 | level = kern_level - '0'; |
1341 | text_len -= 3; | 1557 | case 'd': /* KERN_DEFAULT */ |
1558 | lflags |= LOG_PREFIX; | ||
1559 | case 'c': /* KERN_CONT */ | ||
1560 | break; | ||
1561 | } | ||
1562 | text_len -= end_of_header - text; | ||
1563 | text = (char *)end_of_header; | ||
1342 | } | 1564 | } |
1343 | } | 1565 | } |
1344 | 1566 | ||
1345 | if (level == -1) | 1567 | if (level == -1) |
1346 | level = default_message_loglevel; | 1568 | level = default_message_loglevel; |
1347 | 1569 | ||
1348 | if (dict) { | 1570 | if (dict) |
1349 | prefix = true; | 1571 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
1350 | newline = true; | ||
1351 | } | ||
1352 | |||
1353 | if (!newline) { | ||
1354 | if (cont_len && (prefix || cont_task != current)) { | ||
1355 | /* | ||
1356 | * Flush earlier buffer, which is either from a | ||
1357 | * different thread, or when we got a new prefix. | ||
1358 | */ | ||
1359 | log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); | ||
1360 | cont_len = 0; | ||
1361 | } | ||
1362 | 1572 | ||
1363 | if (!cont_len) { | 1573 | if (!(lflags & LOG_NEWLINE)) { |
1364 | cont_level = level; | 1574 | /* |
1365 | cont_task = current; | 1575 | * Flush the conflicting buffer. An earlier newline was missing, |
1366 | } | 1576 | * or another task also prints continuation lines. |
1577 | */ | ||
1578 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) | ||
1579 | cont_flush(LOG_NEWLINE); | ||
1367 | 1580 | ||
1368 | /* buffer or append to earlier buffer from the same thread */ | 1581 | /* buffer line if possible, otherwise store it right away */ |
1369 | if (cont_len + text_len > sizeof(cont_buf)) | 1582 | if (!cont_add(facility, level, text, text_len)) |
1370 | text_len = sizeof(cont_buf) - cont_len; | 1583 | log_store(facility, level, lflags | LOG_CONT, 0, |
1371 | memcpy(cont_buf + cont_len, text, text_len); | 1584 | dict, dictlen, text, text_len); |
1372 | cont_len += text_len; | ||
1373 | } else { | 1585 | } else { |
1374 | if (cont_len && cont_task == current) { | 1586 | bool stored = false; |
1375 | if (prefix) { | ||
1376 | /* | ||
1377 | * New prefix from the same thread; flush. We | ||
1378 | * either got no earlier newline, or we race | ||
1379 | * with an interrupt. | ||
1380 | */ | ||
1381 | log_store(facility, cont_level, | ||
1382 | NULL, 0, cont_buf, cont_len); | ||
1383 | cont_len = 0; | ||
1384 | } | ||
1385 | 1587 | ||
1386 | /* append to the earlier buffer and flush */ | 1588 | /* |
1387 | if (cont_len + text_len > sizeof(cont_buf)) | 1589 | * If an earlier newline was missing and it was the same task, |
1388 | text_len = sizeof(cont_buf) - cont_len; | 1590 | * either merge it with the current buffer and flush, or if |
1389 | memcpy(cont_buf + cont_len, text, text_len); | 1591 | * there was a race with interrupts (prefix == true) then just |
1390 | cont_len += text_len; | 1592 | * flush it out and store this line separately. |
1391 | log_store(facility, cont_level, | 1593 | */ |
1392 | NULL, 0, cont_buf, cont_len); | 1594 | if (cont.len && cont.owner == current) { |
1393 | cont_len = 0; | 1595 | if (!(lflags & LOG_PREFIX)) |
1394 | cont_task = NULL; | 1596 | stored = cont_add(facility, level, text, text_len); |
1395 | printed_len = cont_len; | 1597 | cont_flush(LOG_NEWLINE); |
1396 | } else { | ||
1397 | /* ordinary single and terminated line */ | ||
1398 | log_store(facility, level, | ||
1399 | dict, dictlen, text, text_len); | ||
1400 | printed_len = text_len; | ||
1401 | } | 1598 | } |
1599 | |||
1600 | if (!stored) | ||
1601 | log_store(facility, level, lflags, 0, | ||
1602 | dict, dictlen, text, text_len); | ||
1402 | } | 1603 | } |
1604 | printed_len += text_len; | ||
1403 | 1605 | ||
1404 | /* | 1606 | /* |
1405 | * Try to acquire and then immediately release the console semaphore. | 1607 | * Try to acquire and then immediately release the console semaphore. |
@@ -1483,14 +1685,32 @@ asmlinkage int printk(const char *fmt, ...) | |||
1483 | } | 1685 | } |
1484 | EXPORT_SYMBOL(printk); | 1686 | EXPORT_SYMBOL(printk); |
1485 | 1687 | ||
1486 | #else | 1688 | #else /* CONFIG_PRINTK */ |
1487 | 1689 | ||
1690 | #define LOG_LINE_MAX 0 | ||
1691 | #define PREFIX_MAX 0 | ||
1488 | #define LOG_LINE_MAX 0 | 1692 | #define LOG_LINE_MAX 0 |
1693 | static u64 syslog_seq; | ||
1694 | static u32 syslog_idx; | ||
1695 | static u64 console_seq; | ||
1696 | static u32 console_idx; | ||
1697 | static enum log_flags syslog_prev; | ||
1698 | static u64 log_first_seq; | ||
1699 | static u32 log_first_idx; | ||
1700 | static u64 log_next_seq; | ||
1701 | static enum log_flags console_prev; | ||
1702 | static struct cont { | ||
1703 | size_t len; | ||
1704 | size_t cons; | ||
1705 | u8 level; | ||
1706 | bool flushed:1; | ||
1707 | } cont; | ||
1489 | static struct log *log_from_idx(u32 idx) { return NULL; } | 1708 | static struct log *log_from_idx(u32 idx) { return NULL; } |
1490 | static u32 log_next(u32 idx) { return 0; } | 1709 | static u32 log_next(u32 idx) { return 0; } |
1491 | static void call_console_drivers(int level, const char *text, size_t len) {} | 1710 | static void call_console_drivers(int level, const char *text, size_t len) {} |
1492 | static size_t msg_print_text(const struct log *msg, bool syslog, | 1711 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
1493 | char *buf, size_t size) { return 0; } | 1712 | bool syslog, char *buf, size_t size) { return 0; } |
1713 | static size_t cont_print_text(char *text, size_t size) { return 0; } | ||
1494 | 1714 | ||
1495 | #endif /* CONFIG_PRINTK */ | 1715 | #endif /* CONFIG_PRINTK */ |
1496 | 1716 | ||
@@ -1762,9 +1982,34 @@ void wake_up_klogd(void) | |||
1762 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1982 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1763 | } | 1983 | } |
1764 | 1984 | ||
1765 | /* the next printk record to write to the console */ | 1985 | static void console_cont_flush(char *text, size_t size) |
1766 | static u64 console_seq; | 1986 | { |
1767 | static u32 console_idx; | 1987 | unsigned long flags; |
1988 | size_t len; | ||
1989 | |||
1990 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
1991 | |||
1992 | if (!cont.len) | ||
1993 | goto out; | ||
1994 | |||
1995 | /* | ||
1996 | * We still queue earlier records, likely because the console was | ||
1997 | * busy. The earlier ones need to be printed before this one; we | ||
1998 | * did not flush any fragment so far, so just let it queue up. | ||
1999 | */ | ||
2000 | if (console_seq < log_next_seq && !cont.cons) | ||
2001 | goto out; | ||
2002 | |||
2003 | len = cont_print_text(text, size); | ||
2004 | raw_spin_unlock(&logbuf_lock); | ||
2005 | stop_critical_timings(); | ||
2006 | call_console_drivers(cont.level, text, len); | ||
2007 | start_critical_timings(); | ||
2008 | local_irq_restore(flags); | ||
2009 | return; | ||
2010 | out: | ||
2011 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2012 | } | ||
1768 | 2013 | ||
1769 | /** | 2014 | /** |
1770 | * console_unlock - unlock the console system | 2015 | * console_unlock - unlock the console system |
@@ -1782,6 +2027,7 @@ static u32 console_idx; | |||
1782 | */ | 2027 | */ |
1783 | void console_unlock(void) | 2028 | void console_unlock(void) |
1784 | { | 2029 | { |
2030 | static char text[LOG_LINE_MAX + PREFIX_MAX]; | ||
1785 | static u64 seen_seq; | 2031 | static u64 seen_seq; |
1786 | unsigned long flags; | 2032 | unsigned long flags; |
1787 | bool wake_klogd = false; | 2033 | bool wake_klogd = false; |
@@ -1794,10 +2040,11 @@ void console_unlock(void) | |||
1794 | 2040 | ||
1795 | console_may_schedule = 0; | 2041 | console_may_schedule = 0; |
1796 | 2042 | ||
2043 | /* flush buffered message fragment immediately to console */ | ||
2044 | console_cont_flush(text, sizeof(text)); | ||
1797 | again: | 2045 | again: |
1798 | for (;;) { | 2046 | for (;;) { |
1799 | struct log *msg; | 2047 | struct log *msg; |
1800 | static char text[LOG_LINE_MAX]; | ||
1801 | size_t len; | 2048 | size_t len; |
1802 | int level; | 2049 | int level; |
1803 | 2050 | ||
@@ -1811,18 +2058,36 @@ again: | |||
1811 | /* messages are gone, move to first one */ | 2058 | /* messages are gone, move to first one */ |
1812 | console_seq = log_first_seq; | 2059 | console_seq = log_first_seq; |
1813 | console_idx = log_first_idx; | 2060 | console_idx = log_first_idx; |
2061 | console_prev = 0; | ||
1814 | } | 2062 | } |
1815 | 2063 | skip: | |
1816 | if (console_seq == log_next_seq) | 2064 | if (console_seq == log_next_seq) |
1817 | break; | 2065 | break; |
1818 | 2066 | ||
1819 | msg = log_from_idx(console_idx); | 2067 | msg = log_from_idx(console_idx); |
1820 | level = msg->level & 7; | 2068 | if (msg->flags & LOG_NOCONS) { |
1821 | 2069 | /* | |
1822 | len = msg_print_text(msg, false, text, sizeof(text)); | 2070 | * Skip record we have buffered and already printed |
2071 | * directly to the console when we received it. | ||
2072 | */ | ||
2073 | console_idx = log_next(console_idx); | ||
2074 | console_seq++; | ||
2075 | /* | ||
2076 | * We will get here again when we register a new | ||
2077 | * CON_PRINTBUFFER console. Clear the flag so we | ||
2078 | * will properly dump everything later. | ||
2079 | */ | ||
2080 | msg->flags &= ~LOG_NOCONS; | ||
2081 | console_prev = msg->flags; | ||
2082 | goto skip; | ||
2083 | } | ||
1823 | 2084 | ||
2085 | level = msg->level; | ||
2086 | len = msg_print_text(msg, console_prev, false, | ||
2087 | text, sizeof(text)); | ||
1824 | console_idx = log_next(console_idx); | 2088 | console_idx = log_next(console_idx); |
1825 | console_seq++; | 2089 | console_seq++; |
2090 | console_prev = msg->flags; | ||
1826 | raw_spin_unlock(&logbuf_lock); | 2091 | raw_spin_unlock(&logbuf_lock); |
1827 | 2092 | ||
1828 | stop_critical_timings(); /* don't trace print latency */ | 2093 | stop_critical_timings(); /* don't trace print latency */ |
@@ -2085,6 +2350,7 @@ void register_console(struct console *newcon) | |||
2085 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2350 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
2086 | console_seq = syslog_seq; | 2351 | console_seq = syslog_seq; |
2087 | console_idx = syslog_idx; | 2352 | console_idx = syslog_idx; |
2353 | console_prev = syslog_prev; | ||
2088 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2354 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
2089 | /* | 2355 | /* |
2090 | * We're about to replay the log buffer. Only do this to the | 2356 | * We're about to replay the log buffer. Only do this to the |
@@ -2300,48 +2566,256 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | |||
2300 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2566 | * kmsg_dump - dump kernel log to kernel message dumpers. |
2301 | * @reason: the reason (oops, panic etc) for dumping | 2567 | * @reason: the reason (oops, panic etc) for dumping |
2302 | * | 2568 | * |
2303 | * Iterate through each of the dump devices and call the oops/panic | 2569 | * Call each registered dumper's dump() callback, which can |
2304 | * callbacks with the log buffer. | 2570 | * retrieve the kmsg records with kmsg_dump_get_line() or |
2571 | * kmsg_dump_get_buffer(). | ||
2305 | */ | 2572 | */ |
2306 | void kmsg_dump(enum kmsg_dump_reason reason) | 2573 | void kmsg_dump(enum kmsg_dump_reason reason) |
2307 | { | 2574 | { |
2308 | u64 idx; | ||
2309 | struct kmsg_dumper *dumper; | 2575 | struct kmsg_dumper *dumper; |
2310 | const char *s1, *s2; | ||
2311 | unsigned long l1, l2; | ||
2312 | unsigned long flags; | 2576 | unsigned long flags; |
2313 | 2577 | ||
2314 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) | 2578 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) |
2315 | return; | 2579 | return; |
2316 | 2580 | ||
2317 | /* Theoretically, the log could move on after we do this, but | 2581 | rcu_read_lock(); |
2318 | there's not a lot we can do about that. The new messages | 2582 | list_for_each_entry_rcu(dumper, &dump_list, list) { |
2319 | will overwrite the start of what we dump. */ | 2583 | if (dumper->max_reason && reason > dumper->max_reason) |
2584 | continue; | ||
2585 | |||
2586 | /* initialize iterator with data about the stored records */ | ||
2587 | dumper->active = true; | ||
2588 | |||
2589 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2590 | dumper->cur_seq = clear_seq; | ||
2591 | dumper->cur_idx = clear_idx; | ||
2592 | dumper->next_seq = log_next_seq; | ||
2593 | dumper->next_idx = log_next_idx; | ||
2594 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2595 | |||
2596 | /* invoke dumper which will iterate over records */ | ||
2597 | dumper->dump(dumper, reason); | ||
2598 | |||
2599 | /* reset iterator */ | ||
2600 | dumper->active = false; | ||
2601 | } | ||
2602 | rcu_read_unlock(); | ||
2603 | } | ||
2604 | |||
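A hedged sketch, not from this patch, of how a dumper plugs into the reworked kmsg_dump() above. kmsg_dump_register() is the existing registration helper; my_dump() and my_dumper are made-up names.

	#include <linux/kmsg_dump.h>

	static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
	{
		/* walk the records here with kmsg_dump_get_line() or kmsg_dump_get_buffer() */
	}

	static struct kmsg_dumper my_dumper = {
		.dump       = my_dump,
		.max_reason = KMSG_DUMP_OOPS,	/* only called for oops and panic */
	};

	/* in module/driver init: */
	kmsg_dump_register(&my_dumper);

With the new scheme the dumper no longer receives raw buffer pointers; kmsg_dump() only initializes the iterator (cur/next seq and idx) and the callback pulls formatted records itself.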
2605 | /** | ||
2606 | * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) | ||
2607 | * @dumper: registered kmsg dumper | ||
2608 | * @syslog: include the "<4>" prefixes | ||
2609 | * @line: buffer to copy the line to | ||
2610 | * @size: maximum size of the buffer | ||
2611 | * @len: length of line placed into buffer | ||
2612 | * | ||
2613 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
2614 | * record, and copy one record into the provided buffer. | ||
2615 | * | ||
2616 | * Consecutive calls will return the next available record moving | ||
2617 | * towards the end of the buffer with the youngest messages. | ||
2618 | * | ||
2619 | * A return value of FALSE indicates that there are no more records to | ||
2620 | * read. | ||
2621 | * | ||
2622 | * The function is similar to kmsg_dump_get_line(), but grabs no locks. | ||
2623 | */ | ||
2624 | bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, | ||
2625 | char *line, size_t size, size_t *len) | ||
2626 | { | ||
2627 | struct log *msg; | ||
2628 | size_t l = 0; | ||
2629 | bool ret = false; | ||
2630 | |||
2631 | if (!dumper->active) | ||
2632 | goto out; | ||
2633 | |||
2634 | if (dumper->cur_seq < log_first_seq) { | ||
2635 | /* messages are gone, move to first available one */ | ||
2636 | dumper->cur_seq = log_first_seq; | ||
2637 | dumper->cur_idx = log_first_idx; | ||
2638 | } | ||
2639 | |||
2640 | /* last entry */ | ||
2641 | if (dumper->cur_seq >= log_next_seq) | ||
2642 | goto out; | ||
2643 | |||
2644 | msg = log_from_idx(dumper->cur_idx); | ||
2645 | l = msg_print_text(msg, 0, syslog, line, size); | ||
2646 | |||
2647 | dumper->cur_idx = log_next(dumper->cur_idx); | ||
2648 | dumper->cur_seq++; | ||
2649 | ret = true; | ||
2650 | out: | ||
2651 | if (len) | ||
2652 | *len = l; | ||
2653 | return ret; | ||
2654 | } | ||
2655 | |||
2656 | /** | ||
2657 | * kmsg_dump_get_line - retrieve one kmsg log line | ||
2658 | * @dumper: registered kmsg dumper | ||
2659 | * @syslog: include the "<4>" prefixes | ||
2660 | * @line: buffer to copy the line to | ||
2661 | * @size: maximum size of the buffer | ||
2662 | * @len: length of line placed into buffer | ||
2663 | * | ||
2664 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
2665 | * record, and copy one record into the provided buffer. | ||
2666 | * | ||
2667 | * Consecutive calls will return the next available record moving | ||
2668 | * towards the end of the buffer with the youngest messages. | ||
2669 | * | ||
2670 | * A return value of FALSE indicates that there are no more records to | ||
2671 | * read. | ||
2672 | */ | ||
2673 | bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | ||
2674 | char *line, size_t size, size_t *len) | ||
2675 | { | ||
2676 | unsigned long flags; | ||
2677 | bool ret; | ||
2320 | 2678 | ||
2321 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2679 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
2322 | if (syslog_seq < log_first_seq) | 2680 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); |
2323 | idx = syslog_idx; | 2681 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
2324 | else | ||
2325 | idx = log_first_idx; | ||
2326 | 2682 | ||
2327 | if (idx > log_next_idx) { | 2683 | return ret; |
2328 | s1 = log_buf; | 2684 | } |
2329 | l1 = log_next_idx; | 2685 | EXPORT_SYMBOL_GPL(kmsg_dump_get_line); |
2330 | 2686 | ||
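A hedged example of the per-record iteration the kerneldoc above describes, fleshing out the dump() callback from the earlier sketch; the line buffer size and the write_record() sink are illustrative assumptions.

	static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
	{
		static char line[1024];
		size_t len;

		/* oldest to newest, one formatted record per call */
		while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
			write_record(line, len);	/* hypothetical persistent sink */
	}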
2331 | s2 = log_buf + idx; | 2687 | /** |
2332 | l2 = log_buf_len - idx; | 2688 | * kmsg_dump_get_buffer - copy kmsg log lines |
2333 | } else { | 2689 | * @dumper: registered kmsg dumper |
2334 | s1 = ""; | 2690 | * @syslog: include the "<4>" prefixes |
2335 | l1 = 0; | 2691 | * @buf: buffer to copy the line to |
2692 | * @size: maximum size of the buffer | ||
2693 | * @len: length of data placed into buffer | ||
2694 | * | ||
2695 | * Start at the end of the kmsg buffer and fill the provided buffer | ||
2696 | * with as many of the *youngest* kmsg records as fit into it. | ||
2697 | * If the buffer is large enough, all available kmsg records will be | ||
2698 | * copied with a single call. | ||
2699 | * | ||
2700 | * Consecutive calls will fill the buffer with the next block of | ||
2701 | * available older records, not including the earlier retrieved ones. | ||
2702 | * | ||
2703 | * A return value of FALSE indicates that there are no more records to | ||
2704 | * read. | ||
2705 | */ | ||
2706 | bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | ||
2707 | char *buf, size_t size, size_t *len) | ||
2708 | { | ||
2709 | unsigned long flags; | ||
2710 | u64 seq; | ||
2711 | u32 idx; | ||
2712 | u64 next_seq; | ||
2713 | u32 next_idx; | ||
2714 | enum log_flags prev; | ||
2715 | size_t l = 0; | ||
2716 | bool ret = false; | ||
2717 | |||
2718 | if (!dumper->active) | ||
2719 | goto out; | ||
2720 | |||
2721 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2722 | if (dumper->cur_seq < log_first_seq) { | ||
2723 | /* messages are gone, move to first available one */ | ||
2724 | dumper->cur_seq = log_first_seq; | ||
2725 | dumper->cur_idx = log_first_idx; | ||
2726 | } | ||
2336 | 2727 | ||
2337 | s2 = log_buf + idx; | 2728 | /* last entry */ |
2338 | l2 = log_next_idx - idx; | 2729 | if (dumper->cur_seq >= dumper->next_seq) { |
2730 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2731 | goto out; | ||
2339 | } | 2732 | } |
2733 | |||
2734 | /* calculate length of entire buffer */ | ||
2735 | seq = dumper->cur_seq; | ||
2736 | idx = dumper->cur_idx; | ||
2737 | prev = 0; | ||
2738 | while (seq < dumper->next_seq) { | ||
2739 | struct log *msg = log_from_idx(idx); | ||
2740 | |||
2741 | l += msg_print_text(msg, prev, true, NULL, 0); | ||
2742 | idx = log_next(idx); | ||
2743 | seq++; | ||
2744 | prev = msg->flags; | ||
2745 | } | ||
2746 | |||
2747 | /* move first record forward until length fits into the buffer */ | ||
2748 | seq = dumper->cur_seq; | ||
2749 | idx = dumper->cur_idx; | ||
2750 | prev = 0; | ||
2751 | while (l > size && seq < dumper->next_seq) { | ||
2752 | struct log *msg = log_from_idx(idx); | ||
2753 | |||
2754 | l -= msg_print_text(msg, prev, true, NULL, 0); | ||
2755 | idx = log_next(idx); | ||
2756 | seq++; | ||
2757 | prev = msg->flags; | ||
2758 | } | ||
2759 | |||
2760 | /* last message in next iteration */ | ||
2761 | next_seq = seq; | ||
2762 | next_idx = idx; | ||
2763 | |||
2764 | l = 0; | ||
2765 | prev = 0; | ||
2766 | while (seq < dumper->next_seq) { | ||
2767 | struct log *msg = log_from_idx(idx); | ||
2768 | |||
2769 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); | ||
2770 | idx = log_next(idx); | ||
2771 | seq++; | ||
2772 | prev = msg->flags; | ||
2773 | } | ||
2774 | |||
2775 | dumper->next_seq = next_seq; | ||
2776 | dumper->next_idx = next_idx; | ||
2777 | ret = true; | ||
2340 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2778 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
2779 | out: | ||
2780 | if (len) | ||
2781 | *len = l; | ||
2782 | return ret; | ||
2783 | } | ||
2784 | EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); | ||
2341 | 2785 | ||
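A hedged sketch of the bulk variant documented above: one call fills the caller's buffer with the youngest records that fit, trimming older ones first; buf and write_record() are again illustrative names.

	static char buf[4096];
	size_t len;

	/* inside the dump() callback: youngest records that fit, in chronological order */
	if (kmsg_dump_get_buffer(dumper, true, buf, sizeof(buf), &len))
		write_record(buf, len);		/* hypothetical persistent sink */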
2342 | rcu_read_lock(); | 2786 | /** |
2343 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2787 | * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) |
2344 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 2788 | * @dumper: registered kmsg dumper |
2345 | rcu_read_unlock(); | 2789 | * |
2790 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
2791 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
2792 | * times within the same dumper.dump() callback. | ||
2793 | * | ||
2794 | * The function is similar to kmsg_dump_rewind(), but grabs no locks. | ||
2795 | */ | ||
2796 | void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) | ||
2797 | { | ||
2798 | dumper->cur_seq = clear_seq; | ||
2799 | dumper->cur_idx = clear_idx; | ||
2800 | dumper->next_seq = log_next_seq; | ||
2801 | dumper->next_idx = log_next_idx; | ||
2802 | } | ||
2803 | |||
2804 | /** | ||
2805 | * kmsg_dump_rewind - reset the iterator | ||
2806 | * @dumper: registered kmsg dumper | ||
2807 | * | ||
2808 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
2809 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
2810 | * times within the same dumper.dump() callback. | ||
2811 | */ | ||
2812 | void kmsg_dump_rewind(struct kmsg_dumper *dumper) | ||
2813 | { | ||
2814 | unsigned long flags; | ||
2815 | |||
2816 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2817 | kmsg_dump_rewind_nolock(dumper); | ||
2818 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2346 | } | 2819 | } |
2820 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); | ||
2347 | #endif | 2821 | #endif |
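A hedged sketch of the two-pass pattern that kmsg_dump_rewind() above enables within a single dump() callback; pass1() and pass2() are placeholder helpers, not kernel APIs.

	static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
	{
		static char line[1024];
		size_t len;

		while (kmsg_dump_get_line(dumper, false, line, sizeof(line), &len))
			pass1(line, len);		/* e.g. size or checksum the records */

		kmsg_dump_rewind(dumper);		/* reset cur/next iterator state */

		while (kmsg_dump_get_line(dumper, false, line, sizeof(line), &len))
			pass2(line, len);		/* e.g. actually persist them */
	}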
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 95cba41ce1e9..4e6a61b15e86 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -54,6 +54,50 @@ | |||
54 | #ifdef CONFIG_PREEMPT_RCU | 54 | #ifdef CONFIG_PREEMPT_RCU |
55 | 55 | ||
56 | /* | 56 | /* |
57 | * Preemptible RCU implementation for rcu_read_lock(). | ||
58 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
59 | * if we block. | ||
60 | */ | ||
61 | void __rcu_read_lock(void) | ||
62 | { | ||
63 | current->rcu_read_lock_nesting++; | ||
64 | barrier(); /* critical section after entry code. */ | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
67 | |||
68 | /* | ||
69 | * Preemptible RCU implementation for rcu_read_unlock(). | ||
70 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
71 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
72 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
73 | * in an RCU read-side critical section and other special cases. | ||
74 | */ | ||
75 | void __rcu_read_unlock(void) | ||
76 | { | ||
77 | struct task_struct *t = current; | ||
78 | |||
79 | if (t->rcu_read_lock_nesting != 1) { | ||
80 | --t->rcu_read_lock_nesting; | ||
81 | } else { | ||
82 | barrier(); /* critical section before exit code. */ | ||
83 | t->rcu_read_lock_nesting = INT_MIN; | ||
84 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
85 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
86 | rcu_read_unlock_special(t); | ||
87 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
88 | t->rcu_read_lock_nesting = 0; | ||
89 | } | ||
90 | #ifdef CONFIG_PROVE_LOCKING | ||
91 | { | ||
92 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
93 | |||
94 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
95 | } | ||
96 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
97 | } | ||
98 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
99 | |||
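A hedged reader-side sketch, not part of the patch: under CONFIG_PREEMPT_RCU, rcu_read_lock()/rcu_read_unlock() expand to the __rcu_read_lock()/__rcu_read_unlock() primitives consolidated above; gp and use_foo() are made-up names for an RCU-protected pointer and its consumer.

	struct foo *p;

	rcu_read_lock();			/* current->rcu_read_lock_nesting++ */
	p = rcu_dereference(gp);		/* gp: hypothetical RCU-protected pointer */
	if (p)
		use_foo(p);			/* no blocking inside the read-side section */
	rcu_read_unlock();			/* outermost unlock may call rcu_read_unlock_special() */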
100 | /* | ||
57 | * Check for a task exiting while in a preemptible-RCU read-side | 101 | * Check for a task exiting while in a preemptible-RCU read-side |
58 | * critical section, clean up if so. No need to issue warnings, | 102 | * critical section, clean up if so. No need to issue warnings, |
59 | * as debug_check_no_locks_held() already does this if lockdep | 103 | * as debug_check_no_locks_held() already does this if lockdep |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 37a5444204d2..547b1fe5b052 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -172,7 +172,7 @@ void rcu_irq_enter(void) | |||
172 | local_irq_restore(flags); | 172 | local_irq_restore(flags); |
173 | } | 173 | } |
174 | 174 | ||
175 | #ifdef CONFIG_PROVE_RCU | 175 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
176 | 176 | ||
177 | /* | 177 | /* |
178 | * Test whether RCU thinks that the current CPU is idle. | 178 | * Test whether RCU thinks that the current CPU is idle. |
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void) | |||
183 | } | 183 | } |
184 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 184 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
185 | 185 | ||
186 | #endif /* #ifdef CONFIG_PROVE_RCU */ | 186 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
187 | 187 | ||
188 | /* | 188 | /* |
189 | * Test whether the current CPU was interrupted from idle. Nested | 189 | * Test whether the current CPU was interrupted from idle. Nested |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..918fd1e8509c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | |||
132 | RCU_TRACE(.rcb.name = "rcu_preempt") | 132 | RCU_TRACE(.rcb.name = "rcu_preempt") |
133 | }; | 133 | }; |
134 | 134 | ||
135 | static void rcu_read_unlock_special(struct task_struct *t); | ||
136 | static int rcu_preempted_readers_exp(void); | 135 | static int rcu_preempted_readers_exp(void); |
137 | static void rcu_report_exp_done(void); | 136 | static void rcu_report_exp_done(void); |
138 | 137 | ||
@@ -351,8 +350,9 @@ static int rcu_initiate_boost(void) | |||
351 | rcu_preempt_ctrlblk.boost_tasks = | 350 | rcu_preempt_ctrlblk.boost_tasks = |
352 | rcu_preempt_ctrlblk.gp_tasks; | 351 | rcu_preempt_ctrlblk.gp_tasks; |
353 | invoke_rcu_callbacks(); | 352 | invoke_rcu_callbacks(); |
354 | } else | 353 | } else { |
355 | RCU_TRACE(rcu_initiate_boost_trace()); | 354 | RCU_TRACE(rcu_initiate_boost_trace()); |
355 | } | ||
356 | return 1; | 356 | return 1; |
357 | } | 357 | } |
358 | 358 | ||
@@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void) | |||
527 | } | 527 | } |
528 | 528 | ||
529 | /* | 529 | /* |
530 | * Tiny-preemptible RCU implementation for rcu_read_lock(). | ||
531 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
532 | * if we block. | ||
533 | */ | ||
534 | void __rcu_read_lock(void) | ||
535 | { | ||
536 | current->rcu_read_lock_nesting++; | ||
537 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ | ||
538 | } | ||
539 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
540 | |||
541 | /* | ||
542 | * Handle special cases during rcu_read_unlock(), such as needing to | 530 | * Handle special cases during rcu_read_unlock(), such as needing to |
543 | * notify RCU core processing or task having blocked during the RCU | 531 | * notify RCU core processing or task having blocked during the RCU |
544 | * read-side critical section. | 532 | * read-side critical section. |
545 | */ | 533 | */ |
546 | static noinline void rcu_read_unlock_special(struct task_struct *t) | 534 | void rcu_read_unlock_special(struct task_struct *t) |
547 | { | 535 | { |
548 | int empty; | 536 | int empty; |
549 | int empty_exp; | 537 | int empty_exp; |
@@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
627 | } | 615 | } |
628 | 616 | ||
629 | /* | 617 | /* |
630 | * Tiny-preemptible RCU implementation for rcu_read_unlock(). | ||
631 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
632 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
633 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
634 | * in an RCU read-side critical section and other special cases. | ||
635 | */ | ||
636 | void __rcu_read_unlock(void) | ||
637 | { | ||
638 | struct task_struct *t = current; | ||
639 | |||
640 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ | ||
641 | if (t->rcu_read_lock_nesting != 1) | ||
642 | --t->rcu_read_lock_nesting; | ||
643 | else { | ||
644 | t->rcu_read_lock_nesting = INT_MIN; | ||
645 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
646 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
647 | rcu_read_unlock_special(t); | ||
648 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
649 | t->rcu_read_lock_nesting = 0; | ||
650 | } | ||
651 | #ifdef CONFIG_PROVE_LOCKING | ||
652 | { | ||
653 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
654 | |||
655 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
656 | } | ||
657 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
658 | } | ||
659 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
660 | |||
661 | /* | ||
662 | * Check for a quiescent state from the current CPU. When a task blocks, | 618 | * Check for a quiescent state from the current CPU. When a task blocks, |
663 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is | 619 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is |
664 | * checked elsewhere. This is called from the scheduling-clock interrupt. | 620 | * checked elsewhere. This is called from the scheduling-clock interrupt. |
@@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void) | |||
823 | rpcp->exp_tasks = NULL; | 779 | rpcp->exp_tasks = NULL; |
824 | 780 | ||
825 | /* Wait for tail of ->blkd_tasks list to drain. */ | 781 | /* Wait for tail of ->blkd_tasks list to drain. */ |
826 | if (!rcu_preempted_readers_exp()) | 782 | if (!rcu_preempted_readers_exp()) { |
827 | local_irq_restore(flags); | 783 | local_irq_restore(flags); |
828 | else { | 784 | } else { |
829 | rcu_initiate_boost(); | 785 | rcu_initiate_boost(); |
830 | local_irq_restore(flags); | 786 | local_irq_restore(flags); |
831 | wait_event(sync_rcu_preempt_exp_wq, | 787 | wait_event(sync_rcu_preempt_exp_wq, |
@@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
846 | */ | 802 | */ |
847 | int rcu_preempt_needs_cpu(void) | 803 | int rcu_preempt_needs_cpu(void) |
848 | { | 804 | { |
849 | if (!rcu_preempt_running_reader()) | ||
850 | rcu_preempt_cpu_qs(); | ||
851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 805 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
852 | } | 806 | } |
853 | 807 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e66b34ab7555..25b15033c61f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -49,8 +49,7 @@ | |||
49 | #include <asm/byteorder.h> | 49 | #include <asm/byteorder.h> |
50 | 50 | ||
51 | MODULE_LICENSE("GPL"); | 51 | MODULE_LICENSE("GPL"); |
52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
53 | "Josh Triplett <josh@freedesktop.org>"); | ||
54 | 53 | ||
55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 54 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ |
56 | static int nfakewriters = 4; /* # fake writer threads */ | 55 | static int nfakewriters = 4; /* # fake writer threads */ |
@@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */ | |||
206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 205 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
207 | /* and boost task create/destroy. */ | 206 | /* and boost task create/destroy. */ |
208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | 207 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ |
208 | static bool barrier_phase; /* Test phase. */ | ||
209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | 209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ |
210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | 210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ |
211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | 211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); |
@@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p) | |||
407 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | 407 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { |
408 | rp->rtort_mbtest = 0; | 408 | rp->rtort_mbtest = 0; |
409 | rcu_torture_free(rp); | 409 | rcu_torture_free(rp); |
410 | } else | 410 | } else { |
411 | cur_ops->deferred_free(rp); | 411 | cur_ops->deferred_free(rp); |
412 | } | ||
412 | } | 413 | } |
413 | 414 | ||
414 | static int rcu_no_completed(void) | 415 | static int rcu_no_completed(void) |
@@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void) | |||
635 | synchronize_srcu(&srcu_ctl); | 636 | synchronize_srcu(&srcu_ctl); |
636 | } | 637 | } |
637 | 638 | ||
639 | static void srcu_torture_call(struct rcu_head *head, | ||
640 | void (*func)(struct rcu_head *head)) | ||
641 | { | ||
642 | call_srcu(&srcu_ctl, head, func); | ||
643 | } | ||
644 | |||
645 | static void srcu_torture_barrier(void) | ||
646 | { | ||
647 | srcu_barrier(&srcu_ctl); | ||
648 | } | ||
649 | |||
638 | static int srcu_torture_stats(char *page) | 650 | static int srcu_torture_stats(char *page) |
639 | { | 651 | { |
640 | int cnt = 0; | 652 | int cnt = 0; |
@@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = { | |||
661 | .completed = srcu_torture_completed, | 673 | .completed = srcu_torture_completed, |
662 | .deferred_free = srcu_torture_deferred_free, | 674 | .deferred_free = srcu_torture_deferred_free, |
663 | .sync = srcu_torture_synchronize, | 675 | .sync = srcu_torture_synchronize, |
664 | .call = NULL, | 676 | .call = srcu_torture_call, |
665 | .cb_barrier = NULL, | 677 | .cb_barrier = srcu_torture_barrier, |
666 | .stats = srcu_torture_stats, | 678 | .stats = srcu_torture_stats, |
667 | .name = "srcu" | 679 | .name = "srcu" |
668 | }; | 680 | }; |
@@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg) | |||
1013 | do { | 1025 | do { |
1014 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 1026 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
1015 | udelay(rcu_random(&rand) & 0x3ff); | 1027 | udelay(rcu_random(&rand) & 0x3ff); |
1016 | cur_ops->sync(); | 1028 | if (cur_ops->cb_barrier != NULL && |
1029 | rcu_random(&rand) % (nfakewriters * 8) == 0) | ||
1030 | cur_ops->cb_barrier(); | ||
1031 | else | ||
1032 | cur_ops->sync(); | ||
1017 | rcu_stutter_wait("rcu_torture_fakewriter"); | 1033 | rcu_stutter_wait("rcu_torture_fakewriter"); |
1018 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 1034 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
1019 | 1035 | ||
@@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page) | |||
1183 | } | 1199 | } |
1184 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1200 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
1185 | cnt += sprintf(&page[cnt], | 1201 | cnt += sprintf(&page[cnt], |
1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1202 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", |
1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " | ||
1188 | "rtbf: %ld rtb: %ld nt: %ld " | ||
1189 | "onoff: %ld/%ld:%ld/%ld " | ||
1190 | "barrier: %ld/%ld:%ld", | ||
1191 | rcu_torture_current, | 1203 | rcu_torture_current, |
1192 | rcu_torture_current_version, | 1204 | rcu_torture_current_version, |
1193 | list_empty(&rcu_torture_freelist), | 1205 | list_empty(&rcu_torture_freelist), |
1194 | atomic_read(&n_rcu_torture_alloc), | 1206 | atomic_read(&n_rcu_torture_alloc), |
1195 | atomic_read(&n_rcu_torture_alloc_fail), | 1207 | atomic_read(&n_rcu_torture_alloc_fail), |
1196 | atomic_read(&n_rcu_torture_free), | 1208 | atomic_read(&n_rcu_torture_free)); |
1209 | cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", | ||
1197 | atomic_read(&n_rcu_torture_mberror), | 1210 | atomic_read(&n_rcu_torture_mberror), |
1198 | n_rcu_torture_boost_ktrerror, | 1211 | n_rcu_torture_boost_ktrerror, |
1199 | n_rcu_torture_boost_rterror, | 1212 | n_rcu_torture_boost_rterror); |
1213 | cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", | ||
1200 | n_rcu_torture_boost_failure, | 1214 | n_rcu_torture_boost_failure, |
1201 | n_rcu_torture_boosts, | 1215 | n_rcu_torture_boosts, |
1202 | n_rcu_torture_timers, | 1216 | n_rcu_torture_timers); |
1217 | cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", | ||
1203 | n_online_successes, | 1218 | n_online_successes, |
1204 | n_online_attempts, | 1219 | n_online_attempts, |
1205 | n_offline_successes, | 1220 | n_offline_successes, |
1206 | n_offline_attempts, | 1221 | n_offline_attempts); |
1222 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", | ||
1207 | n_barrier_successes, | 1223 | n_barrier_successes, |
1208 | n_barrier_attempts, | 1224 | n_barrier_attempts, |
1209 | n_rcu_torture_barrier_error); | 1225 | n_rcu_torture_barrier_error); |
@@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg) | |||
1445 | delta = shutdown_time - jiffies_snap; | 1461 | delta = shutdown_time - jiffies_snap; |
1446 | if (verbose) | 1462 | if (verbose) |
1447 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1463 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1448 | "rcu_torture_shutdown task: %lu " | 1464 | "rcu_torture_shutdown task: %lu jiffies remaining\n", |
1449 | "jiffies remaining\n", | ||
1450 | torture_type, delta); | 1465 | torture_type, delta); |
1451 | schedule_timeout_interruptible(delta); | 1466 | schedule_timeout_interruptible(delta); |
1452 | jiffies_snap = ACCESS_ONCE(jiffies); | 1467 | jiffies_snap = ACCESS_ONCE(jiffies); |
@@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg) | |||
1498 | if (cpu_down(cpu) == 0) { | 1513 | if (cpu_down(cpu) == 0) { |
1499 | if (verbose) | 1514 | if (verbose) |
1500 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1515 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1501 | "rcu_torture_onoff task: " | 1516 | "rcu_torture_onoff task: offlined %d\n", |
1502 | "offlined %d\n", | ||
1503 | torture_type, cpu); | 1517 | torture_type, cpu); |
1504 | n_offline_successes++; | 1518 | n_offline_successes++; |
1505 | } | 1519 | } |
@@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg) | |||
1512 | if (cpu_up(cpu) == 0) { | 1526 | if (cpu_up(cpu) == 0) { |
1513 | if (verbose) | 1527 | if (verbose) |
1514 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1528 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1515 | "rcu_torture_onoff task: " | 1529 | "rcu_torture_onoff task: onlined %d\n", |
1516 | "onlined %d\n", | ||
1517 | torture_type, cpu); | 1530 | torture_type, cpu); |
1518 | n_online_successes++; | 1531 | n_online_successes++; |
1519 | } | 1532 | } |
@@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu) | |||
1631 | static int rcu_torture_barrier_cbs(void *arg) | 1644 | static int rcu_torture_barrier_cbs(void *arg) |
1632 | { | 1645 | { |
1633 | long myid = (long)arg; | 1646 | long myid = (long)arg; |
1647 | bool lastphase = 0; | ||
1634 | struct rcu_head rcu; | 1648 | struct rcu_head rcu; |
1635 | 1649 | ||
1636 | init_rcu_head_on_stack(&rcu); | 1650 | init_rcu_head_on_stack(&rcu); |
@@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
1638 | set_user_nice(current, 19); | 1652 | set_user_nice(current, 19); |
1639 | do { | 1653 | do { |
1640 | wait_event(barrier_cbs_wq[myid], | 1654 | wait_event(barrier_cbs_wq[myid], |
1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | 1655 | barrier_phase != lastphase || |
1642 | kthread_should_stop() || | 1656 | kthread_should_stop() || |
1643 | fullstop != FULLSTOP_DONTSTOP); | 1657 | fullstop != FULLSTOP_DONTSTOP); |
1658 | lastphase = barrier_phase; | ||
1659 | smp_mb(); /* ensure barrier_phase load before ->call(). */ | ||
1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | 1660 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) |
1645 | break; | 1661 | break; |
1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | 1662 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); |
@@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg) | |||
1665 | do { | 1681 | do { |
1666 | atomic_set(&barrier_cbs_invoked, 0); | 1682 | atomic_set(&barrier_cbs_invoked, 0); |
1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | 1683 | atomic_set(&barrier_cbs_count, n_barrier_cbs); |
1668 | /* wake_up() path contains the required barriers. */ | 1684 | smp_mb(); /* Ensure barrier_phase after prior assignments. */ |
1685 | barrier_phase = !barrier_phase; | ||
1669 | for (i = 0; i < n_barrier_cbs; i++) | 1686 | for (i = 0; i < n_barrier_cbs; i++) |
1670 | wake_up(&barrier_cbs_wq[i]); | 1687 | wake_up(&barrier_cbs_wq[i]); |
1671 | wait_event(barrier_wq, | 1688 | wait_event(barrier_wq, |
@@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg) | |||
1684 | schedule_timeout_interruptible(HZ / 10); | 1701 | schedule_timeout_interruptible(HZ / 10); |
1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 1702 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | 1703 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); |
1687 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | 1704 | rcutorture_shutdown_absorb("rcu_torture_barrier"); |
1688 | while (!kthread_should_stop()) | 1705 | while (!kthread_should_stop()) |
1689 | schedule_timeout_interruptible(1); | 1706 | schedule_timeout_interruptible(1); |
1690 | return 0; | 1707 | return 0; |
@@ -1908,8 +1925,8 @@ rcu_torture_init(void) | |||
1908 | static struct rcu_torture_ops *torture_ops[] = | 1925 | static struct rcu_torture_ops *torture_ops[] = |
1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1926 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1927 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, | 1928 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, |
1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | 1929 | &srcu_raw_ops, &srcu_raw_sync_ops, |
1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1930 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1914 | 1931 | ||
1915 | mutex_lock(&fullstop_mutex); | 1932 | mutex_lock(&fullstop_mutex); |
@@ -1931,8 +1948,7 @@ rcu_torture_init(void) | |||
1931 | return -EINVAL; | 1948 | return -EINVAL; |
1932 | } | 1949 | } |
1933 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | 1950 | if (cur_ops->fqs == NULL && fqs_duration != 0) { |
1934 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " | 1951 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); |
1935 | "fqs_duration, fqs disabled.\n"); | ||
1936 | fqs_duration = 0; | 1952 | fqs_duration = 0; |
1937 | } | 1953 | } |
1938 | if (cur_ops->init) | 1954 | if (cur_ops->init) |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88d92d0..f280e542e3e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -60,36 +60,44 @@ | |||
60 | 60 | ||
61 | /* Data structures. */ | 61 | /* Data structures. */ |
62 | 62 | ||
63 | static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | 63 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
64 | 64 | ||
65 | #define RCU_STATE_INITIALIZER(structname) { \ | 65 | #define RCU_STATE_INITIALIZER(sname, cr) { \ |
66 | .level = { &structname##_state.node[0] }, \ | 66 | .level = { &sname##_state.node[0] }, \ |
67 | .levelcnt = { \ | 67 | .call = cr, \ |
68 | NUM_RCU_LVL_0, /* root of hierarchy. */ \ | ||
69 | NUM_RCU_LVL_1, \ | ||
70 | NUM_RCU_LVL_2, \ | ||
71 | NUM_RCU_LVL_3, \ | ||
72 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ | ||
73 | }, \ | ||
74 | .fqs_state = RCU_GP_IDLE, \ | 68 | .fqs_state = RCU_GP_IDLE, \ |
75 | .gpnum = -300, \ | 69 | .gpnum = -300, \ |
76 | .completed = -300, \ | 70 | .completed = -300, \ |
77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 71 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ |
78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | 72 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | 73 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 74 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
81 | .n_force_qs = 0, \ | 75 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ |
82 | .n_force_qs_ngp = 0, \ | 76 | .name = #sname, \ |
83 | .name = #structname, \ | ||
84 | } | 77 | } |
85 | 78 | ||
86 | struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); | 79 | struct rcu_state rcu_sched_state = |
80 | RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); | ||
87 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | 81 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); |
88 | 82 | ||
89 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); | 83 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); |
90 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 84 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
91 | 85 | ||
92 | static struct rcu_state *rcu_state; | 86 | static struct rcu_state *rcu_state; |
87 | LIST_HEAD(rcu_struct_flavors); | ||
88 | |||
89 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ | ||
90 | static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; | ||
91 | module_param(rcu_fanout_leaf, int, 0); | ||
92 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | ||
93 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ | ||
94 | NUM_RCU_LVL_0, | ||
95 | NUM_RCU_LVL_1, | ||
96 | NUM_RCU_LVL_2, | ||
97 | NUM_RCU_LVL_3, | ||
98 | NUM_RCU_LVL_4, | ||
99 | }; | ||
100 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | ||
93 | 101 | ||
94 | /* | 102 | /* |
95 | * The rcu_scheduler_active variable transitions from zero to one just | 103 | * The rcu_scheduler_active variable transitions from zero to one just |
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
147 | unsigned long rcutorture_testseq; | 155 | unsigned long rcutorture_testseq; |
148 | unsigned long rcutorture_vernum; | 156 | unsigned long rcutorture_vernum; |
149 | 157 | ||
150 | /* State information for rcu_barrier() and friends. */ | ||
151 | |||
152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
153 | static atomic_t rcu_barrier_cpu_count; | ||
154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
155 | static struct completion rcu_barrier_completion; | ||
156 | |||
157 | /* | 158 | /* |
158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 159 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
159 | * permit this function to be invoked without holding the root rcu_node | 160 | * permit this function to be invoked without holding the root rcu_node |
@@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu) | |||
201 | { | 202 | { |
202 | trace_rcu_utilization("Start context switch"); | 203 | trace_rcu_utilization("Start context switch"); |
203 | rcu_sched_qs(cpu); | 204 | rcu_sched_qs(cpu); |
205 | rcu_preempt_note_context_switch(cpu); | ||
204 | trace_rcu_utilization("End context switch"); | 206 | trace_rcu_utilization("End context switch"); |
205 | } | 207 | } |
206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 208 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
@@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
357 | struct task_struct *idle = idle_task(smp_processor_id()); | 359 | struct task_struct *idle = idle_task(smp_processor_id()); |
358 | 360 | ||
359 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 361 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); |
360 | ftrace_dump(DUMP_ALL); | 362 | ftrace_dump(DUMP_ORIG); |
361 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 363 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
362 | current->pid, current->comm, | 364 | current->pid, current->comm, |
363 | idle->pid, idle->comm); /* must be idle task! */ | 365 | idle->pid, idle->comm); /* must be idle task! */ |
@@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
467 | 469 | ||
468 | trace_rcu_dyntick("Error on exit: not idle task", | 470 | trace_rcu_dyntick("Error on exit: not idle task", |
469 | oldval, rdtp->dynticks_nesting); | 471 | oldval, rdtp->dynticks_nesting); |
470 | ftrace_dump(DUMP_ALL); | 472 | ftrace_dump(DUMP_ORIG); |
471 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 473 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
472 | current->pid, current->comm, | 474 | current->pid, current->comm, |
473 | idle->pid, idle->comm); /* must be idle task! */ | 475 | idle->pid, idle->comm); /* must be idle task! */ |
@@ -584,8 +586,6 @@ void rcu_nmi_exit(void) | |||
584 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 586 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
585 | } | 587 | } |
586 | 588 | ||
587 | #ifdef CONFIG_PROVE_RCU | ||
588 | |||
589 | /** | 589 | /** |
590 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle | 590 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle |
591 | * | 591 | * |
@@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void) | |||
603 | } | 603 | } |
604 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 604 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
605 | 605 | ||
606 | #ifdef CONFIG_HOTPLUG_CPU | 606 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
607 | 607 | ||
608 | /* | 608 | /* |
609 | * Is the current CPU online? Disable preemption to avoid false positives | 609 | * Is the current CPU online? Disable preemption to avoid false positives |
@@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
644 | } | 644 | } |
645 | EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | 645 | EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); |
646 | 646 | ||
647 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 647 | #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ |
648 | |||
649 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
650 | 648 | ||
651 | /** | 649 | /** |
652 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle | 650 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle |
@@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
732 | int cpu; | 730 | int cpu; |
733 | long delta; | 731 | long delta; |
734 | unsigned long flags; | 732 | unsigned long flags; |
735 | int ndetected; | 733 | int ndetected = 0; |
736 | struct rcu_node *rnp = rcu_get_root(rsp); | 734 | struct rcu_node *rnp = rcu_get_root(rsp); |
737 | 735 | ||
738 | /* Only let one CPU complain about others per time interval. */ | 736 | /* Only let one CPU complain about others per time interval. */ |
@@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
773 | */ | 771 | */ |
774 | rnp = rcu_get_root(rsp); | 772 | rnp = rcu_get_root(rsp); |
775 | raw_spin_lock_irqsave(&rnp->lock, flags); | 773 | raw_spin_lock_irqsave(&rnp->lock, flags); |
776 | ndetected = rcu_print_task_stall(rnp); | 774 | ndetected += rcu_print_task_stall(rnp); |
777 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
778 | 776 | ||
779 | print_cpu_stall_info_end(); | 777 | print_cpu_stall_info_end(); |
@@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | |||
859 | */ | 857 | */ |
860 | void rcu_cpu_stall_reset(void) | 858 | void rcu_cpu_stall_reset(void) |
861 | { | 859 | { |
862 | rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; | 860 | struct rcu_state *rsp; |
863 | rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; | 861 | |
864 | rcu_preempt_stall_reset(); | 862 | for_each_rcu_flavor(rsp) |
863 | rsp->jiffies_stall = jiffies + ULONG_MAX / 2; | ||
865 | } | 864 | } |
866 | 865 | ||
867 | static struct notifier_block rcu_panic_block = { | 866 | static struct notifier_block rcu_panic_block = { |
@@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct | |||
893 | if (rnp->qsmask & rdp->grpmask) { | 892 | if (rnp->qsmask & rdp->grpmask) { |
894 | rdp->qs_pending = 1; | 893 | rdp->qs_pending = 1; |
895 | rdp->passed_quiesce = 0; | 894 | rdp->passed_quiesce = 0; |
896 | } else | 895 | } else { |
897 | rdp->qs_pending = 0; | 896 | rdp->qs_pending = 0; |
897 | } | ||
898 | zero_cpu_stall_ticks(rdp); | 898 | zero_cpu_stall_ticks(rdp); |
899 | } | 899 | } |
900 | } | 900 | } |
@@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) | |||
936 | } | 936 | } |
937 | 937 | ||
938 | /* | 938 | /* |
939 | * Initialize the specified rcu_data structure's callback list to empty. | ||
940 | */ | ||
941 | static void init_callback_list(struct rcu_data *rdp) | ||
942 | { | ||
943 | int i; | ||
944 | |||
945 | rdp->nxtlist = NULL; | ||
946 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
947 | rdp->nxttail[i] = &rdp->nxtlist; | ||
948 | } | ||
949 | |||
950 | /* | ||
939 | * Advance this CPU's callbacks, but only if the current grace period | 951 | * Advance this CPU's callbacks, but only if the current grace period |
940 | * has ended. This may be called only from the CPU to whom the rdp | 952 | * has ended. This may be called only from the CPU to whom the rdp |
941 | * belongs. In addition, the corresponding leaf rcu_node structure's | 953 | * belongs. In addition, the corresponding leaf rcu_node structure's |
@@ -1327,8 +1339,6 @@ static void | |||
1327 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | 1339 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, |
1328 | struct rcu_node *rnp, struct rcu_data *rdp) | 1340 | struct rcu_node *rnp, struct rcu_data *rdp) |
1329 | { | 1341 | { |
1330 | int i; | ||
1331 | |||
1332 | /* | 1342 | /* |
1333 | * Orphan the callbacks. First adjust the counts. This is safe | 1343 | * Orphan the callbacks. First adjust the counts. This is safe |
1334 | * because ->onofflock excludes _rcu_barrier()'s adoption of | 1344 | * because ->onofflock excludes _rcu_barrier()'s adoption of |
@@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1339 | rsp->qlen += rdp->qlen; | 1349 | rsp->qlen += rdp->qlen; |
1340 | rdp->n_cbs_orphaned += rdp->qlen; | 1350 | rdp->n_cbs_orphaned += rdp->qlen; |
1341 | rdp->qlen_lazy = 0; | 1351 | rdp->qlen_lazy = 0; |
1342 | rdp->qlen = 0; | 1352 | ACCESS_ONCE(rdp->qlen) = 0; |
1343 | } | 1353 | } |
1344 | 1354 | ||
1345 | /* | 1355 | /* |
@@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1368 | } | 1378 | } |
1369 | 1379 | ||
1370 | /* Finally, initialize the rcu_data structure's list to empty. */ | 1380 | /* Finally, initialize the rcu_data structure's list to empty. */ |
1371 | rdp->nxtlist = NULL; | 1381 | init_callback_list(rdp); |
1372 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1373 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1374 | } | 1382 | } |
1375 | 1383 | ||
1376 | /* | 1384 | /* |
@@ -1397,6 +1405,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
1397 | rdp->qlen_lazy += rsp->qlen_lazy; | 1405 | rdp->qlen_lazy += rsp->qlen_lazy; |
1398 | rdp->qlen += rsp->qlen; | 1406 | rdp->qlen += rsp->qlen; |
1399 | rdp->n_cbs_adopted += rsp->qlen; | 1407 | rdp->n_cbs_adopted += rsp->qlen; |
1408 | if (rsp->qlen_lazy != rsp->qlen) | ||
1409 | rcu_idle_count_callbacks_posted(); | ||
1400 | rsp->qlen_lazy = 0; | 1410 | rsp->qlen_lazy = 0; |
1401 | rsp->qlen = 0; | 1411 | rsp->qlen = 0; |
1402 | 1412 | ||
@@ -1502,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1502 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1512 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1503 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1513 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1504 | rcu_report_exp_rnp(rsp, rnp, true); | 1514 | rcu_report_exp_rnp(rsp, rnp, true); |
1515 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | ||
1516 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | ||
1517 | cpu, rdp->qlen, rdp->nxtlist); | ||
1505 | } | 1518 | } |
1506 | 1519 | ||
1507 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1520 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
@@ -1528,7 +1541,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1528 | { | 1541 | { |
1529 | unsigned long flags; | 1542 | unsigned long flags; |
1530 | struct rcu_head *next, *list, **tail; | 1543 | struct rcu_head *next, *list, **tail; |
1531 | int bl, count, count_lazy; | 1544 | int bl, count, count_lazy, i; |
1532 | 1545 | ||
1533 | /* If no callbacks are ready, just return.*/ | 1546 | /* If no callbacks are ready, just return.*/ |
1534 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1547 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
@@ -1551,9 +1564,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1551 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1564 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
1552 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1565 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1553 | tail = rdp->nxttail[RCU_DONE_TAIL]; | 1566 | tail = rdp->nxttail[RCU_DONE_TAIL]; |
1554 | for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) | 1567 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) |
1555 | if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) | 1568 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) |
1556 | rdp->nxttail[count] = &rdp->nxtlist; | 1569 | rdp->nxttail[i] = &rdp->nxtlist; |
1557 | local_irq_restore(flags); | 1570 | local_irq_restore(flags); |
1558 | 1571 | ||
1559 | /* Invoke callbacks. */ | 1572 | /* Invoke callbacks. */ |
@@ -1581,15 +1594,15 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1581 | if (list != NULL) { | 1594 | if (list != NULL) { |
1582 | *tail = rdp->nxtlist; | 1595 | *tail = rdp->nxtlist; |
1583 | rdp->nxtlist = list; | 1596 | rdp->nxtlist = list; |
1584 | for (count = 0; count < RCU_NEXT_SIZE; count++) | 1597 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1585 | if (&rdp->nxtlist == rdp->nxttail[count]) | 1598 | if (&rdp->nxtlist == rdp->nxttail[i]) |
1586 | rdp->nxttail[count] = tail; | 1599 | rdp->nxttail[i] = tail; |
1587 | else | 1600 | else |
1588 | break; | 1601 | break; |
1589 | } | 1602 | } |
1590 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 1603 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
1591 | rdp->qlen_lazy -= count_lazy; | 1604 | rdp->qlen_lazy -= count_lazy; |
1592 | rdp->qlen -= count; | 1605 | ACCESS_ONCE(rdp->qlen) -= count; |
1593 | rdp->n_cbs_invoked += count; | 1606 | rdp->n_cbs_invoked += count; |
1594 | 1607 | ||
1595 | /* Reinstate batch limit if we have worked down the excess. */ | 1608 | /* Reinstate batch limit if we have worked down the excess. */ |
@@ -1602,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1602 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1615 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1603 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) | 1616 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) |
1604 | rdp->qlen_last_fqs_check = rdp->qlen; | 1617 | rdp->qlen_last_fqs_check = rdp->qlen; |
1618 | WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); | ||
1605 | 1619 | ||
1606 | local_irq_restore(flags); | 1620 | local_irq_restore(flags); |
1607 | 1621 | ||
@@ -1742,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1742 | break; /* grace period idle or initializing, ignore. */ | 1756 | break; /* grace period idle or initializing, ignore. */ |
1743 | 1757 | ||
1744 | case RCU_SAVE_DYNTICK: | 1758 | case RCU_SAVE_DYNTICK: |
1745 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) | ||
1746 | break; /* So gcc recognizes the dead code. */ | ||
1747 | 1759 | ||
1748 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 1760 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
1749 | 1761 | ||
@@ -1785,9 +1797,10 @@ unlock_fqs_ret: | |||
1785 | * whom the rdp belongs. | 1797 | * whom the rdp belongs. |
1786 | */ | 1798 | */ |
1787 | static void | 1799 | static void |
1788 | __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 1800 | __rcu_process_callbacks(struct rcu_state *rsp) |
1789 | { | 1801 | { |
1790 | unsigned long flags; | 1802 | unsigned long flags; |
1803 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
1791 | 1804 | ||
1792 | WARN_ON_ONCE(rdp->beenonline == 0); | 1805 | WARN_ON_ONCE(rdp->beenonline == 0); |
1793 | 1806 | ||
@@ -1823,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1823 | */ | 1836 | */ |
1824 | static void rcu_process_callbacks(struct softirq_action *unused) | 1837 | static void rcu_process_callbacks(struct softirq_action *unused) |
1825 | { | 1838 | { |
1839 | struct rcu_state *rsp; | ||
1840 | |||
1826 | trace_rcu_utilization("Start RCU core"); | 1841 | trace_rcu_utilization("Start RCU core"); |
1827 | __rcu_process_callbacks(&rcu_sched_state, | 1842 | for_each_rcu_flavor(rsp) |
1828 | &__get_cpu_var(rcu_sched_data)); | 1843 | __rcu_process_callbacks(rsp); |
1829 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | ||
1830 | rcu_preempt_process_callbacks(); | ||
1831 | trace_rcu_utilization("End RCU core"); | 1844 | trace_rcu_utilization("End RCU core"); |
1832 | } | 1845 | } |
1833 | 1846 | ||
@@ -1854,6 +1867,56 @@ static void invoke_rcu_core(void) | |||
1854 | raise_softirq(RCU_SOFTIRQ); | 1867 | raise_softirq(RCU_SOFTIRQ); |
1855 | } | 1868 | } |
1856 | 1869 | ||
1870 | /* | ||
1871 | * Handle any core-RCU processing required by a call_rcu() invocation. | ||
1872 | */ | ||
1873 | static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | ||
1874 | struct rcu_head *head, unsigned long flags) | ||
1875 | { | ||
1876 | /* | ||
1877 | * If called from an extended quiescent state, invoke the RCU | ||
1878 | * core in order to force a re-evaluation of RCU's idleness. | ||
1879 | */ | ||
1880 | if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) | ||
1881 | invoke_rcu_core(); | ||
1882 | |||
1883 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | ||
1884 | if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) | ||
1885 | return; | ||
1886 | |||
1887 | /* | ||
1888 | * Force the grace period if too many callbacks or too long waiting. | ||
1889 | * Enforce hysteresis, and don't invoke force_quiescent_state() | ||
1890 | * if some other CPU has recently done so. Also, don't bother | ||
1891 | * invoking force_quiescent_state() if the newly enqueued callback | ||
1892 | * is the only one waiting for a grace period to complete. | ||
1893 | */ | ||
1894 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | ||
1895 | |||
1896 | /* Are we ignoring a completed grace period? */ | ||
1897 | rcu_process_gp_end(rsp, rdp); | ||
1898 | check_for_new_grace_period(rsp, rdp); | ||
1899 | |||
1900 | /* Start a new grace period if one not already started. */ | ||
1901 | if (!rcu_gp_in_progress(rsp)) { | ||
1902 | unsigned long nestflag; | ||
1903 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
1904 | |||
1905 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1906 | rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ | ||
1907 | } else { | ||
1908 | /* Give the grace period a kick. */ | ||
1909 | rdp->blimit = LONG_MAX; | ||
1910 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
1911 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
1912 | force_quiescent_state(rsp, 0); | ||
1913 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
1914 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
1915 | } | ||
1916 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | ||
1917 | force_quiescent_state(rsp, 1); | ||
1918 | } | ||
1919 | |||
1857 | static void | 1920 | static void |
1858 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 1921 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
1859 | struct rcu_state *rsp, bool lazy) | 1922 | struct rcu_state *rsp, bool lazy) |
@@ -1878,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1878 | rdp = this_cpu_ptr(rsp->rda); | 1941 | rdp = this_cpu_ptr(rsp->rda); |
1879 | 1942 | ||
1880 | /* Add the callback to our list. */ | 1943 | /* Add the callback to our list. */ |
1881 | rdp->qlen++; | 1944 | ACCESS_ONCE(rdp->qlen)++; |
1882 | if (lazy) | 1945 | if (lazy) |
1883 | rdp->qlen_lazy++; | 1946 | rdp->qlen_lazy++; |
1884 | else | 1947 | else |
@@ -1893,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1893 | else | 1956 | else |
1894 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); | 1957 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); |
1895 | 1958 | ||
1896 | /* If interrupts were disabled, don't dive into RCU core. */ | 1959 | /* Go handle any RCU core processing required. */ |
1897 | if (irqs_disabled_flags(flags)) { | 1960 | __call_rcu_core(rsp, rdp, head, flags); |
1898 | local_irq_restore(flags); | ||
1899 | return; | ||
1900 | } | ||
1901 | |||
1902 | /* | ||
1903 | * Force the grace period if too many callbacks or too long waiting. | ||
1904 | * Enforce hysteresis, and don't invoke force_quiescent_state() | ||
1905 | * if some other CPU has recently done so. Also, don't bother | ||
1906 | * invoking force_quiescent_state() if the newly enqueued callback | ||
1907 | * is the only one waiting for a grace period to complete. | ||
1908 | */ | ||
1909 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | ||
1910 | |||
1911 | /* Are we ignoring a completed grace period? */ | ||
1912 | rcu_process_gp_end(rsp, rdp); | ||
1913 | check_for_new_grace_period(rsp, rdp); | ||
1914 | |||
1915 | /* Start a new grace period if one not already started. */ | ||
1916 | if (!rcu_gp_in_progress(rsp)) { | ||
1917 | unsigned long nestflag; | ||
1918 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
1919 | |||
1920 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1921 | rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ | ||
1922 | } else { | ||
1923 | /* Give the grace period a kick. */ | ||
1924 | rdp->blimit = LONG_MAX; | ||
1925 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
1926 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
1927 | force_quiescent_state(rsp, 0); | ||
1928 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
1929 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
1930 | } | ||
1931 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | ||
1932 | force_quiescent_state(rsp, 1); | ||
1933 | local_irq_restore(flags); | 1961 | local_irq_restore(flags); |
1934 | } | 1962 | } |
1935 | 1963 | ||
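The ->qlen updates above gain ACCESS_ONCE() because _rcu_barrier() now samples ->qlen on other CPUs without holding the owning CPU's protection; the annotation does not make the increment atomic, it only keeps the compiler from caching, fusing, or refetching the access, which is all the remote poller needs. Below is a user-space approximation of the idiom as I read the change — ACCESS_ONCE() itself comes from <linux/compiler.h>, and the struct and function names here are invented for illustration.

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

struct demo_rcu_data {
	unsigned long qlen;		/* callbacks queued on this CPU */
};

/* Writer side: what the annotated __call_rcu() enqueue path boils down to. */
static void demo_enqueue(struct demo_rcu_data *rdp)
{
	ACCESS_ONCE(rdp->qlen)++;	/* compiler must emit exactly this load and store */
}

/* Reader side: how _rcu_barrier() polls a remote CPU's queue length. */
static int demo_cpu_has_callbacks(struct demo_rcu_data *rdp)
{
	return ACCESS_ONCE(rdp->qlen) != 0;	/* fresh load on every call, never a cached value */
}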
@@ -1959,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
1959 | * occasionally incorrectly indicate that there are multiple CPUs online | 1987 | * occasionally incorrectly indicate that there are multiple CPUs online |
1960 | * when there was in fact only one the whole time, as this just adds | 1988 | * when there was in fact only one the whole time, as this just adds |
1961 | * some overhead: RCU still operates correctly. | 1989 | * some overhead: RCU still operates correctly. |
1962 | * | ||
1963 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
1964 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
1965 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
1966 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
1967 | * all times, but num_online_cpus() might well return one (or even zero). | ||
1968 | * | ||
1969 | * However, all such demonic sequences require at least one CPU-offline | ||
1970 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
1971 | * is only a problem if there is an RCU read-side critical section executing | ||
1972 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
1973 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
1974 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
1975 | * that there is only one CPU when in fact there was more than one throughout | ||
1976 | * is when there were no RCU readers in the system. If there are no | ||
1977 | * RCU readers, the grace period by definition can be of zero length, | ||
1978 | * regardless of the number of online CPUs. | ||
1979 | */ | 1990 | */ |
1980 | static inline int rcu_blocking_is_gp(void) | 1991 | static inline int rcu_blocking_is_gp(void) |
1981 | { | 1992 | { |
1993 | int ret; | ||
1994 | |||
1982 | might_sleep(); /* Check for RCU read-side critical section. */ | 1995 | might_sleep(); /* Check for RCU read-side critical section. */ |
1983 | return num_online_cpus() <= 1; | 1996 | preempt_disable(); |
1997 | ret = num_online_cpus() <= 1; | ||
1998 | preempt_enable(); | ||
1999 | return ret; | ||
1984 | } | 2000 | } |
1985 | 2001 | ||
1986 | /** | 2002 | /** |
@@ -2115,9 +2131,9 @@ void synchronize_sched_expedited(void) | |||
2115 | put_online_cpus(); | 2131 | put_online_cpus(); |
2116 | 2132 | ||
2117 | /* No joy, try again later. Or just synchronize_sched(). */ | 2133 | /* No joy, try again later. Or just synchronize_sched(). */ |
2118 | if (trycount++ < 10) | 2134 | if (trycount++ < 10) { |
2119 | udelay(trycount * num_online_cpus()); | 2135 | udelay(trycount * num_online_cpus()); |
2120 | else { | 2136 | } else { |
2121 | synchronize_sched(); | 2137 | synchronize_sched(); |
2122 | return; | 2138 | return; |
2123 | } | 2139 | } |
@@ -2238,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2238 | */ | 2254 | */ |
2239 | static int rcu_pending(int cpu) | 2255 | static int rcu_pending(int cpu) |
2240 | { | 2256 | { |
2241 | return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || | 2257 | struct rcu_state *rsp; |
2242 | __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || | 2258 | |
2243 | rcu_preempt_pending(cpu); | 2259 | for_each_rcu_flavor(rsp) |
2260 | if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) | ||
2261 | return 1; | ||
2262 | return 0; | ||
2244 | } | 2263 | } |
2245 | 2264 | ||
2246 | /* | 2265 | /* |
@@ -2250,20 +2269,41 @@ static int rcu_pending(int cpu) | |||
2250 | */ | 2269 | */ |
2251 | static int rcu_cpu_has_callbacks(int cpu) | 2270 | static int rcu_cpu_has_callbacks(int cpu) |
2252 | { | 2271 | { |
2272 | struct rcu_state *rsp; | ||
2273 | |||
2253 | /* RCU callbacks either ready or pending? */ | 2274 | /* RCU callbacks either ready or pending? */ |
2254 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 2275 | for_each_rcu_flavor(rsp) |
2255 | per_cpu(rcu_bh_data, cpu).nxtlist || | 2276 | if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) |
2256 | rcu_preempt_cpu_has_callbacks(cpu); | 2277 | return 1; |
2278 | return 0; | ||
2279 | } | ||
2280 | |||
2281 | /* | ||
2282 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, | ||
2283 | * the compiler is expected to optimize this away. | ||
2284 | */ | ||
2285 | static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, | ||
2286 | int cpu, unsigned long done) | ||
2287 | { | ||
2288 | trace_rcu_barrier(rsp->name, s, cpu, | ||
2289 | atomic_read(&rsp->barrier_cpu_count), done); | ||
2257 | } | 2290 | } |
2258 | 2291 | ||
2259 | /* | 2292 | /* |
2260 | * RCU callback function for _rcu_barrier(). If we are last, wake | 2293 | * RCU callback function for _rcu_barrier(). If we are last, wake |
2261 | * up the task executing _rcu_barrier(). | 2294 | * up the task executing _rcu_barrier(). |
2262 | */ | 2295 | */ |
2263 | static void rcu_barrier_callback(struct rcu_head *notused) | 2296 | static void rcu_barrier_callback(struct rcu_head *rhp) |
2264 | { | 2297 | { |
2265 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2298 | struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); |
2266 | complete(&rcu_barrier_completion); | 2299 | struct rcu_state *rsp = rdp->rsp; |
2300 | |||
2301 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { | ||
2302 | _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); | ||
2303 | complete(&rsp->barrier_completion); | ||
2304 | } else { | ||
2305 | _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); | ||
2306 | } | ||
2267 | } | 2307 | } |
2268 | 2308 | ||
2269 | /* | 2309 | /* |
@@ -2271,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused) | |||
2271 | */ | 2311 | */ |
2272 | static void rcu_barrier_func(void *type) | 2312 | static void rcu_barrier_func(void *type) |
2273 | { | 2313 | { |
2274 | int cpu = smp_processor_id(); | 2314 | struct rcu_state *rsp = type; |
2275 | struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); | 2315 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); |
2276 | void (*call_rcu_func)(struct rcu_head *head, | ||
2277 | void (*func)(struct rcu_head *head)); | ||
2278 | 2316 | ||
2279 | atomic_inc(&rcu_barrier_cpu_count); | 2317 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); |
2280 | call_rcu_func = type; | 2318 | atomic_inc(&rsp->barrier_cpu_count); |
2281 | call_rcu_func(head, rcu_barrier_callback); | 2319 | rsp->call(&rdp->barrier_head, rcu_barrier_callback); |
2282 | } | 2320 | } |
2283 | 2321 | ||
2284 | /* | 2322 | /* |
2285 | * Orchestrate the specified type of RCU barrier, waiting for all | 2323 | * Orchestrate the specified type of RCU barrier, waiting for all |
2286 | * RCU callbacks of the specified type to complete. | 2324 | * RCU callbacks of the specified type to complete. |
2287 | */ | 2325 | */ |
2288 | static void _rcu_barrier(struct rcu_state *rsp, | 2326 | static void _rcu_barrier(struct rcu_state *rsp) |
2289 | void (*call_rcu_func)(struct rcu_head *head, | ||
2290 | void (*func)(struct rcu_head *head))) | ||
2291 | { | 2327 | { |
2292 | int cpu; | 2328 | int cpu; |
2293 | unsigned long flags; | 2329 | unsigned long flags; |
2294 | struct rcu_data *rdp; | 2330 | struct rcu_data *rdp; |
2295 | struct rcu_head rh; | 2331 | struct rcu_data rd; |
2332 | unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); | ||
2333 | unsigned long snap_done; | ||
2296 | 2334 | ||
2297 | init_rcu_head_on_stack(&rh); | 2335 | init_rcu_head_on_stack(&rd.barrier_head); |
2336 | _rcu_barrier_trace(rsp, "Begin", -1, snap); | ||
2298 | 2337 | ||
2299 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2338 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
2300 | mutex_lock(&rcu_barrier_mutex); | 2339 | mutex_lock(&rsp->barrier_mutex); |
2340 | |||
2341 | /* | ||
2342 | * Ensure that all prior references, including to ->n_barrier_done, | ||
2343 | * are ordered before the _rcu_barrier() machinery. | ||
2344 | */ | ||
2345 | smp_mb(); /* See above block comment. */ | ||
2346 | |||
2347 | /* | ||
2348 | * Recheck ->n_barrier_done to see if others did our work for us. | ||
2349 | * This means checking ->n_barrier_done for an even-to-odd-to-even | ||
2350 | * transition. The "if" expression below therefore rounds the old | ||
2351 | * value up to the next even number and adds two before comparing. | ||
2352 | */ | ||
2353 | snap_done = ACCESS_ONCE(rsp->n_barrier_done); | ||
2354 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | ||
2355 | if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { | ||
2356 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | ||
2357 | smp_mb(); /* caller's subsequent code after above check. */ | ||
2358 | mutex_unlock(&rsp->barrier_mutex); | ||
2359 | return; | ||
2360 | } | ||
2301 | 2361 | ||
2302 | smp_mb(); /* Prevent any prior operations from leaking in. */ | 2362 | /* |
2363 | * Increment ->n_barrier_done to avoid duplicate work. Use | ||
2364 | * ACCESS_ONCE() to prevent the compiler from speculating | ||
2365 | * the increment to precede the early-exit check. | ||
2366 | */ | ||
2367 | ACCESS_ONCE(rsp->n_barrier_done)++; | ||
2368 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); | ||
2369 | _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); | ||
2370 | smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ | ||
2303 | 2371 | ||
2304 | /* | 2372 | /* |
2305 | * Initialize the count to one rather than to zero in order to | 2373 | * Initialize the count to one rather than to zero in order to |
@@ -2318,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2318 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | 2386 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening |
2319 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | 2387 | * us -- but before CPU 1's orphaned callbacks are invoked!!! |
2320 | */ | 2388 | */ |
2321 | init_completion(&rcu_barrier_completion); | 2389 | init_completion(&rsp->barrier_completion); |
2322 | atomic_set(&rcu_barrier_cpu_count, 1); | 2390 | atomic_set(&rsp->barrier_cpu_count, 1); |
2323 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 2391 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
2324 | rsp->rcu_barrier_in_progress = current; | 2392 | rsp->rcu_barrier_in_progress = current; |
2325 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 2393 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
@@ -2335,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2335 | preempt_disable(); | 2403 | preempt_disable(); |
2336 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2404 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2337 | if (cpu_is_offline(cpu)) { | 2405 | if (cpu_is_offline(cpu)) { |
2406 | _rcu_barrier_trace(rsp, "Offline", cpu, | ||
2407 | rsp->n_barrier_done); | ||
2338 | preempt_enable(); | 2408 | preempt_enable(); |
2339 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | 2409 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) |
2340 | schedule_timeout_interruptible(1); | 2410 | schedule_timeout_interruptible(1); |
2341 | } else if (ACCESS_ONCE(rdp->qlen)) { | 2411 | } else if (ACCESS_ONCE(rdp->qlen)) { |
2342 | smp_call_function_single(cpu, rcu_barrier_func, | 2412 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
2343 | (void *)call_rcu_func, 1); | 2413 | rsp->n_barrier_done); |
2414 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | ||
2344 | preempt_enable(); | 2415 | preempt_enable(); |
2345 | } else { | 2416 | } else { |
2417 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, | ||
2418 | rsp->n_barrier_done); | ||
2346 | preempt_enable(); | 2419 | preempt_enable(); |
2347 | } | 2420 | } |
2348 | } | 2421 | } |
@@ -2359,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2359 | rcu_adopt_orphan_cbs(rsp); | 2432 | rcu_adopt_orphan_cbs(rsp); |
2360 | rsp->rcu_barrier_in_progress = NULL; | 2433 | rsp->rcu_barrier_in_progress = NULL; |
2361 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 2434 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
2362 | atomic_inc(&rcu_barrier_cpu_count); | 2435 | atomic_inc(&rsp->barrier_cpu_count); |
2363 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | 2436 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ |
2364 | call_rcu_func(&rh, rcu_barrier_callback); | 2437 | rd.rsp = rsp; |
2438 | rsp->call(&rd.barrier_head, rcu_barrier_callback); | ||
2365 | 2439 | ||
2366 | /* | 2440 | /* |
2367 | * Now that we have an rcu_barrier_callback() callback on each | 2441 | * Now that we have an rcu_barrier_callback() callback on each |
2368 | * CPU, and thus each counted, remove the initial count. | 2442 | * CPU, and thus each counted, remove the initial count. |
2369 | */ | 2443 | */ |
2370 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2444 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) |
2371 | complete(&rcu_barrier_completion); | 2445 | complete(&rsp->barrier_completion); |
2446 | |||
2447 | /* Increment ->n_barrier_done to prevent duplicate work. */ | ||
2448 | smp_mb(); /* Keep increment after above mechanism. */ | ||
2449 | ACCESS_ONCE(rsp->n_barrier_done)++; | ||
2450 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); | ||
2451 | _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); | ||
2452 | smp_mb(); /* Keep increment before caller's subsequent code. */ | ||
2372 | 2453 | ||
2373 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | 2454 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ |
2374 | wait_for_completion(&rcu_barrier_completion); | 2455 | wait_for_completion(&rsp->barrier_completion); |
2375 | 2456 | ||
2376 | /* Other rcu_barrier() invocations can now safely proceed. */ | 2457 | /* Other rcu_barrier() invocations can now safely proceed. */ |
2377 | mutex_unlock(&rcu_barrier_mutex); | 2458 | mutex_unlock(&rsp->barrier_mutex); |
2378 | 2459 | ||
2379 | destroy_rcu_head_on_stack(&rh); | 2460 | destroy_rcu_head_on_stack(&rd.barrier_head); |
2380 | } | 2461 | } |
2381 | 2462 | ||
2382 | /** | 2463 | /** |
@@ -2384,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2384 | */ | 2465 | */ |
2385 | void rcu_barrier_bh(void) | 2466 | void rcu_barrier_bh(void) |
2386 | { | 2467 | { |
2387 | _rcu_barrier(&rcu_bh_state, call_rcu_bh); | 2468 | _rcu_barrier(&rcu_bh_state); |
2388 | } | 2469 | } |
2389 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); | 2470 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); |
2390 | 2471 | ||
@@ -2393,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); | |||
2393 | */ | 2474 | */ |
2394 | void rcu_barrier_sched(void) | 2475 | void rcu_barrier_sched(void) |
2395 | { | 2476 | { |
2396 | _rcu_barrier(&rcu_sched_state, call_rcu_sched); | 2477 | _rcu_barrier(&rcu_sched_state); |
2397 | } | 2478 | } |
2398 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 2479 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
2399 | 2480 | ||
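The per-flavor _rcu_barrier() rewrite above moves the mutex, completion, and CPU count into each rcu_state and adds the ->n_barrier_done counter: it is bumped to an odd value when a barrier begins and back to an even value when it ends, so a caller that snapshots the counter before taking ->barrier_mutex can detect that some other task already ran a complete barrier on its behalf and return early. A standalone rendering of that early-exit test, with ULONG_CMP_GE() expanded by hand — the helper name and the worked numbers are mine, for illustration only:

static int barrier_done_since_snap(unsigned long snap_done, unsigned long snap)
{
	/* Round the snapshot up to even, then require one further full begin/end cycle. */
	unsigned long needed = ((snap + 1) & ~0x1UL) + 2;

	/* Wrap-safe "snap_done >= needed", roughly what ULONG_CMP_GE() computes. */
	return (long)(snap_done - needed) >= 0;
}

/*
 * Example: snap == 3 means a barrier was already in flight when we looked,
 * so needed == 6: that barrier must finish (counter reaches 4) and a complete
 * later barrier must run (5, then 6) before this caller may skip its own.
 */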
@@ -2404,18 +2485,15 @@ static void __init | |||
2404 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | 2485 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) |
2405 | { | 2486 | { |
2406 | unsigned long flags; | 2487 | unsigned long flags; |
2407 | int i; | ||
2408 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2488 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
2409 | struct rcu_node *rnp = rcu_get_root(rsp); | 2489 | struct rcu_node *rnp = rcu_get_root(rsp); |
2410 | 2490 | ||
2411 | /* Set up local state, ensuring consistent view of global state. */ | 2491 | /* Set up local state, ensuring consistent view of global state. */ |
2412 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2492 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2413 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 2493 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
2414 | rdp->nxtlist = NULL; | 2494 | init_callback_list(rdp); |
2415 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
2416 | rdp->nxttail[i] = &rdp->nxtlist; | ||
2417 | rdp->qlen_lazy = 0; | 2495 | rdp->qlen_lazy = 0; |
2418 | rdp->qlen = 0; | 2496 | ACCESS_ONCE(rdp->qlen) = 0; |
2419 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2497 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
2420 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 2498 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
2421 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 2499 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
@@ -2489,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2489 | 2567 | ||
2490 | static void __cpuinit rcu_prepare_cpu(int cpu) | 2568 | static void __cpuinit rcu_prepare_cpu(int cpu) |
2491 | { | 2569 | { |
2492 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); | 2570 | struct rcu_state *rsp; |
2493 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); | 2571 | |
2494 | rcu_preempt_init_percpu_data(cpu); | 2572 | for_each_rcu_flavor(rsp) |
2573 | rcu_init_percpu_data(cpu, rsp, | ||
2574 | strcmp(rsp->name, "rcu_preempt") == 0); | ||
2495 | } | 2575 | } |
2496 | 2576 | ||
2497 | /* | 2577 | /* |
@@ -2503,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2503 | long cpu = (long)hcpu; | 2583 | long cpu = (long)hcpu; |
2504 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 2584 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
2505 | struct rcu_node *rnp = rdp->mynode; | 2585 | struct rcu_node *rnp = rdp->mynode; |
2586 | struct rcu_state *rsp; | ||
2506 | 2587 | ||
2507 | trace_rcu_utilization("Start CPU hotplug"); | 2588 | trace_rcu_utilization("Start CPU hotplug"); |
2508 | switch (action) { | 2589 | switch (action) { |
@@ -2527,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2527 | * touch any data without introducing corruption. We send the | 2608 | * touch any data without introducing corruption. We send the |
2528 | * dying CPU's callbacks to an arbitrarily chosen online CPU. | 2609 | * dying CPU's callbacks to an arbitrarily chosen online CPU. |
2529 | */ | 2610 | */ |
2530 | rcu_cleanup_dying_cpu(&rcu_bh_state); | 2611 | for_each_rcu_flavor(rsp) |
2531 | rcu_cleanup_dying_cpu(&rcu_sched_state); | 2612 | rcu_cleanup_dying_cpu(rsp); |
2532 | rcu_preempt_cleanup_dying_cpu(); | ||
2533 | rcu_cleanup_after_idle(cpu); | 2613 | rcu_cleanup_after_idle(cpu); |
2534 | break; | 2614 | break; |
2535 | case CPU_DEAD: | 2615 | case CPU_DEAD: |
2536 | case CPU_DEAD_FROZEN: | 2616 | case CPU_DEAD_FROZEN: |
2537 | case CPU_UP_CANCELED: | 2617 | case CPU_UP_CANCELED: |
2538 | case CPU_UP_CANCELED_FROZEN: | 2618 | case CPU_UP_CANCELED_FROZEN: |
2539 | rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); | 2619 | for_each_rcu_flavor(rsp) |
2540 | rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); | 2620 | rcu_cleanup_dead_cpu(cpu, rsp); |
2541 | rcu_preempt_cleanup_dead_cpu(cpu); | ||
2542 | break; | 2621 | break; |
2543 | default: | 2622 | default: |
2544 | break; | 2623 | break; |
@@ -2571,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2571 | { | 2650 | { |
2572 | int i; | 2651 | int i; |
2573 | 2652 | ||
2574 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2653 | for (i = rcu_num_lvls - 1; i > 0; i--) |
2575 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2654 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
2576 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; | 2655 | rsp->levelspread[0] = rcu_fanout_leaf; |
2577 | } | 2656 | } |
2578 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2657 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
2579 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2658 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
@@ -2583,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2583 | int i; | 2662 | int i; |
2584 | 2663 | ||
2585 | cprv = NR_CPUS; | 2664 | cprv = NR_CPUS; |
2586 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { | 2665 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
2587 | ccur = rsp->levelcnt[i]; | 2666 | ccur = rsp->levelcnt[i]; |
2588 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 2667 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; |
2589 | cprv = ccur; | 2668 | cprv = ccur; |
@@ -2610,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
2610 | 2689 | ||
2611 | /* Initialize the level-tracking arrays. */ | 2690 | /* Initialize the level-tracking arrays. */ |
2612 | 2691 | ||
2613 | for (i = 1; i < NUM_RCU_LVLS; i++) | 2692 | for (i = 0; i < rcu_num_lvls; i++) |
2693 | rsp->levelcnt[i] = num_rcu_lvl[i]; | ||
2694 | for (i = 1; i < rcu_num_lvls; i++) | ||
2614 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; | 2695 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; |
2615 | rcu_init_levelspread(rsp); | 2696 | rcu_init_levelspread(rsp); |
2616 | 2697 | ||
2617 | /* Initialize the elements themselves, starting from the leaves. */ | 2698 | /* Initialize the elements themselves, starting from the leaves. */ |
2618 | 2699 | ||
2619 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { | 2700 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
2620 | cpustride *= rsp->levelspread[i]; | 2701 | cpustride *= rsp->levelspread[i]; |
2621 | rnp = rsp->level[i]; | 2702 | rnp = rsp->level[i]; |
2622 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | 2703 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { |
@@ -2646,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
2646 | } | 2727 | } |
2647 | 2728 | ||
2648 | rsp->rda = rda; | 2729 | rsp->rda = rda; |
2649 | rnp = rsp->level[NUM_RCU_LVLS - 1]; | 2730 | rnp = rsp->level[rcu_num_lvls - 1]; |
2650 | for_each_possible_cpu(i) { | 2731 | for_each_possible_cpu(i) { |
2651 | while (i > rnp->grphi) | 2732 | while (i > rnp->grphi) |
2652 | rnp++; | 2733 | rnp++; |
2653 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; | 2734 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; |
2654 | rcu_boot_init_percpu_data(i, rsp); | 2735 | rcu_boot_init_percpu_data(i, rsp); |
2655 | } | 2736 | } |
2737 | list_add(&rsp->flavors, &rcu_struct_flavors); | ||
2738 | } | ||
2739 | |||
2740 | /* | ||
2741 | * Compute the rcu_node tree geometry from kernel parameters. This cannot | ||
2742 | * replace the definitions in rcutree.h because those are needed to size | ||
2743 | * the ->node array in the rcu_state structure. | ||
2744 | */ | ||
2745 | static void __init rcu_init_geometry(void) | ||
2746 | { | ||
2747 | int i; | ||
2748 | int j; | ||
2749 | int n = nr_cpu_ids; | ||
2750 | int rcu_capacity[MAX_RCU_LVLS + 1]; | ||
2751 | |||
2752 | /* If the compile-time values are accurate, just leave. */ | ||
2753 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) | ||
2754 | return; | ||
2755 | |||
2756 | /* | ||
2757 | * Compute number of nodes that can be handled an rcu_node tree | ||
2758 | * with the given number of levels. Setting rcu_capacity[0] makes | ||
2759 | * some of the arithmetic easier. | ||
2760 | */ | ||
2761 | rcu_capacity[0] = 1; | ||
2762 | rcu_capacity[1] = rcu_fanout_leaf; | ||
2763 | for (i = 2; i <= MAX_RCU_LVLS; i++) | ||
2764 | rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; | ||
2765 | |||
2766 | /* | ||
2767 | * The boot-time rcu_fanout_leaf parameter is only permitted | ||
2768 | * to increase the leaf-level fanout, not decrease it. Of course, | ||
2769 | * the leaf-level fanout cannot exceed the number of bits in | ||
2770 | * the rcu_node masks. Finally, the tree must be able to accommodate | ||
2771 | * the configured number of CPUs. Complain and fall back to the | ||
2772 | * compile-time values if these limits are exceeded. | ||
2773 | */ | ||
2774 | if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || | ||
2775 | rcu_fanout_leaf > sizeof(unsigned long) * 8 || | ||
2776 | n > rcu_capacity[MAX_RCU_LVLS]) { | ||
2777 | WARN_ON(1); | ||
2778 | return; | ||
2779 | } | ||
2780 | |||
2781 | /* Calculate the number of rcu_nodes at each level of the tree. */ | ||
2782 | for (i = 1; i <= MAX_RCU_LVLS; i++) | ||
2783 | if (n <= rcu_capacity[i]) { | ||
2784 | for (j = 0; j <= i; j++) | ||
2785 | num_rcu_lvl[j] = | ||
2786 | DIV_ROUND_UP(n, rcu_capacity[i - j]); | ||
2787 | rcu_num_lvls = i; | ||
2788 | for (j = i + 1; j <= MAX_RCU_LVLS; j++) | ||
2789 | num_rcu_lvl[j] = 0; | ||
2790 | break; | ||
2791 | } | ||
2792 | |||
2793 | /* Calculate the total number of rcu_node structures. */ | ||
2794 | rcu_num_nodes = 0; | ||
2795 | for (i = 0; i <= MAX_RCU_LVLS; i++) | ||
2796 | rcu_num_nodes += num_rcu_lvl[i]; | ||
2797 | rcu_num_nodes -= n; | ||
2656 | } | 2798 | } |
2657 | 2799 | ||
2658 | void __init rcu_init(void) | 2800 | void __init rcu_init(void) |
@@ -2660,6 +2802,7 @@ void __init rcu_init(void) | |||
2660 | int cpu; | 2802 | int cpu; |
2661 | 2803 | ||
2662 | rcu_bootup_announce(); | 2804 | rcu_bootup_announce(); |
2805 | rcu_init_geometry(); | ||
2663 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 2806 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
2664 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 2807 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
2665 | __rcu_init_preempt(); | 2808 | __rcu_init_preempt(); |
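The new rcu_init_geometry() lets the boot-time rcu_fanout_leaf parameter widen the leaf-level fanout and then re-derives the number of tree levels and rcu_node structures at run time, since the compile-time NUM_RCU_* values no longer apply. The program below reruns the same arithmetic outside the kernel so it can be sanity-checked in isolation; the constants and the 64-CPU example are illustrative, not a real configuration.

#include <stdio.h>

#define MAX_LVLS 4
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static void compute_geometry(int n_cpus, int fanout_leaf, int fanout)
{
	unsigned long cap[MAX_LVLS + 1];
	int num_lvl[MAX_LVLS + 1] = { 0 };
	int levels = 0, nodes = 0, i, j;

	/* Capacity of an i-level tree whose leaves each handle fanout_leaf CPUs. */
	cap[0] = 1;
	cap[1] = fanout_leaf;
	for (i = 2; i <= MAX_LVLS; i++)
		cap[i] = cap[i - 1] * fanout;

	/* Pick the shallowest tree that covers n_cpus, as rcu_init_geometry() does. */
	for (i = 1; i <= MAX_LVLS; i++)
		if (n_cpus <= cap[i]) {
			for (j = 0; j <= i; j++)
				num_lvl[j] = DIV_ROUND_UP(n_cpus, cap[i - j]);
			levels = i;
			break;
		}

	for (i = 0; i <= MAX_LVLS; i++)
		nodes += num_lvl[i];
	nodes -= n_cpus;	/* the leaf "level" counts CPUs, not rcu_node structures */

	printf("%d CPUs, leaf fanout %d: %d levels, %d rcu_node structures\n",
	       n_cpus, fanout_leaf, levels, nodes);
}

int main(void)
{
	compute_geometry(64, 16, 64);	/* one root plus four leaves: 5 rcu_node structures */
	return 0;
}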
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138dedf5..4d29169f2124 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -42,28 +42,28 @@ | |||
42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
43 | 43 | ||
44 | #if NR_CPUS <= RCU_FANOUT_1 | 44 | #if NR_CPUS <= RCU_FANOUT_1 |
45 | # define NUM_RCU_LVLS 1 | 45 | # define RCU_NUM_LVLS 1 |
46 | # define NUM_RCU_LVL_0 1 | 46 | # define NUM_RCU_LVL_0 1 |
47 | # define NUM_RCU_LVL_1 (NR_CPUS) | 47 | # define NUM_RCU_LVL_1 (NR_CPUS) |
48 | # define NUM_RCU_LVL_2 0 | 48 | # define NUM_RCU_LVL_2 0 |
49 | # define NUM_RCU_LVL_3 0 | 49 | # define NUM_RCU_LVL_3 0 |
50 | # define NUM_RCU_LVL_4 0 | 50 | # define NUM_RCU_LVL_4 0 |
51 | #elif NR_CPUS <= RCU_FANOUT_2 | 51 | #elif NR_CPUS <= RCU_FANOUT_2 |
52 | # define NUM_RCU_LVLS 2 | 52 | # define RCU_NUM_LVLS 2 |
53 | # define NUM_RCU_LVL_0 1 | 53 | # define NUM_RCU_LVL_0 1 |
54 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | 54 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
55 | # define NUM_RCU_LVL_2 (NR_CPUS) | 55 | # define NUM_RCU_LVL_2 (NR_CPUS) |
56 | # define NUM_RCU_LVL_3 0 | 56 | # define NUM_RCU_LVL_3 0 |
57 | # define NUM_RCU_LVL_4 0 | 57 | # define NUM_RCU_LVL_4 0 |
58 | #elif NR_CPUS <= RCU_FANOUT_3 | 58 | #elif NR_CPUS <= RCU_FANOUT_3 |
59 | # define NUM_RCU_LVLS 3 | 59 | # define RCU_NUM_LVLS 3 |
60 | # define NUM_RCU_LVL_0 1 | 60 | # define NUM_RCU_LVL_0 1 |
61 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | 61 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
62 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | 62 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
63 | # define NUM_RCU_LVL_3 (NR_CPUS) | 63 | # define NUM_RCU_LVL_3 (NR_CPUS) |
64 | # define NUM_RCU_LVL_4 0 | 64 | # define NUM_RCU_LVL_4 0 |
65 | #elif NR_CPUS <= RCU_FANOUT_4 | 65 | #elif NR_CPUS <= RCU_FANOUT_4 |
66 | # define NUM_RCU_LVLS 4 | 66 | # define RCU_NUM_LVLS 4 |
67 | # define NUM_RCU_LVL_0 1 | 67 | # define NUM_RCU_LVL_0 1 |
68 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | 68 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) |
69 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | 69 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
@@ -76,6 +76,9 @@ | |||
76 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | 76 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) |
77 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | 77 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) |
78 | 78 | ||
79 | extern int rcu_num_lvls; | ||
80 | extern int rcu_num_nodes; | ||
81 | |||
79 | /* | 82 | /* |
80 | * Dynticks per-CPU state. | 83 | * Dynticks per-CPU state. |
81 | */ | 84 | */ |
@@ -84,6 +87,21 @@ struct rcu_dynticks { | |||
84 | /* Process level is worth LLONG_MAX/2. */ | 87 | /* Process level is worth LLONG_MAX/2. */ |
85 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
86 | atomic_t dynticks; /* Even value for idle, else odd. */ | 89 | atomic_t dynticks; /* Even value for idle, else odd. */ |
90 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
91 | int dyntick_drain; /* Prepare-for-idle state variable. */ | ||
92 | unsigned long dyntick_holdoff; | ||
93 | /* No retries for the jiffy of failure. */ | ||
94 | struct timer_list idle_gp_timer; | ||
95 | /* Wake up CPU sleeping with callbacks. */ | ||
96 | unsigned long idle_gp_timer_expires; | ||
97 | /* When to wake up CPU (for repost). */ | ||
98 | bool idle_first_pass; /* First pass of attempt to go idle? */ | ||
99 | unsigned long nonlazy_posted; | ||
100 | /* # times non-lazy CBs posted to CPU. */ | ||
101 | unsigned long nonlazy_posted_snap; | ||
102 | /* idle-period nonlazy_posted snapshot. */ | ||
103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | ||
104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
87 | }; | 105 | }; |
88 | 106 | ||
89 | /* RCU's kthread states for tracing. */ | 107 | /* RCU's kthread states for tracing. */ |
@@ -192,7 +210,7 @@ struct rcu_node { | |||
192 | */ | 210 | */ |
193 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | 211 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ |
194 | for ((rnp) = &(rsp)->node[0]; \ | 212 | for ((rnp) = &(rsp)->node[0]; \ |
195 | (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) | 213 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) |
196 | 214 | ||
197 | /* | 215 | /* |
198 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | 216 | * Do a breadth-first scan of the non-leaf rcu_node structures for the |
@@ -201,7 +219,7 @@ struct rcu_node { | |||
201 | */ | 219 | */ |
202 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | 220 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ |
203 | for ((rnp) = &(rsp)->node[0]; \ | 221 | for ((rnp) = &(rsp)->node[0]; \ |
204 | (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) | 222 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) |
205 | 223 | ||
206 | /* | 224 | /* |
207 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | 225 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state |
@@ -210,8 +228,8 @@ struct rcu_node { | |||
210 | * It is still a leaf node, even if it is also the root node. | 228 | * It is still a leaf node, even if it is also the root node. |
211 | */ | 229 | */ |
212 | #define rcu_for_each_leaf_node(rsp, rnp) \ | 230 | #define rcu_for_each_leaf_node(rsp, rnp) \ |
213 | for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ | 231 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ |
214 | (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) | 232 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) |
215 | 233 | ||
216 | /* Index values for nxttail array in struct rcu_data. */ | 234 | /* Index values for nxttail array in struct rcu_data. */ |
217 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | 235 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ |
@@ -297,6 +315,9 @@ struct rcu_data { | |||
297 | unsigned long n_rp_need_fqs; | 315 | unsigned long n_rp_need_fqs; |
298 | unsigned long n_rp_need_nothing; | 316 | unsigned long n_rp_need_nothing; |
299 | 317 | ||
318 | /* 6) _rcu_barrier() callback. */ | ||
319 | struct rcu_head barrier_head; | ||
320 | |||
300 | int cpu; | 321 | int cpu; |
301 | struct rcu_state *rsp; | 322 | struct rcu_state *rsp; |
302 | }; | 323 | }; |
@@ -343,10 +364,12 @@ do { \ | |||
343 | */ | 364 | */ |
344 | struct rcu_state { | 365 | struct rcu_state { |
345 | struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ | 366 | struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ |
346 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ | 367 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ |
347 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 368 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
348 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ | 369 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ |
349 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ | 370 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ |
371 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | ||
372 | void (*func)(struct rcu_head *head)); | ||
350 | 373 | ||
351 | /* The following fields are guarded by the root rcu_node's lock. */ | 374 | /* The following fields are guarded by the root rcu_node's lock. */ |
352 | 375 | ||
@@ -378,6 +401,11 @@ struct rcu_state { | |||
378 | struct task_struct *rcu_barrier_in_progress; | 401 | struct task_struct *rcu_barrier_in_progress; |
379 | /* Task doing rcu_barrier(), */ | 402 | /* Task doing rcu_barrier(), */ |
380 | /* or NULL if no barrier. */ | 403 | /* or NULL if no barrier. */ |
404 | struct mutex barrier_mutex; /* Guards barrier fields. */ | ||
405 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | ||
406 | struct completion barrier_completion; /* Wake at barrier end. */ | ||
407 | unsigned long n_barrier_done; /* ++ at start and end of */ | ||
408 | /* _rcu_barrier(). */ | ||
381 | raw_spinlock_t fqslock; /* Only one task forcing */ | 409 | raw_spinlock_t fqslock; /* Only one task forcing */ |
382 | /* quiescent states. */ | 410 | /* quiescent states. */ |
383 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 411 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -395,8 +423,13 @@ struct rcu_state { | |||
395 | unsigned long gp_max; /* Maximum GP duration in */ | 423 | unsigned long gp_max; /* Maximum GP duration in */ |
396 | /* jiffies. */ | 424 | /* jiffies. */ |
397 | char *name; /* Name of structure. */ | 425 | char *name; /* Name of structure. */ |
426 | struct list_head flavors; /* List of RCU flavors. */ | ||
398 | }; | 427 | }; |
399 | 428 | ||
429 | extern struct list_head rcu_struct_flavors; | ||
430 | #define for_each_rcu_flavor(rsp) \ | ||
431 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | ||
432 | |||
400 | /* Return values for rcu_preempt_offline_tasks(). */ | 433 | /* Return values for rcu_preempt_offline_tasks(). */ |
401 | 434 | ||
402 | #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ | 435 | #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ |
@@ -430,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
430 | /* Forward declarations for rcutree_plugin.h */ | 463 | /* Forward declarations for rcutree_plugin.h */ |
431 | static void rcu_bootup_announce(void); | 464 | static void rcu_bootup_announce(void); |
432 | long rcu_batches_completed(void); | 465 | long rcu_batches_completed(void); |
466 | static void rcu_preempt_note_context_switch(int cpu); | ||
433 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 467 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
434 | #ifdef CONFIG_HOTPLUG_CPU | 468 | #ifdef CONFIG_HOTPLUG_CPU |
435 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 469 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
@@ -438,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu); | |||
438 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 472 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
439 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 473 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
440 | static int rcu_print_task_stall(struct rcu_node *rnp); | 474 | static int rcu_print_task_stall(struct rcu_node *rnp); |
441 | static void rcu_preempt_stall_reset(void); | ||
442 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 475 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
443 | #ifdef CONFIG_HOTPLUG_CPU | 476 | #ifdef CONFIG_HOTPLUG_CPU |
444 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | 477 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, |
445 | struct rcu_node *rnp, | 478 | struct rcu_node *rnp, |
446 | struct rcu_data *rdp); | 479 | struct rcu_data *rdp); |
447 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 480 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
448 | static void rcu_preempt_cleanup_dead_cpu(int cpu); | ||
449 | static void rcu_preempt_check_callbacks(int cpu); | 481 | static void rcu_preempt_check_callbacks(int cpu); |
450 | static void rcu_preempt_process_callbacks(void); | ||
451 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 482 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
452 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | 483 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) |
453 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 484 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
454 | bool wake); | 485 | bool wake); |
455 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 486 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ |
456 | static int rcu_preempt_pending(int cpu); | ||
457 | static int rcu_preempt_cpu_has_callbacks(int cpu); | ||
458 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | ||
459 | static void rcu_preempt_cleanup_dying_cpu(void); | ||
460 | static void __init __rcu_init_preempt(void); | 487 | static void __init __rcu_init_preempt(void); |
461 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 488 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
462 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 489 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
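The rcutree.h changes above thread every rcu_state onto rcu_struct_flavors and add the for_each_rcu_flavor() iterator, which is what lets rcutree.c drop the hand-written rcu_sched/rcu_bh/rcu_preempt triplets in rcu_pending(), rcu_prepare_cpu(), the hotplug notifier, and friends. A minimal user-space rendering of the pattern follows — a hand-rolled singly linked list stands in for <linux/list.h>, and all names are invented:

#include <stdio.h>

struct flavor {
	const char *name;
	struct flavor *next;
};

static struct flavor *flavor_list;

#define for_each_flavor(f) \
	for ((f) = flavor_list; (f) != NULL; (f) = (f)->next)

static void register_flavor(struct flavor *f)
{
	f->next = flavor_list;	/* roughly what list_add() does for rcu_struct_flavors */
	flavor_list = f;
}

int main(void)
{
	static struct flavor sched = { .name = "rcu_sched" };
	static struct flavor bh = { .name = "rcu_bh" };
	struct flavor *f;

	register_flavor(&sched);
	register_flavor(&bh);

	/* One loop replaces N hard-coded per-flavor calls. */
	for_each_flavor(f)
		printf("processing callbacks for %s\n", f->name);
	return 0;
}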
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..7f3244c0df01 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void) | |||
68 | printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); | 68 | printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); |
69 | #endif | 69 | #endif |
70 | #if NUM_RCU_LVL_4 != 0 | 70 | #if NUM_RCU_LVL_4 != 0 |
71 | printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); | 71 | printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); |
72 | #endif | 72 | #endif |
73 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | ||
74 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | ||
75 | if (nr_cpu_ids != NR_CPUS) | ||
76 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | ||
73 | } | 77 | } |
74 | 78 | ||
75 | #ifdef CONFIG_TREE_PREEMPT_RCU | 79 | #ifdef CONFIG_TREE_PREEMPT_RCU |
76 | 80 | ||
77 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); | 81 | struct rcu_state rcu_preempt_state = |
82 | RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); | ||
78 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
79 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 84 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
80 | 85 | ||
81 | static void rcu_read_unlock_special(struct task_struct *t); | ||
82 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 86 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
83 | 87 | ||
84 | /* | 88 | /* |
@@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu) | |||
153 | * | 157 | * |
154 | * Caller must disable preemption. | 158 | * Caller must disable preemption. |
155 | */ | 159 | */ |
156 | void rcu_preempt_note_context_switch(void) | 160 | static void rcu_preempt_note_context_switch(int cpu) |
157 | { | 161 | { |
158 | struct task_struct *t = current; | 162 | struct task_struct *t = current; |
159 | unsigned long flags; | 163 | unsigned long flags; |
@@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void) | |||
164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 168 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
165 | 169 | ||
166 | /* Possibly blocking in an RCU read-side critical section. */ | 170 | /* Possibly blocking in an RCU read-side critical section. */ |
167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); | 171 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
168 | rnp = rdp->mynode; | 172 | rnp = rdp->mynode; |
169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 173 | raw_spin_lock_irqsave(&rnp->lock, flags); |
170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 174 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void) | |||
228 | * means that we continue to block the current grace period. | 232 | * means that we continue to block the current grace period. |
229 | */ | 233 | */ |
230 | local_irq_save(flags); | 234 | local_irq_save(flags); |
231 | rcu_preempt_qs(smp_processor_id()); | 235 | rcu_preempt_qs(cpu); |
232 | local_irq_restore(flags); | 236 | local_irq_restore(flags); |
233 | } | 237 | } |
234 | 238 | ||
235 | /* | 239 | /* |
236 | * Tree-preemptible RCU implementation for rcu_read_lock(). | ||
237 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
238 | * if we block. | ||
239 | */ | ||
240 | void __rcu_read_lock(void) | ||
241 | { | ||
242 | current->rcu_read_lock_nesting++; | ||
243 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ | ||
244 | } | ||
245 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
246 | |||
247 | /* | ||
248 | * Check for preempted RCU readers blocking the current grace period | 240 | * Check for preempted RCU readers blocking the current grace period |
249 | * for the specified rcu_node structure. If the caller needs a reliable | 241 | * for the specified rcu_node structure. If the caller needs a reliable |
250 | * answer, it must hold the rcu_node's ->lock. | 242 | * answer, it must hold the rcu_node's ->lock. |
@@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, | |||
310 | * notify RCU core processing or task having blocked during the RCU | 302 | * notify RCU core processing or task having blocked during the RCU |
311 | * read-side critical section. | 303 | * read-side critical section. |
312 | */ | 304 | */ |
313 | static noinline void rcu_read_unlock_special(struct task_struct *t) | 305 | void rcu_read_unlock_special(struct task_struct *t) |
314 | { | 306 | { |
315 | int empty; | 307 | int empty; |
316 | int empty_exp; | 308 | int empty_exp; |
@@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
398 | rnp->grphi, | 390 | rnp->grphi, |
399 | !!rnp->gp_tasks); | 391 | !!rnp->gp_tasks); |
400 | rcu_report_unblock_qs_rnp(rnp, flags); | 392 | rcu_report_unblock_qs_rnp(rnp, flags); |
401 | } else | 393 | } else { |
402 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 394 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
395 | } | ||
403 | 396 | ||
404 | #ifdef CONFIG_RCU_BOOST | 397 | #ifdef CONFIG_RCU_BOOST |
405 | /* Unboost if we were boosted. */ | 398 | /* Unboost if we were boosted. */ |
@@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
418 | } | 411 | } |
419 | } | 412 | } |
420 | 413 | ||
421 | /* | ||
422 | * Tree-preemptible RCU implementation for rcu_read_unlock(). | ||
423 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
424 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
425 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
426 | * in an RCU read-side critical section and other special cases. | ||
427 | */ | ||
428 | void __rcu_read_unlock(void) | ||
429 | { | ||
430 | struct task_struct *t = current; | ||
431 | |||
432 | if (t->rcu_read_lock_nesting != 1) | ||
433 | --t->rcu_read_lock_nesting; | ||
434 | else { | ||
435 | barrier(); /* critical section before exit code. */ | ||
436 | t->rcu_read_lock_nesting = INT_MIN; | ||
437 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
438 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
439 | rcu_read_unlock_special(t); | ||
440 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
441 | t->rcu_read_lock_nesting = 0; | ||
442 | } | ||
443 | #ifdef CONFIG_PROVE_LOCKING | ||
444 | { | ||
445 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
446 | |||
447 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
448 | } | ||
449 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
450 | } | ||
451 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
452 | |||
453 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | 414 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE |
454 | 415 | ||
455 | /* | 416 | /* |
@@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
540 | } | 501 | } |
541 | 502 | ||
542 | /* | 503 | /* |
543 | * Suppress preemptible RCU's CPU stall warnings by pushing the | ||
544 | * time of the next stall-warning message comfortably far into the | ||
545 | * future. | ||
546 | */ | ||
547 | static void rcu_preempt_stall_reset(void) | ||
548 | { | ||
549 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
550 | } | ||
551 | |||
552 | /* | ||
553 | * Check that the list of blocked tasks for the newly completed grace | 504 | * Check that the list of blocked tasks for the newly completed grace |
554 | * period is in fact empty. It is a serious bug to complete a grace | 505 | * period is in fact empty. It is a serious bug to complete a grace |
555 | * period that still has RCU readers blocked! This function must be | 506 | * period that still has RCU readers blocked! This function must be |
@@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
650 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 601 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
651 | 602 | ||
652 | /* | 603 | /* |
653 | * Do CPU-offline processing for preemptible RCU. | ||
654 | */ | ||
655 | static void rcu_preempt_cleanup_dead_cpu(int cpu) | ||
656 | { | ||
657 | rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); | ||
658 | } | ||
659 | |||
660 | /* | ||
661 | * Check for a quiescent state from the current CPU. When a task blocks, | 604 | * Check for a quiescent state from the current CPU. When a task blocks, |
662 | * the task is recorded in the corresponding CPU's rcu_node structure, | 605 | * the task is recorded in the corresponding CPU's rcu_node structure, |
663 | * which is checked elsewhere. | 606 | * which is checked elsewhere. |
@@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
677 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | 620 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; |
678 | } | 621 | } |
679 | 622 | ||
680 | /* | ||
681 | * Process callbacks for preemptible RCU. | ||
682 | */ | ||
683 | static void rcu_preempt_process_callbacks(void) | ||
684 | { | ||
685 | __rcu_process_callbacks(&rcu_preempt_state, | ||
686 | &__get_cpu_var(rcu_preempt_data)); | ||
687 | } | ||
688 | |||
689 | #ifdef CONFIG_RCU_BOOST | 623 | #ifdef CONFIG_RCU_BOOST |
690 | 624 | ||
691 | static void rcu_preempt_do_callbacks(void) | 625 | static void rcu_preempt_do_callbacks(void) |
@@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
824 | int must_wait = 0; | 758 | int must_wait = 0; |
825 | 759 | ||
826 | raw_spin_lock_irqsave(&rnp->lock, flags); | 760 | raw_spin_lock_irqsave(&rnp->lock, flags); |
827 | if (list_empty(&rnp->blkd_tasks)) | 761 | if (list_empty(&rnp->blkd_tasks)) { |
828 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 762 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
829 | else { | 763 | } else { |
830 | rnp->exp_tasks = rnp->blkd_tasks.next; | 764 | rnp->exp_tasks = rnp->blkd_tasks.next; |
831 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 765 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ |
832 | must_wait = 1; | 766 | must_wait = 1; |
@@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void) | |||
870 | * expedited grace period for us, just leave. | 804 | * expedited grace period for us, just leave. |
871 | */ | 805 | */ |
872 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { | 806 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { |
873 | if (trycount++ < 10) | 807 | if (trycount++ < 10) { |
874 | udelay(trycount * num_online_cpus()); | 808 | udelay(trycount * num_online_cpus()); |
875 | else { | 809 | } else { |
876 | synchronize_rcu(); | 810 | synchronize_rcu(); |
877 | return; | 811 | return; |
878 | } | 812 | } |
@@ -917,51 +851,16 @@ mb_ret: | |||
917 | } | 851 | } |
918 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 852 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
919 | 853 | ||
920 | /* | ||
921 | * Check to see if there is any immediate preemptible-RCU-related work | ||
922 | * to be done. | ||
923 | */ | ||
924 | static int rcu_preempt_pending(int cpu) | ||
925 | { | ||
926 | return __rcu_pending(&rcu_preempt_state, | ||
927 | &per_cpu(rcu_preempt_data, cpu)); | ||
928 | } | ||
929 | |||
930 | /* | ||
931 | * Does preemptible RCU have callbacks on this CPU? | ||
932 | */ | ||
933 | static int rcu_preempt_cpu_has_callbacks(int cpu) | ||
934 | { | ||
935 | return !!per_cpu(rcu_preempt_data, cpu).nxtlist; | ||
936 | } | ||
937 | |||
938 | /** | 854 | /** |
939 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | 855 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. |
940 | */ | 856 | */ |
941 | void rcu_barrier(void) | 857 | void rcu_barrier(void) |
942 | { | 858 | { |
943 | _rcu_barrier(&rcu_preempt_state, call_rcu); | 859 | _rcu_barrier(&rcu_preempt_state); |
944 | } | 860 | } |
945 | EXPORT_SYMBOL_GPL(rcu_barrier); | 861 | EXPORT_SYMBOL_GPL(rcu_barrier); |
946 | 862 | ||
947 | /* | 863 | /* |
948 | * Initialize preemptible RCU's per-CPU data. | ||
949 | */ | ||
950 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | ||
951 | { | ||
952 | rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); | ||
953 | } | ||
954 | |||
955 | /* | ||
956 | * Move preemptible RCU's callbacks from dying CPU to other online CPU | ||
957 | * and record a quiescent state. | ||
958 | */ | ||
959 | static void rcu_preempt_cleanup_dying_cpu(void) | ||
960 | { | ||
961 | rcu_cleanup_dying_cpu(&rcu_preempt_state); | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Initialize preemptible RCU's state structures. | 864 | * Initialize preemptible RCU's state structures. |
966 | */ | 865 | */ |
967 | static void __init __rcu_init_preempt(void) | 866 | static void __init __rcu_init_preempt(void) |
@@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void) | |||
1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 901 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
1003 | 902 | ||
1004 | /* | 903 | /* |
904 | * Because preemptible RCU does not exist, we never have to check for | ||
905 | * CPUs being in quiescent states. | ||
906 | */ | ||
907 | static void rcu_preempt_note_context_switch(int cpu) | ||
908 | { | ||
909 | } | ||
910 | |||
911 | /* | ||
1005 | * Because preemptible RCU does not exist, there are never any preempted | 912 | * Because preemptible RCU does not exist, there are never any preempted |
1006 | * RCU readers. | 913 | * RCU readers. |
1007 | */ | 914 | */ |
@@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
1038 | } | 945 | } |
1039 | 946 | ||
1040 | /* | 947 | /* |
1041 | * Because preemptible RCU does not exist, there is no need to suppress | ||
1042 | * its CPU stall warnings. | ||
1043 | */ | ||
1044 | static void rcu_preempt_stall_reset(void) | ||
1045 | { | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * Because there is no preemptible RCU, there can be no readers blocked, | 948 | * Because there is no preemptible RCU, there can be no readers blocked, |
1050 | * so there is no need to check for blocked tasks. So check only for | 949 | * so there is no need to check for blocked tasks. So check only for |
1051 | * bogus qsmask values. | 950 | * bogus qsmask values. |
@@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
1073 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 972 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
1074 | 973 | ||
1075 | /* | 974 | /* |
1076 | * Because preemptible RCU does not exist, it never needs CPU-offline | ||
1077 | * processing. | ||
1078 | */ | ||
1079 | static void rcu_preempt_cleanup_dead_cpu(int cpu) | ||
1080 | { | ||
1081 | } | ||
1082 | |||
1083 | /* | ||
1084 | * Because preemptible RCU does not exist, it never has any callbacks | 975 | * Because preemptible RCU does not exist, it never has any callbacks |
1085 | * to check. | 976 | * to check. |
1086 | */ | 977 | */ |
@@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
1089 | } | 980 | } |
1090 | 981 | ||
1091 | /* | 982 | /* |
1092 | * Because preemptible RCU does not exist, it never has any callbacks | ||
1093 | * to process. | ||
1094 | */ | ||
1095 | static void rcu_preempt_process_callbacks(void) | ||
1096 | { | ||
1097 | } | ||
1098 | |||
1099 | /* | ||
1100 | * Queue an RCU callback for lazy invocation after a grace period. | 983 | * Queue an RCU callback for lazy invocation after a grace period. |
1101 | * This will likely be later named something like "call_rcu_lazy()", | 984 | * This will likely be later named something like "call_rcu_lazy()", |
1102 | * but this change will require some way of tagging the lazy RCU | 985 | * but this change will require some way of tagging the lazy RCU |
@@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1137 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1020 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
1138 | 1021 | ||
1139 | /* | 1022 | /* |
1140 | * Because preemptible RCU does not exist, it never has any work to do. | ||
1141 | */ | ||
1142 | static int rcu_preempt_pending(int cpu) | ||
1143 | { | ||
1144 | return 0; | ||
1145 | } | ||
1146 | |||
1147 | /* | ||
1148 | * Because preemptible RCU does not exist, it never has callbacks | ||
1149 | */ | ||
1150 | static int rcu_preempt_cpu_has_callbacks(int cpu) | ||
1151 | { | ||
1152 | return 0; | ||
1153 | } | ||
1154 | |||
1155 | /* | ||
1156 | * Because preemptible RCU does not exist, rcu_barrier() is just | 1023 | * Because preemptible RCU does not exist, rcu_barrier() is just |
1157 | * another name for rcu_barrier_sched(). | 1024 | * another name for rcu_barrier_sched(). |
1158 | */ | 1025 | */ |
@@ -1163,21 +1030,6 @@ void rcu_barrier(void) | |||
1163 | EXPORT_SYMBOL_GPL(rcu_barrier); | 1030 | EXPORT_SYMBOL_GPL(rcu_barrier); |
1164 | 1031 | ||
1165 | /* | 1032 | /* |
1166 | * Because preemptible RCU does not exist, there is no per-CPU | ||
1167 | * data to initialize. | ||
1168 | */ | ||
1169 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | ||
1170 | { | ||
1171 | } | ||
1172 | |||
1173 | /* | ||
1174 | * Because there is no preemptible RCU, there is no cleanup to do. | ||
1175 | */ | ||
1176 | static void rcu_preempt_cleanup_dying_cpu(void) | ||
1177 | { | ||
1178 | } | ||
1179 | |||
1180 | /* | ||
1181 | * Because preemptible RCU does not exist, it need not be initialized. | 1033 | * Because preemptible RCU does not exist, it need not be initialized. |
1182 | */ | 1034 | */ |
1183 | static void __init __rcu_init_preempt(void) | 1035 | static void __init __rcu_init_preempt(void) |
@@ -1886,8 +1738,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1886 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs | 1738 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs |
1887 | * any flavor of RCU. | 1739 | * any flavor of RCU. |
1888 | */ | 1740 | */ |
1889 | int rcu_needs_cpu(int cpu) | 1741 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) |
1890 | { | 1742 | { |
1743 | *delta_jiffies = ULONG_MAX; | ||
1891 | return rcu_cpu_has_callbacks(cpu); | 1744 | return rcu_cpu_has_callbacks(cpu); |
1892 | } | 1745 | } |
1893 | 1746 | ||
@@ -1959,43 +1812,10 @@ static void rcu_idle_count_callbacks_posted(void) | |||
1959 | */ | 1812 | */ |
1960 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | 1813 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ |
1961 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | 1814 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ |
1962 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1815 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ |
1963 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1816 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1964 | 1817 | ||
1965 | /* Loop counter for rcu_prepare_for_idle(). */ | 1818 | extern int tick_nohz_enabled; |
1966 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
1967 | /* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ | ||
1968 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
1969 | /* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ | ||
1970 | static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); | ||
1971 | /* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ | ||
1972 | static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); | ||
1973 | /* Enable special processing on first attempt to enter dyntick-idle mode. */ | ||
1974 | static DEFINE_PER_CPU(bool, rcu_idle_first_pass); | ||
1975 | /* Running count of non-lazy callbacks posted, never decremented. */ | ||
1976 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); | ||
1977 | /* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ | ||
1978 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); | ||
1979 | |||
1980 | /* | ||
1981 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
1982 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
1983 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
1984 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
1985 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
1986 | * it is better to incur scheduling-clock interrupts than to spin | ||
1987 | * continuously for the same time duration! | ||
1988 | */ | ||
1989 | int rcu_needs_cpu(int cpu) | ||
1990 | { | ||
1991 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
1992 | per_cpu(rcu_idle_first_pass, cpu) = 1; | ||
1993 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
1994 | if (!rcu_cpu_has_callbacks(cpu)) | ||
1995 | return 0; | ||
1996 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
1997 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
1998 | } | ||
1999 | 1819 | ||
2000 | /* | 1820 | /* |
2001 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1821 | * Does the specified flavor of RCU have non-lazy callbacks pending on |
@@ -2040,6 +1860,50 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2040 | } | 1860 | } |
2041 | 1861 | ||
2042 | /* | 1862 | /* |
1863 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
1864 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
1865 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
1866 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
1867 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
1868 | * it is better to incur scheduling-clock interrupts than to spin | ||
1869 | * continuously for the same time duration! | ||
1870 | * | ||
1871 | * The delta_jiffies argument is used to store the time when RCU is | ||
1872 | * going to need the CPU again if it still has callbacks. The reason | ||
1873 | * for this is that rcu_prepare_for_idle() might need to post a timer, | ||
1874 | * but if so, it will do so after tick_nohz_stop_sched_tick() has set | ||
1875 | * the wakeup time for this CPU. This means that RCU's timer can be | ||
1876 | * delayed until the wakeup time, which defeats the purpose of posting | ||
1877 | * a timer. | ||
1878 | */ | ||
1879 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | ||
1880 | { | ||
1881 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1882 | |||
1883 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
1884 | rdtp->idle_first_pass = 1; | ||
1885 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
1886 | if (!rcu_cpu_has_callbacks(cpu)) { | ||
1887 | *delta_jiffies = ULONG_MAX; | ||
1888 | return 0; | ||
1889 | } | ||
1890 | if (rdtp->dyntick_holdoff == jiffies) { | ||
1891 | /* RCU recently tried and failed, so don't try again. */ | ||
1892 | *delta_jiffies = 1; | ||
1893 | return 1; | ||
1894 | } | ||
1895 | /* Set up for the possibility that RCU will post a timer. */ | ||
1896 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | ||
1897 | *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, | ||
1898 | RCU_IDLE_GP_DELAY) - jiffies; | ||
1899 | } else { | ||
1900 | *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; | ||
1901 | *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; | ||
1902 | } | ||
1903 | return 0; | ||
1904 | } | ||
1905 | |||
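As an aside, the wakeup-time rounding that the new rcu_needs_cpu() performs is easy to sanity-check in isolation. The sketch below is a user-space toy, not kernel code: round_up_to() stands in for the kernel's round_up(), the lazy case approximates round_jiffies() by rounding to a whole second, and HZ=100 plus the jiffies value are assumed purely for the example.

    #include <stdio.h>

    #define HZ                     100              /* assumed for this example */
    #define RCU_IDLE_GP_DELAY      4                /* roughly one grace period */
    #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)         /* roughly six seconds */

    /* Stand-in for the kernel's round_up(): next multiple of 'align' >= x. */
    static unsigned long round_up_to(unsigned long x, unsigned long align)
    {
            return ((x + align - 1) / align) * align;
    }

    int main(void)
    {
            unsigned long jiffies = 1001;            /* hypothetical current time */
            unsigned long delta;

            /* Non-lazy callbacks pending: wake about one grace period out. */
            delta = round_up_to(jiffies + RCU_IDLE_GP_DELAY, RCU_IDLE_GP_DELAY) - jiffies;
            printf("non-lazy: wake in %lu jiffies\n", delta);   /* prints 7 */

            /* Only lazy callbacks: a long delay, rounded to a second boundary. */
            delta = round_up_to(jiffies + RCU_IDLE_LAZY_GP_DELAY, HZ) - jiffies;
            printf("lazy:     wake in %lu jiffies\n", delta);   /* prints 699 */
            return 0;
    }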
1906 | /* | ||
2043 | * Handler for smp_call_function_single(). The only point of this | 1907 | * Handler for smp_call_function_single(). The only point of this |
2044 | * handler is to wake the CPU up, so the handler does only tracing. | 1908 | * handler is to wake the CPU up, so the handler does only tracing. |
2045 | */ | 1909 | */ |
@@ -2075,22 +1939,26 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) | |||
2075 | */ | 1939 | */ |
2076 | static void rcu_prepare_for_idle_init(int cpu) | 1940 | static void rcu_prepare_for_idle_init(int cpu) |
2077 | { | 1941 | { |
2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 1942 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), | 1943 | |
2080 | rcu_idle_gp_timer_func, cpu); | 1944 | rdtp->dyntick_holdoff = jiffies - 1; |
2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; | 1945 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); |
2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; | 1946 | rdtp->idle_gp_timer_expires = jiffies - 1; |
1947 | rdtp->idle_first_pass = 1; | ||
2083 | } | 1948 | } |
2084 | 1949 | ||
2085 | /* | 1950 | /* |
2086 | * Clean up for exit from idle. Because we are exiting from idle, there | 1951 | * Clean up for exit from idle. Because we are exiting from idle, there |
2087 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | 1952 | * is no longer any point to ->idle_gp_timer, so cancel it. This will |
2088 | * do nothing if this timer is not active, so just cancel it unconditionally. | 1953 | * do nothing if this timer is not active, so just cancel it unconditionally. |
2089 | */ | 1954 | */ |
2090 | static void rcu_cleanup_after_idle(int cpu) | 1955 | static void rcu_cleanup_after_idle(int cpu) |
2091 | { | 1956 | { |
2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); | 1957 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
1958 | |||
1959 | del_timer(&rdtp->idle_gp_timer); | ||
2093 | trace_rcu_prep_idle("Cleanup after idle"); | 1960 | trace_rcu_prep_idle("Cleanup after idle"); |
1961 | rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); | ||
2094 | } | 1962 | } |
2095 | 1963 | ||
2096 | /* | 1964 | /* |
@@ -2108,42 +1976,53 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2108 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 1976 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
2109 | * disabled, we do one pass of force_quiescent_state(), then do an | 1977 | * disabled, we do one pass of force_quiescent_state(), then do an |
2110 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 1978 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
2111 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 1979 | * later. The ->dyntick_drain field controls the sequencing. |
2112 | * | 1980 | * |
2113 | * The caller must have disabled interrupts. | 1981 | * The caller must have disabled interrupts. |
2114 | */ | 1982 | */ |
2115 | static void rcu_prepare_for_idle(int cpu) | 1983 | static void rcu_prepare_for_idle(int cpu) |
2116 | { | 1984 | { |
2117 | struct timer_list *tp; | 1985 | struct timer_list *tp; |
1986 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1987 | int tne; | ||
1988 | |||
1989 | /* Handle nohz enablement switches conservatively. */ | ||
1990 | tne = ACCESS_ONCE(tick_nohz_enabled); | ||
1991 | if (tne != rdtp->tick_nohz_enabled_snap) { | ||
1992 | if (rcu_cpu_has_callbacks(cpu)) | ||
1993 | invoke_rcu_core(); /* force nohz to see update. */ | ||
1994 | rdtp->tick_nohz_enabled_snap = tne; | ||
1995 | return; | ||
1996 | } | ||
1997 | if (!tne) | ||
1998 | return; | ||
2118 | 1999 | ||
2119 | /* | 2000 | /* |
2120 | * If this is an idle re-entry, for example, due to use of | 2001 | * If this is an idle re-entry, for example, due to use of |
2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 2002 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle |
2122 | * loop, then don't take any state-machine actions, unless the | 2003 | * loop, then don't take any state-machine actions, unless the |
2123 | * momentary exit from idle queued additional non-lazy callbacks. | 2004 | * momentary exit from idle queued additional non-lazy callbacks. |
2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | 2005 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks |
2125 | * pending. | 2006 | * pending. |
2126 | */ | 2007 | */ |
2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | 2008 | if (!rdtp->idle_first_pass && |
2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | 2009 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { |
2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
2130 | if (rcu_cpu_has_callbacks(cpu)) { | 2010 | if (rcu_cpu_has_callbacks(cpu)) { |
2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2011 | tp = &rdtp->idle_gp_timer; |
2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2012 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); |
2133 | } | 2013 | } |
2134 | return; | 2014 | return; |
2135 | } | 2015 | } |
2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | 2016 | rdtp->idle_first_pass = 0; |
2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2017 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; |
2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
2139 | 2018 | ||
2140 | /* | 2019 | /* |
2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2020 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2142 | * Also reset state to avoid prejudicing later attempts. | 2021 | * Also reset state to avoid prejudicing later attempts. |
2143 | */ | 2022 | */ |
2144 | if (!rcu_cpu_has_callbacks(cpu)) { | 2023 | if (!rcu_cpu_has_callbacks(cpu)) { |
2145 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2024 | rdtp->dyntick_holdoff = jiffies - 1; |
2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2025 | rdtp->dyntick_drain = 0; |
2147 | trace_rcu_prep_idle("No callbacks"); | 2026 | trace_rcu_prep_idle("No callbacks"); |
2148 | return; | 2027 | return; |
2149 | } | 2028 | } |
@@ -2152,36 +2031,38 @@ static void rcu_prepare_for_idle(int cpu) | |||
2152 | * If in holdoff mode, just return. We will presumably have | 2031 | * If in holdoff mode, just return. We will presumably have |
2153 | * refrained from disabling the scheduling-clock tick. | 2032 | * refrained from disabling the scheduling-clock tick. |
2154 | */ | 2033 | */ |
2155 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | 2034 | if (rdtp->dyntick_holdoff == jiffies) { |
2156 | trace_rcu_prep_idle("In holdoff"); | 2035 | trace_rcu_prep_idle("In holdoff"); |
2157 | return; | 2036 | return; |
2158 | } | 2037 | } |
2159 | 2038 | ||
2160 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2039 | /* Check and update the ->dyntick_drain sequencing. */ |
2161 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2040 | if (rdtp->dyntick_drain <= 0) { |
2162 | /* First time through, initialize the counter. */ | 2041 | /* First time through, initialize the counter. */ |
2163 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; | 2042 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; |
2164 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | 2043 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && |
2165 | !rcu_pending(cpu) && | 2044 | !rcu_pending(cpu) && |
2166 | !local_softirq_pending()) { | 2045 | !local_softirq_pending()) { |
2167 | /* Can we go dyntick-idle despite still having callbacks? */ | 2046 | /* Can we go dyntick-idle despite still having callbacks? */ |
2168 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2047 | rdtp->dyntick_drain = 0; |
2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2048 | rdtp->dyntick_holdoff = jiffies; |
2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2049 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2050 | trace_rcu_prep_idle("Dyntick with callbacks"); |
2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2051 | rdtp->idle_gp_timer_expires = |
2173 | jiffies + RCU_IDLE_GP_DELAY; | 2052 | round_up(jiffies + RCU_IDLE_GP_DELAY, |
2174 | else | 2053 | RCU_IDLE_GP_DELAY); |
2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = | 2054 | } else { |
2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; | 2055 | rdtp->idle_gp_timer_expires = |
2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | 2056 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); |
2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | 2057 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); |
2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | 2058 | } |
2180 | per_cpu(rcu_nonlazy_posted, cpu); | 2059 | tp = &rdtp->idle_gp_timer; |
2060 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
2061 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
2181 | return; /* Nothing more to do immediately. */ | 2062 | return; /* Nothing more to do immediately. */ |
2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2063 | } else if (--(rdtp->dyntick_drain) <= 0) { |
2183 | /* We have hit the limit, so time to give up. */ | 2064 | /* We have hit the limit, so time to give up. */ |
2184 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2065 | rdtp->dyntick_holdoff = jiffies; |
2185 | trace_rcu_prep_idle("Begin holdoff"); | 2066 | trace_rcu_prep_idle("Begin holdoff"); |
2186 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | 2067 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ |
2187 | return; | 2068 | return; |
@@ -2213,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu) | |||
2213 | if (rcu_cpu_has_callbacks(cpu)) { | 2094 | if (rcu_cpu_has_callbacks(cpu)) { |
2214 | trace_rcu_prep_idle("More callbacks"); | 2095 | trace_rcu_prep_idle("More callbacks"); |
2215 | invoke_rcu_core(); | 2096 | invoke_rcu_core(); |
2216 | } else | 2097 | } else { |
2217 | trace_rcu_prep_idle("Callbacks drained"); | 2098 | trace_rcu_prep_idle("Callbacks drained"); |
2099 | } | ||
2218 | } | 2100 | } |
2219 | 2101 | ||
2220 | /* | 2102 | /* |
@@ -2227,7 +2109,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
2227 | */ | 2109 | */ |
2228 | static void rcu_idle_count_callbacks_posted(void) | 2110 | static void rcu_idle_count_callbacks_posted(void) |
2229 | { | 2111 | { |
2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | 2112 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); |
2231 | } | 2113 | } |
2232 | 2114 | ||
2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2115 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
@@ -2238,11 +2120,12 @@ static void rcu_idle_count_callbacks_posted(void) | |||
2238 | 2120 | ||
2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2121 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2240 | { | 2122 | { |
2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); | 2123 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2124 | struct timer_list *tltp = &rdtp->idle_gp_timer; | ||
2242 | 2125 | ||
2243 | sprintf(cp, "drain=%d %c timer=%lu", | 2126 | sprintf(cp, "drain=%d %c timer=%lu", |
2244 | per_cpu(rcu_dyntick_drain, cpu), | 2127 | rdtp->dyntick_drain, |
2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2128 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', |
2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); | 2129 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2247 | } | 2130 | } |
2248 | 2131 | ||
@@ -2250,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | |||
2250 | 2133 | ||
2251 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2134 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2252 | { | 2135 | { |
2136 | *cp = '\0'; | ||
2253 | } | 2137 | } |
2254 | 2138 | ||
2255 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2139 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d4..abffb486e94e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,6 +46,31 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | static int show_rcubarrier(struct seq_file *m, void *unused) | ||
50 | { | ||
51 | struct rcu_state *rsp; | ||
52 | |||
53 | for_each_rcu_flavor(rsp) | ||
54 | seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", | ||
55 | rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', | ||
56 | atomic_read(&rsp->barrier_cpu_count), | ||
57 | rsp->n_barrier_done); | ||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | static int rcubarrier_open(struct inode *inode, struct file *file) | ||
62 | { | ||
63 | return single_open(file, show_rcubarrier, NULL); | ||
64 | } | ||
65 | |||
66 | static const struct file_operations rcubarrier_fops = { | ||
67 | .owner = THIS_MODULE, | ||
68 | .open = rcubarrier_open, | ||
69 | .read = seq_read, | ||
70 | .llseek = seq_lseek, | ||
71 | .release = single_release, | ||
72 | }; | ||
73 | |||
49 | #ifdef CONFIG_RCU_BOOST | 74 | #ifdef CONFIG_RCU_BOOST |
50 | 75 | ||
51 | static char convert_kthread_status(unsigned int kthread_status) | 76 | static char convert_kthread_status(unsigned int kthread_status) |
@@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
95 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 120 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
96 | } | 121 | } |
97 | 122 | ||
98 | #define PRINT_RCU_DATA(name, func, m) \ | ||
99 | do { \ | ||
100 | int _p_r_d_i; \ | ||
101 | \ | ||
102 | for_each_possible_cpu(_p_r_d_i) \ | ||
103 | func(m, &per_cpu(name, _p_r_d_i)); \ | ||
104 | } while (0) | ||
105 | |||
106 | static int show_rcudata(struct seq_file *m, void *unused) | 123 | static int show_rcudata(struct seq_file *m, void *unused) |
107 | { | 124 | { |
108 | #ifdef CONFIG_TREE_PREEMPT_RCU | 125 | int cpu; |
109 | seq_puts(m, "rcu_preempt:\n"); | 126 | struct rcu_state *rsp; |
110 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); | 127 | |
111 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 128 | for_each_rcu_flavor(rsp) { |
112 | seq_puts(m, "rcu_sched:\n"); | 129 | seq_printf(m, "%s:\n", rsp->name); |
113 | PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); | 130 | for_each_possible_cpu(cpu) |
114 | seq_puts(m, "rcu_bh:\n"); | 131 | print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); |
115 | PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); | 132 | } |
116 | return 0; | 133 | return 0; |
117 | } | 134 | } |
118 | 135 | ||
@@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
166 | 183 | ||
167 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 184 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
168 | { | 185 | { |
186 | int cpu; | ||
187 | struct rcu_state *rsp; | ||
188 | |||
169 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); | 189 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); |
170 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 190 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
171 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); | 191 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); |
@@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
173 | seq_puts(m, "\"kt\",\"ktl\""); | 193 | seq_puts(m, "\"kt\",\"ktl\""); |
174 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 194 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
175 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | 195 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); |
176 | #ifdef CONFIG_TREE_PREEMPT_RCU | 196 | for_each_rcu_flavor(rsp) { |
177 | seq_puts(m, "\"rcu_preempt:\"\n"); | 197 | seq_printf(m, "\"%s:\"\n", rsp->name); |
178 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 198 | for_each_possible_cpu(cpu) |
179 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 199 | print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); |
180 | seq_puts(m, "\"rcu_sched:\"\n"); | 200 | } |
181 | PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); | ||
182 | seq_puts(m, "\"rcu_bh:\"\n"); | ||
183 | PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); | ||
184 | return 0; | 201 | return 0; |
185 | } | 202 | } |
186 | 203 | ||
@@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = { | |||
201 | 218 | ||
202 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | 219 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) |
203 | { | 220 | { |
204 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " | 221 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", |
205 | "j=%04x bt=%04x\n", | ||
206 | rnp->grplo, rnp->grphi, | 222 | rnp->grplo, rnp->grphi, |
207 | "T."[list_empty(&rnp->blkd_tasks)], | 223 | "T."[list_empty(&rnp->blkd_tasks)], |
208 | "N."[!rnp->gp_tasks], | 224 | "N."[!rnp->gp_tasks], |
@@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | |||
210 | "B."[!rnp->boost_tasks], | 226 | "B."[!rnp->boost_tasks], |
211 | convert_kthread_status(rnp->boost_kthread_status), | 227 | convert_kthread_status(rnp->boost_kthread_status), |
212 | rnp->n_tasks_boosted, rnp->n_exp_boosts, | 228 | rnp->n_tasks_boosted, rnp->n_exp_boosts, |
213 | rnp->n_normal_boosts, | 229 | rnp->n_normal_boosts); |
230 | seq_printf(m, "j=%04x bt=%04x\n", | ||
214 | (int)(jiffies & 0xffff), | 231 | (int)(jiffies & 0xffff), |
215 | (int)(rnp->boost_time & 0xffff)); | 232 | (int)(rnp->boost_time & 0xffff)); |
216 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", | 233 | seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", |
217 | " balk", | ||
218 | rnp->n_balk_blkd_tasks, | 234 | rnp->n_balk_blkd_tasks, |
219 | rnp->n_balk_exp_gp_tasks, | 235 | rnp->n_balk_exp_gp_tasks, |
220 | rnp->n_balk_boost_tasks, | 236 | rnp->n_balk_boost_tasks, |
@@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
270 | struct rcu_node *rnp; | 286 | struct rcu_node *rnp; |
271 | 287 | ||
272 | gpnum = rsp->gpnum; | 288 | gpnum = rsp->gpnum; |
273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 289 | seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", |
274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 290 | rsp->name, rsp->completed, gpnum, rsp->fqs_state, |
275 | rsp->completed, gpnum, rsp->fqs_state, | ||
276 | (long)(rsp->jiffies_force_qs - jiffies), | 291 | (long)(rsp->jiffies_force_qs - jiffies), |
277 | (int)(jiffies & 0xffff), | 292 | (int)(jiffies & 0xffff)); |
293 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | ||
278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 294 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 295 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); | 296 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 297 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { |
282 | if (rnp->level != level) { | 298 | if (rnp->level != level) { |
283 | seq_puts(m, "\n"); | 299 | seq_puts(m, "\n"); |
284 | level = rnp->level; | 300 | level = rnp->level; |
@@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
295 | 311 | ||
296 | static int show_rcuhier(struct seq_file *m, void *unused) | 312 | static int show_rcuhier(struct seq_file *m, void *unused) |
297 | { | 313 | { |
298 | #ifdef CONFIG_TREE_PREEMPT_RCU | 314 | struct rcu_state *rsp; |
299 | seq_puts(m, "rcu_preempt:\n"); | 315 | |
300 | print_one_rcu_state(m, &rcu_preempt_state); | 316 | for_each_rcu_flavor(rsp) |
301 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 317 | print_one_rcu_state(m, rsp); |
302 | seq_puts(m, "rcu_sched:\n"); | ||
303 | print_one_rcu_state(m, &rcu_sched_state); | ||
304 | seq_puts(m, "rcu_bh:\n"); | ||
305 | print_one_rcu_state(m, &rcu_bh_state); | ||
306 | return 0; | 318 | return 0; |
307 | } | 319 | } |
308 | 320 | ||
@@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | |||
343 | 355 | ||
344 | static int show_rcugp(struct seq_file *m, void *unused) | 356 | static int show_rcugp(struct seq_file *m, void *unused) |
345 | { | 357 | { |
346 | #ifdef CONFIG_TREE_PREEMPT_RCU | 358 | struct rcu_state *rsp; |
347 | show_one_rcugp(m, &rcu_preempt_state); | 359 | |
348 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 360 | for_each_rcu_flavor(rsp) |
349 | show_one_rcugp(m, &rcu_sched_state); | 361 | show_one_rcugp(m, rsp); |
350 | show_one_rcugp(m, &rcu_bh_state); | ||
351 | return 0; | 362 | return 0; |
352 | } | 363 | } |
353 | 364 | ||
@@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = { | |||
366 | 377 | ||
367 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | 378 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) |
368 | { | 379 | { |
369 | seq_printf(m, "%3d%cnp=%ld " | 380 | seq_printf(m, "%3d%cnp=%ld ", |
370 | "qsp=%ld rpq=%ld cbr=%ld cng=%ld " | ||
371 | "gpc=%ld gps=%ld nf=%ld nn=%ld\n", | ||
372 | rdp->cpu, | 381 | rdp->cpu, |
373 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 382 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
374 | rdp->n_rcu_pending, | 383 | rdp->n_rcu_pending); |
384 | seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", | ||
375 | rdp->n_rp_qs_pending, | 385 | rdp->n_rp_qs_pending, |
376 | rdp->n_rp_report_qs, | 386 | rdp->n_rp_report_qs, |
377 | rdp->n_rp_cb_ready, | 387 | rdp->n_rp_cb_ready, |
378 | rdp->n_rp_cpu_needs_gp, | 388 | rdp->n_rp_cpu_needs_gp); |
389 | seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", | ||
379 | rdp->n_rp_gp_completed, | 390 | rdp->n_rp_gp_completed, |
380 | rdp->n_rp_gp_started, | 391 | rdp->n_rp_gp_started, |
381 | rdp->n_rp_need_fqs, | 392 | rdp->n_rp_need_fqs, |
382 | rdp->n_rp_need_nothing); | 393 | rdp->n_rp_need_nothing); |
383 | } | 394 | } |
384 | 395 | ||
385 | static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) | 396 | static int show_rcu_pending(struct seq_file *m, void *unused) |
386 | { | 397 | { |
387 | int cpu; | 398 | int cpu; |
388 | struct rcu_data *rdp; | 399 | struct rcu_data *rdp; |
389 | 400 | struct rcu_state *rsp; | |
390 | for_each_possible_cpu(cpu) { | 401 | |
391 | rdp = per_cpu_ptr(rsp->rda, cpu); | 402 | for_each_rcu_flavor(rsp) { |
392 | if (rdp->beenonline) | 403 | seq_printf(m, "%s:\n", rsp->name); |
393 | print_one_rcu_pending(m, rdp); | 404 | for_each_possible_cpu(cpu) { |
405 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
406 | if (rdp->beenonline) | ||
407 | print_one_rcu_pending(m, rdp); | ||
408 | } | ||
394 | } | 409 | } |
395 | } | ||
396 | |||
397 | static int show_rcu_pending(struct seq_file *m, void *unused) | ||
398 | { | ||
399 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
400 | seq_puts(m, "rcu_preempt:\n"); | ||
401 | print_rcu_pendings(m, &rcu_preempt_state); | ||
402 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
403 | seq_puts(m, "rcu_sched:\n"); | ||
404 | print_rcu_pendings(m, &rcu_sched_state); | ||
405 | seq_puts(m, "rcu_bh:\n"); | ||
406 | print_rcu_pendings(m, &rcu_bh_state); | ||
407 | return 0; | 410 | return 0; |
408 | } | 411 | } |
409 | 412 | ||
@@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void) | |||
453 | if (!rcudir) | 456 | if (!rcudir) |
454 | goto free_out; | 457 | goto free_out; |
455 | 458 | ||
459 | retval = debugfs_create_file("rcubarrier", 0444, rcudir, | ||
460 | NULL, &rcubarrier_fops); | ||
461 | if (!retval) | ||
462 | goto free_out; | ||
463 | |||
456 | retval = debugfs_create_file("rcudata", 0444, rcudir, | 464 | retval = debugfs_create_file("rcudata", 0444, rcudir, |
457 | NULL, &rcudata_fops); | 465 | NULL, &rcudata_fops); |
458 | if (!retval) | 466 | if (!retval) |
diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4d..e8cd2027abbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1235 | struct splice_pipe_desc spd = { | 1235 | struct splice_pipe_desc spd = { |
1236 | .pages = pages, | 1236 | .pages = pages, |
1237 | .nr_pages = 0, | 1237 | .nr_pages = 0, |
1238 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
1238 | .partial = partial, | 1239 | .partial = partial, |
1239 | .flags = flags, | 1240 | .flags = flags, |
1240 | .ops = &relay_pipe_buf_ops, | 1241 | .ops = &relay_pipe_buf_ops, |
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1302 | ret += padding; | 1303 | ret += padding; |
1303 | 1304 | ||
1304 | out: | 1305 | out: |
1305 | splice_shrink_spd(pipe, &spd); | 1306 | splice_shrink_spd(&spd); |
1306 | return ret; | 1307 | return ret; |
1307 | } | 1308 | } |
1308 | 1309 | ||
1309 | static ssize_t relay_file_splice_read(struct file *in, | 1310 | static ssize_t relay_file_splice_read(struct file *in, |
diff --git a/kernel/resource.c b/kernel/resource.c index e1d2b8ee76d5..34d45886ee84 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -7,6 +7,8 @@ | |||
7 | * Arbitrary resource management. | 7 | * Arbitrary resource management. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
11 | |||
10 | #include <linux/export.h> | 12 | #include <linux/export.h> |
11 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
12 | #include <linux/ioport.h> | 14 | #include <linux/ioport.h> |
@@ -722,14 +724,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t | |||
722 | 724 | ||
723 | write_lock(&resource_lock); | 725 | write_lock(&resource_lock); |
724 | 726 | ||
727 | if (!parent) | ||
728 | goto skip; | ||
729 | |||
725 | if ((start < parent->start) || (end > parent->end)) | 730 | if ((start < parent->start) || (end > parent->end)) |
726 | goto out; | 731 | goto out; |
727 | 732 | ||
728 | for (tmp = res->child; tmp; tmp = tmp->sibling) { | ||
729 | if ((tmp->start < start) || (tmp->end > end)) | ||
730 | goto out; | ||
731 | } | ||
732 | |||
733 | if (res->sibling && (res->sibling->start <= end)) | 733 | if (res->sibling && (res->sibling->start <= end)) |
734 | goto out; | 734 | goto out; |
735 | 735 | ||
@@ -741,6 +741,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t | |||
741 | goto out; | 741 | goto out; |
742 | } | 742 | } |
743 | 743 | ||
744 | skip: | ||
745 | for (tmp = res->child; tmp; tmp = tmp->sibling) | ||
746 | if ((tmp->start < start) || (tmp->end > end)) | ||
747 | goto out; | ||
748 | |||
744 | res->start = start; | 749 | res->start = start; |
745 | res->end = end; | 750 | res->end = end; |
746 | result = 0; | 751 | result = 0; |
@@ -788,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root, | |||
788 | resource_size_t start, resource_size_t end, | 793 | resource_size_t start, resource_size_t end, |
789 | const char *name) | 794 | const char *name) |
790 | { | 795 | { |
796 | int abort = 0; | ||
797 | |||
791 | write_lock(&resource_lock); | 798 | write_lock(&resource_lock); |
792 | __reserve_region_with_split(root, start, end, name); | 799 | if (root->start > start || root->end < end) { |
800 | pr_err("requested range [0x%llx-0x%llx] not in root %pr\n", | ||
801 | (unsigned long long)start, (unsigned long long)end, | ||
802 | root); | ||
803 | if (start > root->end || end < root->start) | ||
804 | abort = 1; | ||
805 | else { | ||
806 | if (end > root->end) | ||
807 | end = root->end; | ||
808 | if (start < root->start) | ||
809 | start = root->start; | ||
810 | pr_err("fixing request to [0x%llx-0x%llx]\n", | ||
811 | (unsigned long long)start, | ||
812 | (unsigned long long)end); | ||
813 | } | ||
814 | dump_stack(); | ||
815 | } | ||
816 | if (!abort) | ||
817 | __reserve_region_with_split(root, start, end, name); | ||
793 | write_unlock(&resource_lock); | 818 | write_unlock(&resource_lock); |
794 | } | 819 | } |
795 | 820 | ||
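The new clamp-or-abort behaviour added to reserve_region_with_split() above can be illustrated with a toy version of just that decision. The sketch below is a user-space illustration with made-up ranges, not the kernel routine itself.

    #include <stdio.h>

    struct range { unsigned long long start, end; };

    /* Mirror of the new check: reject disjoint requests, trim partial overlaps. */
    static int clamp_to_root(struct range root, struct range *req)
    {
            if (req->start > root.end || req->end < root.start)
                    return -1;                       /* fully outside the root: abort */
            if (req->end > root.end)
                    req->end = root.end;             /* trim the high end */
            if (req->start < root.start)
                    req->start = root.start;         /* trim the low end */
            return 0;
    }

    int main(void)
    {
            struct range root = { 0x1000, 0x1fff };
            struct range a = { 0x0800, 0x17ff };     /* partial overlap: clamped */
            struct range b = { 0x3000, 0x3fff };     /* disjoint: rejected */

            if (!clamp_to_root(root, &a))
                    printf("a fixed to [0x%llx-0x%llx]\n", a.start, a.end);
            if (clamp_to_root(root, &b))
                    printf("b rejected\n");
            return 0;
    }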
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..82ad284f823b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1096 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. | 1096 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. |
1097 | * | 1097 | * |
1098 | * sched_move_task() holds both and thus holding either pins the cgroup, | 1098 | * sched_move_task() holds both and thus holding either pins the cgroup, |
1099 | * see set_task_rq(). | 1099 | * see task_group(). |
1100 | * | 1100 | * |
1101 | * Furthermore, all task_rq users should acquire both locks, see | 1101 | * Furthermore, all task_rq users should acquire both locks, see |
1102 | * task_rq_lock(). | 1102 | * task_rq_lock(). |
@@ -1910,12 +1910,12 @@ static inline void | |||
1910 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 1910 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
1911 | struct task_struct *next) | 1911 | struct task_struct *next) |
1912 | { | 1912 | { |
1913 | trace_sched_switch(prev, next); | ||
1913 | sched_info_switch(prev, next); | 1914 | sched_info_switch(prev, next); |
1914 | perf_event_task_sched_out(prev, next); | 1915 | perf_event_task_sched_out(prev, next); |
1915 | fire_sched_out_preempt_notifiers(prev, next); | 1916 | fire_sched_out_preempt_notifiers(prev, next); |
1916 | prepare_lock_switch(rq, next); | 1917 | prepare_lock_switch(rq, next); |
1917 | prepare_arch_switch(next); | 1918 | prepare_arch_switch(next); |
1918 | trace_sched_switch(prev, next); | ||
1919 | } | 1919 | } |
1920 | 1920 | ||
1921 | /** | 1921 | /** |
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2081 | #endif | 2081 | #endif |
2082 | 2082 | ||
2083 | /* Here we just switch the register state and the stack. */ | 2083 | /* Here we just switch the register state and the stack. */ |
2084 | rcu_switch_from(prev); | ||
2085 | switch_to(prev, next, prev); | 2084 | switch_to(prev, next, prev); |
2086 | 2085 | ||
2087 | barrier(); | 2086 | barrier(); |
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
2161 | } | 2160 | } |
2162 | 2161 | ||
2163 | 2162 | ||
2163 | /* | ||
2164 | * Global load-average calculations | ||
2165 | * | ||
2166 | * We take a distributed and async approach to calculating the global load-avg | ||
2167 | * in order to minimize overhead. | ||
2168 | * | ||
2169 | * The global load average is an exponentially decaying average of nr_running + | ||
2170 | * nr_uninterruptible. | ||
2171 | * | ||
2172 | * Once every LOAD_FREQ: | ||
2173 | * | ||
2174 | * nr_active = 0; | ||
2175 | * for_each_possible_cpu(cpu) | ||
2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2177 | * | ||
2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2179 | * | ||
2180 | * Due to a number of reasons the above turns into the mess below: | ||
2181 | * | ||
2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2183 | * a serious number of cpus, therefore we need to take a distributed approach | ||
2184 | * to calculating nr_active. | ||
2185 | * | ||
2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2188 | * | ||
2189 | * So assuming nr_active := 0 when we start out -- true by definition, we | ||
2190 | * can simply take per-cpu deltas and fold those into a global accumulator | ||
2191 | * to obtain the same result. See calc_load_fold_active(). | ||
2192 | * | ||
2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2195 | * cpu to have completed this task. | ||
2196 | * | ||
2197 | * This places an upper bound on the IRQ-off latency of the machine. Then | ||
2198 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
2199 | * | ||
2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2205 | * all cpus yields the correct result. | ||
2206 | * | ||
2207 | * This covers the NO_HZ=n code, for extra headaches, see the comment below. | ||
2208 | */ | ||
2209 | |||
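For readers who want to see the avenrun recurrence above with concrete numbers, here is a stand-alone toy of the fixed-point update. The FSHIFT/FIXED_1/EXP_1 values are restated here as assumptions (they match what the kernel headers have traditionally used), and the helper only mirrors the shape of the calc_load() step further down; it is not the kernel's code.

    #include <stdio.h>

    #define FSHIFT  11                   /* bits of fixed-point precision (assumed) */
    #define FIXED_1 (1 << FSHIFT)        /* 1.0 in fixed point */
    #define EXP_1   1884                 /* exp(-5s/1min) in fixed point (assumed) */

    /* The a1 = a0*e + a*(1 - e) step described above. */
    static unsigned long calc_load_demo(unsigned long load, unsigned long exp,
                                        unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun0 = 0;
            int i;

            /* Feed a constant "two runnable tasks" sample every ~5 s for 2 minutes. */
            for (i = 0; i < 24; i++)
                    avenrun0 = calc_load_demo(avenrun0, EXP_1, 2 * FIXED_1);

            /* Convert back to the familiar decimal form, /proc/loadavg style. */
            printf("1-min load ~ %lu.%02lu\n",
                   avenrun0 >> FSHIFT,
                   ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
            return 0;
    }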
2164 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
2165 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
2166 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
2167 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
2168 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
2215 | |||
2216 | /** | ||
2217 | * get_avenrun - get the load average array | ||
2218 | * @loads: pointer to dest load array | ||
2219 | * @offset: offset to add | ||
2220 | * @shift: shift count to shift the result left | ||
2221 | * | ||
2222 | * These values are estimates at best, so no need for locking. | ||
2223 | */ | ||
2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2225 | { | ||
2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
2229 | } | ||
2169 | 2230 | ||
2170 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
2171 | { | 2232 | { |
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
2182 | return delta; | 2243 | return delta; |
2183 | } | 2244 | } |
2184 | 2245 | ||
2246 | /* | ||
2247 | * a1 = a0 * e + a * (1 - e) | ||
2248 | */ | ||
2185 | static unsigned long | 2249 | static unsigned long |
2186 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
2187 | { | 2251 | { |
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
2193 | 2257 | ||
2194 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
2195 | /* | 2259 | /* |
2196 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
2261 | * | ||
2262 | * Since the above described distributed algorithm to compute the global | ||
2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2264 | * NO_HZ. | ||
2265 | * | ||
2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2268 | * when we read the global state. | ||
2269 | * | ||
2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2271 | * | ||
2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2273 | * contribution, causing under-accounting. | ||
2274 | * | ||
2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
2277 | * | ||
2278 | * The only trick is the slight shift in index flip for read vs write. | ||
2279 | * | ||
2280 | * 0s 5s 10s 15s | ||
2281 | * +10 +10 +10 +10 | ||
2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
2283 | * r:0 0 1 1 0 0 1 1 0 | ||
2284 | * w:0 1 1 0 0 1 1 0 0 | ||
2285 | * | ||
2286 | * This ensures we'll fold the old idle contribution in this window while | ||
2287 | * accumulating the new one. | ||
2288 | * | ||
2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2290 | * contribution, since we effectively move our sample point to a known | ||
2291 | * busy state. | ||
2292 | * | ||
2293 | * This is solved by pushing the window forward, and thus skipping the | ||
2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2295 | * was in effect at the time the window opened). This also solves the issue | ||
2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2297 | * LOAD_FREQ intervals. | ||
2197 | * | 2298 | * |
2198 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
2199 | */ | 2300 | */ |
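A stripped-down sketch of the double-buffered idle-delta scheme the comment above describes, with jiffies, atomics and the memory barriers elided; the *_demo names are illustrative, and this is not the code that follows in the diff.

/* Two idle buckets: CPUs entering NO_HZ write into the bucket for the
 * window they are in (the next one once the window boundary has passed),
 * while the LOAD_FREQ fold always drains the bucket of the window that
 * just closed. Flipping the index when a new window opens keeps old and
 * new NO_HZ contributions apart. */
static long idle_delta_demo[2];
static int  idle_idx_demo;
static int  window_started_demo;	/* stands in for !time_before(jiffies, calc_load_update) */

static int write_idx_demo(void)
{
	return (idle_idx_demo + window_started_demo) & 1;
}

static int read_idx_demo(void)
{
	return idle_idx_demo & 1;
}

static void enter_idle_demo(long delta)
{
	idle_delta_demo[write_idx_demo()] += delta;
}

static long fold_idle_demo(void)
{
	int idx = read_idx_demo();
	long delta = idle_delta_demo[idx];

	idle_delta_demo[idx] = 0;
	return delta;
}

static void open_new_window_demo(void)
{
	idle_idx_demo++;	/* the old write bucket becomes the next read bucket */
}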
2200 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
2302 | static int calc_load_idx; | ||
2303 | |||
2304 | static inline int calc_load_write_idx(void) | ||
2305 | { | ||
2306 | int idx = calc_load_idx; | ||
2307 | |||
2308 | /* | ||
2309 | * See calc_global_nohz(), if we observe the new index, we also | ||
2310 | * need to observe the new update time. | ||
2311 | */ | ||
2312 | smp_rmb(); | ||
2313 | |||
2314 | /* | ||
2315 | * If the folding window started, make sure we start writing in the | ||
2316 | * next idle-delta. | ||
2317 | */ | ||
2318 | if (!time_before(jiffies, calc_load_update)) | ||
2319 | idx++; | ||
2320 | |||
2321 | return idx & 1; | ||
2322 | } | ||
2323 | |||
2324 | static inline int calc_load_read_idx(void) | ||
2325 | { | ||
2326 | return calc_load_idx & 1; | ||
2327 | } | ||
2201 | 2328 | ||
2202 | void calc_load_account_idle(struct rq *this_rq) | 2329 | void calc_load_enter_idle(void) |
2203 | { | 2330 | { |
2331 | struct rq *this_rq = this_rq(); | ||
2204 | long delta; | 2332 | long delta; |
2205 | 2333 | ||
2334 | /* | ||
2335 | * We're going into NOHZ mode; if there's any pending delta, fold it | ||
2336 | * into the pending idle delta. | ||
2337 | */ | ||
2206 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
2207 | if (delta) | 2339 | if (delta) { |
2208 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2342 | } | ||
2209 | } | 2343 | } |
2210 | 2344 | ||
2211 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
2212 | { | 2346 | { |
2213 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
2348 | |||
2349 | /* | ||
2350 | * If we're still before the sample window, we're done. | ||
2351 | */ | ||
2352 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2353 | return; | ||
2214 | 2354 | ||
2215 | /* | 2355 | /* |
2216 | * Its got a race, we don't care... | 2356 | * We woke inside or after the sample window, which means we're already
2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
2358 | * sync up for the next window. | ||
2217 | */ | 2359 | */ |
2218 | if (atomic_long_read(&calc_load_tasks_idle)) | 2360 | this_rq->calc_load_update = calc_load_update; |
2219 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) |
2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
2363 | } | ||
2364 | |||
2365 | static long calc_load_fold_idle(void) | ||
2366 | { | ||
2367 | int idx = calc_load_read_idx(); | ||
2368 | long delta = 0; | ||
2369 | |||
2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2220 | 2372 | ||
2221 | return delta; | 2373 | return delta; |
2222 | } | 2374 | } |
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void) | |||
2302 | { | 2454 | { |
2303 | long delta, active, n; | 2455 | long delta, active, n; |
2304 | 2456 | ||
2305 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
2306 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
2307 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
2308 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
2309 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
2310 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
2311 | delta = calc_load_fold_idle(); | ||
2312 | if (delta) | ||
2313 | atomic_long_add(delta, &calc_load_tasks); | ||
2314 | |||
2315 | /* | ||
2316 | * It could be the one fold was all it took, we done! | ||
2317 | */ | ||
2318 | if (time_before(jiffies, calc_load_update + 10)) | ||
2319 | return; | ||
2320 | 2463 | ||
2321 | /* | 2464 | active = atomic_long_read(&calc_load_tasks); |
2322 | * Catch-up, fold however many we are behind still | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
2323 | */ | ||
2324 | delta = jiffies - calc_load_update - 10; | ||
2325 | n = 1 + (delta / LOAD_FREQ); | ||
2326 | 2466 | ||
2327 | active = atomic_long_read(&calc_load_tasks); | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
2328 | active = active > 0 ? active * FIXED_1 : 0; | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2329 | 2470 | ||
2330 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 2471 | calc_load_update += n * LOAD_FREQ; |
2331 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 2472 | } |
2332 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2333 | |||
2334 | calc_load_update += n * LOAD_FREQ; | ||
2335 | } | ||
2336 | #else | ||
2337 | void calc_load_account_idle(struct rq *this_rq) | ||
2338 | { | ||
2339 | } | ||
2340 | 2473 | ||
2341 | static inline long calc_load_fold_idle(void) | 2474 | /* |
2342 | { | 2475 | * Flip the idle index... |
2343 | return 0; | 2476 | * |
2477 | * Make sure we first write the new time then flip the index, so that | ||
2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
2479 | * index; this avoids a double flip messing things up. | ||
2480 | */ | ||
2481 | smp_wmb(); | ||
2482 | calc_load_idx++; | ||
2344 | } | 2483 | } |
2484 | #else /* !CONFIG_NO_HZ */ | ||
2345 | 2485 | ||
2346 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
2347 | { | 2487 | static inline void calc_global_nohz(void) { } |
2348 | } | ||
2349 | #endif | ||
2350 | 2488 | ||
2351 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
2352 | * get_avenrun - get the load average array | ||
2353 | * @loads: pointer to dest load array | ||
2354 | * @offset: offset to add | ||
2355 | * @shift: shift count to shift the result left | ||
2356 | * | ||
2357 | * These values are estimates at best, so no need for locking. | ||
2358 | */ | ||
2359 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2360 | { | ||
2361 | loads[0] = (avenrun[0] + offset) << shift; | ||
2362 | loads[1] = (avenrun[1] + offset) << shift; | ||
2363 | loads[2] = (avenrun[2] + offset) << shift; | ||
2364 | } | ||
2365 | 2490 | ||
2366 | /* | 2491 | /* |
2367 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
2369 | */ | 2494 | */ |
2370 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
2371 | { | 2496 | { |
2372 | long active; | 2497 | long active, delta; |
2373 | 2498 | ||
2374 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
2375 | return; | 2500 | return; |
2376 | 2501 | ||
2502 | /* | ||
2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2504 | */ | ||
2505 | delta = calc_load_fold_idle(); | ||
2506 | if (delta) | ||
2507 | atomic_long_add(delta, &calc_load_tasks); | ||
2508 | |||
2377 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
2378 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
2379 | 2511 | ||
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
2384 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
2385 | 2517 | ||
2386 | /* | 2518 | /* |
2387 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
2388 | * folding in the nohz state and ageing the entire idle period. | ||
2389 | * | ||
2390 | * This avoids loosing a sample when we go idle between | ||
2391 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
2392 | * under-accounting. | ||
2393 | */ | 2520 | */ |
2394 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
2395 | } | 2522 | } |
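To make the `a1 = a0 * e + a * (1 - e)` step and the bulk catch-up concrete, here is a hedged fixed-point sketch. EXP_1 and the exponentiation-by-squaring helper are written from memory (the kernel's calc_load_n()/fixed_power_int() bodies are not shown in this hunk), so treat the exact constants and rounding as assumptions.

/* Q11 fixed point: 2048 == 1.0. EXP_1 = 2048/exp(5s/1min) ~ 1884, so one
 * 5 s window decays the 1-minute average by roughly 8%. */
#define FSHIFT_DEMO	11
#define FIXED_1_DEMO	(1UL << FSHIFT_DEMO)
#define EXP_1_DEMO	1884UL

/* One LOAD_FREQ step: a1 = a0 * e + a * (1 - e). */
static unsigned long
calc_load_demo(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1_DEMO - exp);
	return load >> FSHIFT_DEMO;
}

/* exp^n in fixed point by squaring, so n missed windows fold in one call:
 * a_n = a_0 * e^n + a * (1 - e^n). */
static unsigned long fixed_power_demo(unsigned long x, unsigned int n)
{
	unsigned long result = FIXED_1_DEMO;

	while (n) {
		if (n & 1)
			result = (result * x) >> FSHIFT_DEMO;
		x = (x * x) >> FSHIFT_DEMO;
		n >>= 1;
	}
	return result;
}

static unsigned long
calc_load_n_demo(unsigned long load, unsigned long exp,
		 unsigned long active, unsigned int n)
{
	return calc_load_demo(load, fixed_power_demo(exp, n), active);
}

For instance, a 1-minute average of 1.00 (load == 2048) with nothing runnable decays over ten missed windows to calc_load_n_demo(2048, EXP_1_DEMO, 0, 10), which comes out around 0.43 in load-average terms, matching e^(-50s/60s).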
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2406 | return; | 2533 | return; |
2407 | 2534 | ||
2408 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
2409 | delta += calc_load_fold_idle(); | ||
2410 | if (delta) | 2536 | if (delta) |
2411 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
2412 | 2538 | ||
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2414 | } | 2540 | } |
2415 | 2541 | ||
2416 | /* | 2542 | /* |
2543 | * End of global load-average stuff | ||
2544 | */ | ||
2545 | |||
2546 | /* | ||
2417 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
2418 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
2419 | * | 2549 | * |
@@ -4210,9 +4340,7 @@ recheck: | |||
4210 | */ | 4340 | */ |
4211 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | 4341 | if (unlikely(policy == p->policy && (!rt_policy(policy) || |
4212 | param->sched_priority == p->rt_priority))) { | 4342 | param->sched_priority == p->rt_priority))) { |
4213 | 4343 | task_rq_unlock(rq, p, &flags); | |
4214 | __task_rq_unlock(rq); | ||
4215 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4216 | return 0; | 4344 | return 0; |
4217 | } | 4345 | } |
4218 | 4346 | ||
@@ -5894,6 +6022,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5894 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this | 6022 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this |
5895 | * allows us to avoid some pointer chasing select_idle_sibling(). | 6023 | * allows us to avoid some pointer chasing select_idle_sibling(). |
5896 | * | 6024 | * |
6025 | * Iterate domains and sched_groups downward, assigning CPUs to be | ||
6026 | * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing | ||
6027 | * due to random perturbation self-canceling, i.e. sw buddies pull | ||
6028 | * their counterpart to their CPU's hw counterpart. | ||
6029 | * | ||
5897 | * Also keep a unique ID per domain (we use the first cpu number in | 6030 | * Also keep a unique ID per domain (we use the first cpu number in |
5898 | * the cpumask of the domain), this allows us to quickly tell if | 6031 | * the cpumask of the domain), this allows us to quickly tell if |
5899 | * two cpus are in the same cache domain, see cpus_share_cache(). | 6032 | * two cpus are in the same cache domain, see cpus_share_cache(). |
@@ -5907,8 +6040,40 @@ static void update_top_cache_domain(int cpu) | |||
5907 | int id = cpu; | 6040 | int id = cpu; |
5908 | 6041 | ||
5909 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | 6042 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); |
5910 | if (sd) | 6043 | if (sd) { |
6044 | struct sched_domain *tmp = sd; | ||
6045 | struct sched_group *sg, *prev; | ||
6046 | bool right; | ||
6047 | |||
6048 | /* | ||
6049 | * Traverse to first CPU in group, and count hops | ||
6050 | * to cpu from there, switching direction on each | ||
6051 | * hop, never ever pointing the last CPU rightward. | ||
6052 | */ | ||
6053 | do { | ||
6054 | id = cpumask_first(sched_domain_span(tmp)); | ||
6055 | prev = sg = tmp->groups; | ||
6056 | right = 1; | ||
6057 | |||
6058 | while (cpumask_first(sched_group_cpus(sg)) != id) | ||
6059 | sg = sg->next; | ||
6060 | |||
6061 | while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { | ||
6062 | prev = sg; | ||
6063 | sg = sg->next; | ||
6064 | right = !right; | ||
6065 | } | ||
6066 | |||
6067 | /* A CPU went down, never point back to domain start. */ | ||
6068 | if (right && cpumask_first(sched_group_cpus(sg->next)) == id) | ||
6069 | right = false; | ||
6070 | |||
6071 | sg = right ? sg->next : prev; | ||
6072 | tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); | ||
6073 | } while ((tmp = tmp->child)); | ||
6074 | |||
5911 | id = cpumask_first(sched_domain_span(sd)); | 6075 | id = cpumask_first(sched_domain_span(sd)); |
6076 | } | ||
5912 | 6077 | ||
5913 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 6078 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5914 | per_cpu(sd_llc_id, cpu) = id; | 6079 | per_cpu(sd_llc_id, cpu) = id; |
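To illustrate the zig-zag buddy wiring the comments above describe, here is a toy model that treats the LLC domain as a ring of N single-CPU groups. It mimics the direction-flipping walk, including the never-point-back-at-the-start special case, but it is a simplification of the kernel traversal, not a reimplementation of it.

#include <stdio.h>
#include <stdbool.h>

#define N 8

static int idle_buddy_demo(int cpu)
{
	int id = 0;			/* first CPU in the domain span */
	int sg = id, prev = id;
	bool right = true;

	while (sg != cpu) {		/* hop toward @cpu, flipping direction */
		prev = sg;
		sg = (sg + 1) % N;
		right = !right;
	}
	/* A wrap would point the last CPU back at the domain start. */
	if (right && (sg + 1) % N == id)
		right = false;

	return right ? (sg + 1) % N : prev;
}

int main(void)
{
	for (int cpu = 0; cpu < N; cpu++)
		printf("cpu%d -> buddy %d\n", cpu, idle_buddy_demo(cpu));
	return 0;
}

For N = 8 this prints the cross-wired pairing 0<->1, 2<->3, 4<->5, 6<->7: each CPU and its buddy point at one another, so wakeup perturbation bounces within a fixed pair instead of drifting across the package.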
@@ -6967,34 +7132,66 @@ match2: | |||
6967 | mutex_unlock(&sched_domains_mutex); | 7132 | mutex_unlock(&sched_domains_mutex); |
6968 | } | 7133 | } |
6969 | 7134 | ||
7135 | static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ | ||
7136 | |||
6970 | /* | 7137 | /* |
6971 | * Update cpusets according to cpu_active mask. If cpusets are | 7138 | * Update cpusets according to cpu_active mask. If cpusets are |
6972 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 7139 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
6973 | * around partition_sched_domains(). | 7140 | * around partition_sched_domains(). |
7141 | * | ||
7142 | * If we come here as part of a suspend/resume, don't touch cpusets because we | ||
7143 | * want to restore them to their original state upon resume anyway. | ||
6974 | */ | 7144 | */ |
6975 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | 7145 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, |
6976 | void *hcpu) | 7146 | void *hcpu) |
6977 | { | 7147 | { |
6978 | switch (action & ~CPU_TASKS_FROZEN) { | 7148 | switch (action) { |
7149 | case CPU_ONLINE_FROZEN: | ||
7150 | case CPU_DOWN_FAILED_FROZEN: | ||
7151 | |||
7152 | /* | ||
7153 | * num_cpus_frozen tracks how many CPUs are involved in the suspend/ | ||
7154 | * resume sequence. As long as this is not the last online | ||
7155 | * operation in the resume sequence, just build a single sched | ||
7156 | * domain, ignoring cpusets. | ||
7157 | */ | ||
7158 | num_cpus_frozen--; | ||
7159 | if (likely(num_cpus_frozen)) { | ||
7160 | partition_sched_domains(1, NULL, NULL); | ||
7161 | break; | ||
7162 | } | ||
7163 | |||
7164 | /* | ||
7165 | * This is the last CPU online operation. So fall through and | ||
7166 | * restore the original sched domains by considering the | ||
7167 | * cpuset configurations. | ||
7168 | */ | ||
7169 | |||
6979 | case CPU_ONLINE: | 7170 | case CPU_ONLINE: |
6980 | case CPU_DOWN_FAILED: | 7171 | case CPU_DOWN_FAILED: |
6981 | cpuset_update_active_cpus(); | 7172 | cpuset_update_active_cpus(true); |
6982 | return NOTIFY_OK; | 7173 | break; |
6983 | default: | 7174 | default: |
6984 | return NOTIFY_DONE; | 7175 | return NOTIFY_DONE; |
6985 | } | 7176 | } |
7177 | return NOTIFY_OK; | ||
6986 | } | 7178 | } |
6987 | 7179 | ||
6988 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7180 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
6989 | void *hcpu) | 7181 | void *hcpu) |
6990 | { | 7182 | { |
6991 | switch (action & ~CPU_TASKS_FROZEN) { | 7183 | switch (action) { |
6992 | case CPU_DOWN_PREPARE: | 7184 | case CPU_DOWN_PREPARE: |
6993 | cpuset_update_active_cpus(); | 7185 | cpuset_update_active_cpus(false); |
6994 | return NOTIFY_OK; | 7186 | break; |
7187 | case CPU_DOWN_PREPARE_FROZEN: | ||
7188 | num_cpus_frozen++; | ||
7189 | partition_sched_domains(1, NULL, NULL); | ||
7190 | break; | ||
6995 | default: | 7191 | default: |
6996 | return NOTIFY_DONE; | 7192 | return NOTIFY_DONE; |
6997 | } | 7193 | } |
7194 | return NOTIFY_OK; | ||
6998 | } | 7195 | } |
6999 | 7196 | ||
7000 | void __init sched_init_smp(void) | 7197 | void __init sched_init_smp(void) |
@@ -7459,6 +7656,7 @@ void sched_destroy_group(struct task_group *tg) | |||
7459 | */ | 7656 | */ |
7460 | void sched_move_task(struct task_struct *tsk) | 7657 | void sched_move_task(struct task_struct *tsk) |
7461 | { | 7658 | { |
7659 | struct task_group *tg; | ||
7462 | int on_rq, running; | 7660 | int on_rq, running; |
7463 | unsigned long flags; | 7661 | unsigned long flags; |
7464 | struct rq *rq; | 7662 | struct rq *rq; |
@@ -7473,6 +7671,12 @@ void sched_move_task(struct task_struct *tsk) | |||
7473 | if (unlikely(running)) | 7671 | if (unlikely(running)) |
7474 | tsk->sched_class->put_prev_task(rq, tsk); | 7672 | tsk->sched_class->put_prev_task(rq, tsk); |
7475 | 7673 | ||
7674 | tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, | ||
7675 | lockdep_is_held(&tsk->sighand->siglock)), | ||
7676 | struct task_group, css); | ||
7677 | tg = autogroup_task_group(tsk, tg); | ||
7678 | tsk->sched_task_group = tg; | ||
7679 | |||
7476 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7680 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7477 | if (tsk->sched_class->task_move_group) | 7681 | if (tsk->sched_class->task_move_group) |
7478 | tsk->sched_class->task_move_group(tsk, on_rq); | 7682 | tsk->sched_class->task_move_group(tsk, on_rq); |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d72586fdf660..23aa789c53ee 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -65,8 +65,8 @@ static int convert_prio(int prio) | |||
65 | int cpupri_find(struct cpupri *cp, struct task_struct *p, | 65 | int cpupri_find(struct cpupri *cp, struct task_struct *p, |
66 | struct cpumask *lowest_mask) | 66 | struct cpumask *lowest_mask) |
67 | { | 67 | { |
68 | int idx = 0; | 68 | int idx = 0; |
69 | int task_pri = convert_prio(p->prio); | 69 | int task_pri = convert_prio(p->prio); |
70 | 70 | ||
71 | if (task_pri >= MAX_RT_PRIO) | 71 | if (task_pri >= MAX_RT_PRIO) |
72 | return 0; | 72 | return 0; |
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
137 | */ | 137 | */ |
138 | void cpupri_set(struct cpupri *cp, int cpu, int newpri) | 138 | void cpupri_set(struct cpupri *cp, int cpu, int newpri) |
139 | { | 139 | { |
140 | int *currpri = &cp->cpu_to_pri[cpu]; | 140 | int *currpri = &cp->cpu_to_pri[cpu]; |
141 | int oldpri = *currpri; | 141 | int oldpri = *currpri; |
142 | int do_mb = 0; | 142 | int do_mb = 0; |
143 | 143 | ||
144 | newpri = convert_prio(newpri); | 144 | newpri = convert_prio(newpri); |
145 | 145 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe3..d0cc03b3e70b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2637 | int cpu = smp_processor_id(); | 2637 | int cpu = smp_processor_id(); |
2638 | int prev_cpu = task_cpu(p); | 2638 | int prev_cpu = task_cpu(p); |
2639 | struct sched_domain *sd; | 2639 | struct sched_domain *sd; |
2640 | struct sched_group *sg; | ||
2641 | int i; | ||
2642 | 2640 | ||
2643 | /* | 2641 | /* |
2644 | * If the task is going to be woken-up on this cpu and if it is | 2642 | * If the task is going to be woken-up on this cpu and if it is |
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2655 | return prev_cpu; | 2653 | return prev_cpu; |
2656 | 2654 | ||
2657 | /* | 2655 | /* |
2658 | * Otherwise, iterate the domains and find an elegible idle cpu. | 2656 | * Otherwise, check assigned siblings to find an eligible idle cpu.
2659 | */ | 2657 | */ |
2660 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 2658 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
2661 | for_each_lower_domain(sd) { | ||
2662 | sg = sd->groups; | ||
2663 | do { | ||
2664 | if (!cpumask_intersects(sched_group_cpus(sg), | ||
2665 | tsk_cpus_allowed(p))) | ||
2666 | goto next; | ||
2667 | |||
2668 | for_each_cpu(i, sched_group_cpus(sg)) { | ||
2669 | if (!idle_cpu(i)) | ||
2670 | goto next; | ||
2671 | } | ||
2672 | 2659 | ||
2673 | target = cpumask_first_and(sched_group_cpus(sg), | 2660 | for_each_lower_domain(sd) { |
2674 | tsk_cpus_allowed(p)); | 2661 | if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) |
2675 | goto done; | 2662 | continue; |
2676 | next: | 2663 | if (idle_cpu(sd->idle_buddy)) |
2677 | sg = sg->next; | 2664 | return sd->idle_buddy; |
2678 | } while (sg != sd->groups); | ||
2679 | } | 2665 | } |
2680 | done: | 2666 | |
2681 | return target; | 2667 | return target; |
2682 | } | 2668 | } |
2683 | 2669 | ||
@@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; | |||
3068 | 3054 | ||
3069 | #define LBF_ALL_PINNED 0x01 | 3055 | #define LBF_ALL_PINNED 0x01 |
3070 | #define LBF_NEED_BREAK 0x02 | 3056 | #define LBF_NEED_BREAK 0x02 |
3057 | #define LBF_SOME_PINNED 0x04 | ||
3071 | 3058 | ||
3072 | struct lb_env { | 3059 | struct lb_env { |
3073 | struct sched_domain *sd; | 3060 | struct sched_domain *sd; |
3074 | 3061 | ||
3075 | int src_cpu; | ||
3076 | struct rq *src_rq; | 3062 | struct rq *src_rq; |
3063 | int src_cpu; | ||
3077 | 3064 | ||
3078 | int dst_cpu; | 3065 | int dst_cpu; |
3079 | struct rq *dst_rq; | 3066 | struct rq *dst_rq; |
3080 | 3067 | ||
3068 | struct cpumask *dst_grpmask; | ||
3069 | int new_dst_cpu; | ||
3081 | enum cpu_idle_type idle; | 3070 | enum cpu_idle_type idle; |
3082 | long imbalance; | 3071 | long imbalance; |
3072 | /* The set of CPUs under consideration for load-balancing */ | ||
3073 | struct cpumask *cpus; | ||
3074 | |||
3083 | unsigned int flags; | 3075 | unsigned int flags; |
3084 | 3076 | ||
3085 | unsigned int loop; | 3077 | unsigned int loop; |
@@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3145 | * 3) are cache-hot on their current CPU. | 3137 | * 3) are cache-hot on their current CPU. |
3146 | */ | 3138 | */ |
3147 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 3139 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
3140 | int new_dst_cpu; | ||
3141 | |||
3148 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3142 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3143 | |||
3144 | /* | ||
3145 | * Remember if this task can be migrated to any other cpu in | ||
3146 | * our sched_group. We may want to revisit it if we couldn't | ||
3147 | * meet load balance goals by pulling other tasks on src_cpu. | ||
3148 | * | ||
3149 | * Also avoid computing new_dst_cpu if we have already computed | ||
3150 | * one in the current iteration. | ||
3151 | */ | ||
3152 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | ||
3153 | return 0; | ||
3154 | |||
3155 | new_dst_cpu = cpumask_first_and(env->dst_grpmask, | ||
3156 | tsk_cpus_allowed(p)); | ||
3157 | if (new_dst_cpu < nr_cpu_ids) { | ||
3158 | env->flags |= LBF_SOME_PINNED; | ||
3159 | env->new_dst_cpu = new_dst_cpu; | ||
3160 | } | ||
3149 | return 0; | 3161 | return 0; |
3150 | } | 3162 | } |
3163 | |||
3164 | /* Record that we found at least one task that could run on dst_cpu */ | ||
3151 | env->flags &= ~LBF_ALL_PINNED; | 3165 | env->flags &= ~LBF_ALL_PINNED; |
3152 | 3166 | ||
3153 | if (task_running(env->src_rq, p)) { | 3167 | if (task_running(env->src_rq, p)) { |
@@ -3642,8 +3656,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3642 | */ | 3656 | */ |
3643 | static inline void update_sg_lb_stats(struct lb_env *env, | 3657 | static inline void update_sg_lb_stats(struct lb_env *env, |
3644 | struct sched_group *group, int load_idx, | 3658 | struct sched_group *group, int load_idx, |
3645 | int local_group, const struct cpumask *cpus, | 3659 | int local_group, int *balance, struct sg_lb_stats *sgs) |
3646 | int *balance, struct sg_lb_stats *sgs) | ||
3647 | { | 3660 | { |
3648 | unsigned long nr_running, max_nr_running, min_nr_running; | 3661 | unsigned long nr_running, max_nr_running, min_nr_running; |
3649 | unsigned long load, max_cpu_load, min_cpu_load; | 3662 | unsigned long load, max_cpu_load, min_cpu_load; |
@@ -3660,7 +3673,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
3660 | max_nr_running = 0; | 3673 | max_nr_running = 0; |
3661 | min_nr_running = ~0UL; | 3674 | min_nr_running = ~0UL; |
3662 | 3675 | ||
3663 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3676 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
3664 | struct rq *rq = cpu_rq(i); | 3677 | struct rq *rq = cpu_rq(i); |
3665 | 3678 | ||
3666 | nr_running = rq->nr_running; | 3679 | nr_running = rq->nr_running; |
@@ -3789,8 +3802,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
3789 | * @sds: variable to hold the statistics for this sched_domain. | 3802 | * @sds: variable to hold the statistics for this sched_domain. |
3790 | */ | 3803 | */ |
3791 | static inline void update_sd_lb_stats(struct lb_env *env, | 3804 | static inline void update_sd_lb_stats(struct lb_env *env, |
3792 | const struct cpumask *cpus, | 3805 | int *balance, struct sd_lb_stats *sds) |
3793 | int *balance, struct sd_lb_stats *sds) | ||
3794 | { | 3806 | { |
3795 | struct sched_domain *child = env->sd->child; | 3807 | struct sched_domain *child = env->sd->child; |
3796 | struct sched_group *sg = env->sd->groups; | 3808 | struct sched_group *sg = env->sd->groups; |
@@ -3807,8 +3819,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
3807 | 3819 | ||
3808 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 3820 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3809 | memset(&sgs, 0, sizeof(sgs)); | 3821 | memset(&sgs, 0, sizeof(sgs)); |
3810 | update_sg_lb_stats(env, sg, load_idx, local_group, | 3822 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); |
3811 | cpus, balance, &sgs); | ||
3812 | 3823 | ||
3813 | if (local_group && !(*balance)) | 3824 | if (local_group && !(*balance)) |
3814 | return; | 3825 | return; |
@@ -4044,7 +4055,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4044 | * to restore balance. | 4055 | * to restore balance. |
4045 | * | 4056 | * |
4046 | * @env: The load balancing environment. | 4057 | * @env: The load balancing environment. |
4047 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
4048 | * @balance: Pointer to a variable indicating if this_cpu | 4058 | * @balance: Pointer to a variable indicating if this_cpu |
4049 | * is the appropriate cpu to perform load balancing at this_level. | 4059 | * is the appropriate cpu to perform load balancing at this_level. |
4050 | * | 4060 | * |
@@ -4054,7 +4064,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4054 | * put to idle by rebalancing its tasks onto our group. | 4064 | * put to idle by rebalancing its tasks onto our group. |
4055 | */ | 4065 | */ |
4056 | static struct sched_group * | 4066 | static struct sched_group * |
4057 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) | 4067 | find_busiest_group(struct lb_env *env, int *balance) |
4058 | { | 4068 | { |
4059 | struct sd_lb_stats sds; | 4069 | struct sd_lb_stats sds; |
4060 | 4070 | ||
@@ -4064,7 +4074,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) | |||
4064 | * Compute the various statistics relevant for load balancing at | 4074 |
4065 | * this level. | 4075 | * this level. |
4066 | */ | 4076 | */ |
4067 | update_sd_lb_stats(env, cpus, balance, &sds); | 4077 | update_sd_lb_stats(env, balance, &sds); |
4068 | 4078 | ||
4069 | /* | 4079 | /* |
4070 | * this_cpu is not the appropriate cpu to perform load balancing at | 4080 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4144,8 +4154,7 @@ ret: | |||
4144 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4154 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4145 | */ | 4155 | */ |
4146 | static struct rq *find_busiest_queue(struct lb_env *env, | 4156 | static struct rq *find_busiest_queue(struct lb_env *env, |
4147 | struct sched_group *group, | 4157 | struct sched_group *group) |
4148 | const struct cpumask *cpus) | ||
4149 | { | 4158 | { |
4150 | struct rq *busiest = NULL, *rq; | 4159 | struct rq *busiest = NULL, *rq; |
4151 | unsigned long max_load = 0; | 4160 | unsigned long max_load = 0; |
@@ -4160,7 +4169,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4160 | if (!capacity) | 4169 | if (!capacity) |
4161 | capacity = fix_small_capacity(env->sd, group); | 4170 | capacity = fix_small_capacity(env->sd, group); |
4162 | 4171 | ||
4163 | if (!cpumask_test_cpu(i, cpus)) | 4172 | if (!cpumask_test_cpu(i, env->cpus)) |
4164 | continue; | 4173 | continue; |
4165 | 4174 | ||
4166 | rq = cpu_rq(i); | 4175 | rq = cpu_rq(i); |
@@ -4227,7 +4236,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4227 | struct sched_domain *sd, enum cpu_idle_type idle, | 4236 | struct sched_domain *sd, enum cpu_idle_type idle, |
4228 | int *balance) | 4237 | int *balance) |
4229 | { | 4238 | { |
4230 | int ld_moved, active_balance = 0; | 4239 | int ld_moved, cur_ld_moved, active_balance = 0; |
4240 | int lb_iterations, max_lb_iterations; | ||
4231 | struct sched_group *group; | 4241 | struct sched_group *group; |
4232 | struct rq *busiest; | 4242 | struct rq *busiest; |
4233 | unsigned long flags; | 4243 | unsigned long flags; |
@@ -4237,16 +4247,19 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4237 | .sd = sd, | 4247 | .sd = sd, |
4238 | .dst_cpu = this_cpu, | 4248 | .dst_cpu = this_cpu, |
4239 | .dst_rq = this_rq, | 4249 | .dst_rq = this_rq, |
4250 | .dst_grpmask = sched_group_cpus(sd->groups), | ||
4240 | .idle = idle, | 4251 | .idle = idle, |
4241 | .loop_break = sched_nr_migrate_break, | 4252 | .loop_break = sched_nr_migrate_break, |
4253 | .cpus = cpus, | ||
4242 | }; | 4254 | }; |
4243 | 4255 | ||
4244 | cpumask_copy(cpus, cpu_active_mask); | 4256 | cpumask_copy(cpus, cpu_active_mask); |
4257 | max_lb_iterations = cpumask_weight(env.dst_grpmask); | ||
4245 | 4258 | ||
4246 | schedstat_inc(sd, lb_count[idle]); | 4259 | schedstat_inc(sd, lb_count[idle]); |
4247 | 4260 | ||
4248 | redo: | 4261 | redo: |
4249 | group = find_busiest_group(&env, cpus, balance); | 4262 | group = find_busiest_group(&env, balance); |
4250 | 4263 | ||
4251 | if (*balance == 0) | 4264 | if (*balance == 0) |
4252 | goto out_balanced; | 4265 | goto out_balanced; |
@@ -4256,7 +4269,7 @@ redo: | |||
4256 | goto out_balanced; | 4269 | goto out_balanced; |
4257 | } | 4270 | } |
4258 | 4271 | ||
4259 | busiest = find_busiest_queue(&env, group, cpus); | 4272 | busiest = find_busiest_queue(&env, group); |
4260 | if (!busiest) { | 4273 | if (!busiest) { |
4261 | schedstat_inc(sd, lb_nobusyq[idle]); | 4274 | schedstat_inc(sd, lb_nobusyq[idle]); |
4262 | goto out_balanced; | 4275 | goto out_balanced; |
@@ -4267,6 +4280,7 @@ redo: | |||
4267 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 4280 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4268 | 4281 | ||
4269 | ld_moved = 0; | 4282 | ld_moved = 0; |
4283 | lb_iterations = 1; | ||
4270 | if (busiest->nr_running > 1) { | 4284 | if (busiest->nr_running > 1) { |
4271 | /* | 4285 | /* |
4272 | * Attempt to move tasks. If find_busiest_group has found | 4286 | * Attempt to move tasks. If find_busiest_group has found |
@@ -4284,7 +4298,13 @@ more_balance: | |||
4284 | double_rq_lock(this_rq, busiest); | 4298 | double_rq_lock(this_rq, busiest); |
4285 | if (!env.loop) | 4299 | if (!env.loop) |
4286 | update_h_load(env.src_cpu); | 4300 | update_h_load(env.src_cpu); |
4287 | ld_moved += move_tasks(&env); | 4301 | |
4302 | /* | ||
4303 | * cur_ld_moved - load moved in current iteration | ||
4304 | * ld_moved - cumulative load moved across iterations | ||
4305 | */ | ||
4306 | cur_ld_moved = move_tasks(&env); | ||
4307 | ld_moved += cur_ld_moved; | ||
4288 | double_rq_unlock(this_rq, busiest); | 4308 | double_rq_unlock(this_rq, busiest); |
4289 | local_irq_restore(flags); | 4309 | local_irq_restore(flags); |
4290 | 4310 | ||
@@ -4296,14 +4316,52 @@ more_balance: | |||
4296 | /* | 4316 | /* |
4297 | * some other cpu did the load balance for us. | 4317 | * some other cpu did the load balance for us. |
4298 | */ | 4318 | */ |
4299 | if (ld_moved && this_cpu != smp_processor_id()) | 4319 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) |
4300 | resched_cpu(this_cpu); | 4320 | resched_cpu(env.dst_cpu); |
4321 | |||
4322 | /* | ||
4323 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | ||
4324 | * us and move them to an alternate dst_cpu in our sched_group | ||
4325 | * where they can run. The upper limit on how many times we | ||
4326 | * iterate on the same src_cpu depends on the number of cpus in our | ||
4327 | * sched_group. | ||
4328 | * | ||
4329 | * This changes load balance semantics a bit on who can move | ||
4330 | * load to a given_cpu. In addition to the given_cpu itself | ||
4331 | * (or an ilb_cpu acting on its behalf where given_cpu is | ||
4332 | * nohz-idle), we now have balance_cpu in a position to move | ||
4333 | * load to given_cpu. In rare situations, this may cause | ||
4334 | * conflicts (balance_cpu and given_cpu/ilb_cpu deciding | ||
4335 | * _independently_ and at the _same_ time to move some load to | ||
4336 | * given_cpu) causing excess load to be moved to given_cpu. | ||
4337 | * This, however, should not happen much in practice and, | ||
4338 | * moreover, subsequent load balance cycles should correct the | ||
4339 | * excess load moved. | ||
4340 | */ | ||
4341 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | ||
4342 | lb_iterations++ < max_lb_iterations) { | ||
4343 | |||
4344 | this_rq = cpu_rq(env.new_dst_cpu); | ||
4345 | env.dst_rq = this_rq; | ||
4346 | env.dst_cpu = env.new_dst_cpu; | ||
4347 | env.flags &= ~LBF_SOME_PINNED; | ||
4348 | env.loop = 0; | ||
4349 | env.loop_break = sched_nr_migrate_break; | ||
4350 | /* | ||
4351 | * Go back to "more_balance" rather than "redo" since we | ||
4352 | * need to continue with same src_cpu. | ||
4353 | */ | ||
4354 | goto more_balance; | ||
4355 | } | ||
4301 | 4356 | ||
4302 | /* All tasks on this runqueue were pinned by CPU affinity */ | 4357 | /* All tasks on this runqueue were pinned by CPU affinity */ |
4303 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 4358 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
4304 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 4359 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
4305 | if (!cpumask_empty(cpus)) | 4360 | if (!cpumask_empty(cpus)) { |
4361 | env.loop = 0; | ||
4362 | env.loop_break = sched_nr_migrate_break; | ||
4306 | goto redo; | 4363 | goto redo; |
4364 | } | ||
4307 | goto out_balanced; | 4365 | goto out_balanced; |
4308 | } | 4366 | } |
4309 | } | 4367 | } |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 26 | { |
27 | schedstat_inc(rq, sched_goidle); | 27 | schedstat_inc(rq, sched_goidle); |
28 | calc_load_account_idle(rq); | ||
29 | return rq->idle; | 28 | return rq->idle; |
30 | } | 29 | } |
31 | 30 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..c35a1a7dd4d6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg); | |||
538 | /* | 538 | /* |
539 | * Return the group to which this task belongs. | 539 | * Return the group to which this task belongs.
540 | * | 540 | * |
541 | * We use task_subsys_state_check() and extend the RCU verification with | 541 | * We cannot use task_subsys_state() and friends because the cgroup |
542 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | 542 | * subsystem changes that value before the cgroup_subsys::attach() method |
543 | * task it moves into the cgroup. Therefore by holding either of those locks, | 543 | * is called; therefore we cannot pin it and might observe the wrong value.
544 | * we pin the task to the current cgroup. | 544 | * |
545 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup | ||
546 | * core changes this before calling sched_move_task(). | ||
547 | * | ||
548 | * Instead we use a 'copy' which is updated from sched_move_task() while | ||
549 | * holding both task_struct::pi_lock and rq::lock. | ||
545 | */ | 550 | */ |
546 | static inline struct task_group *task_group(struct task_struct *p) | 551 | static inline struct task_group *task_group(struct task_struct *p) |
547 | { | 552 | { |
548 | struct task_group *tg; | 553 | return p->sched_task_group; |
549 | struct cgroup_subsys_state *css; | ||
550 | |||
551 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
552 | lockdep_is_held(&p->pi_lock) || | ||
553 | lockdep_is_held(&task_rq(p)->lock)); | ||
554 | tg = container_of(css, struct task_group, css); | ||
555 | |||
556 | return autogroup_task_group(p, tg); | ||
557 | } | 554 | } |
558 | 555 | ||
559 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 556 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -942,8 +939,6 @@ static inline u64 sched_avg_period(void) | |||
942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 939 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
943 | } | 940 | } |
944 | 941 | ||
945 | void calc_load_account_idle(struct rq *this_rq); | ||
946 | |||
947 | #ifdef CONFIG_SCHED_HRTICK | 942 | #ifdef CONFIG_SCHED_HRTICK |
948 | 943 | ||
949 | /* | 944 | /* |
diff --git a/kernel/signal.c b/kernel/signal.c index 677102789cf2..be4f856d52f8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
1971 | void ptrace_notify(int exit_code) | 1971 | void ptrace_notify(int exit_code) |
1972 | { | 1972 | { |
1973 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | 1973 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); |
1974 | if (unlikely(current->task_works)) { | ||
1975 | if (test_and_clear_ti_thread_flag(current_thread_info(), | ||
1976 | TIF_NOTIFY_RESUME)) { | ||
1977 | smp_mb__after_clear_bit(); | ||
1978 | task_work_run(); | ||
1979 | } | ||
1980 | } | ||
1974 | 1981 | ||
1975 | spin_lock_irq(¤t->sighand->siglock); | 1982 | spin_lock_irq(¤t->sighand->siglock); |
1976 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); | 1983 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); |
@@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2191 | struct signal_struct *signal = current->signal; | 2198 | struct signal_struct *signal = current->signal; |
2192 | int signr; | 2199 | int signr; |
2193 | 2200 | ||
2201 | if (unlikely(current->task_works)) { | ||
2202 | if (test_and_clear_ti_thread_flag(current_thread_info(), | ||
2203 | TIF_NOTIFY_RESUME)) { | ||
2204 | smp_mb__after_clear_bit(); | ||
2205 | task_work_run(); | ||
2206 | } | ||
2207 | } | ||
2208 | |||
2194 | if (unlikely(uprobe_deny_signal())) | 2209 | if (unlikely(uprobe_deny_signal())) |
2195 | return 0; | 2210 | return 0; |
2196 | 2211 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index d0ae5b24875e..29dd40a9f2f4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) | |||
581 | return 0; | 581 | return 0; |
582 | } | 582 | } |
583 | EXPORT_SYMBOL(smp_call_function); | 583 | EXPORT_SYMBOL(smp_call_function); |
584 | |||
585 | void ipi_call_lock(void) | ||
586 | { | ||
587 | raw_spin_lock(&call_function.lock); | ||
588 | } | ||
589 | |||
590 | void ipi_call_unlock(void) | ||
591 | { | ||
592 | raw_spin_unlock(&call_function.lock); | ||
593 | } | ||
594 | |||
595 | void ipi_call_lock_irq(void) | ||
596 | { | ||
597 | raw_spin_lock_irq(&call_function.lock); | ||
598 | } | ||
599 | |||
600 | void ipi_call_unlock_irq(void) | ||
601 | { | ||
602 | raw_spin_unlock_irq(&call_function.lock); | ||
603 | } | ||
604 | #endif /* USE_GENERIC_SMP_HELPERS */ | 584 | #endif /* USE_GENERIC_SMP_HELPERS */ |
605 | 585 | ||
606 | /* Setup configured maximum number of CPUs to activate */ | 586 | /* Setup configured maximum number of CPUs to activate */ |
diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acfb8472..6ef9433e1c70 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h | |||
@@ -3,8 +3,6 @@ | |||
3 | 3 | ||
4 | struct task_struct; | 4 | struct task_struct; |
5 | 5 | ||
6 | int smpboot_prepare(unsigned int cpu); | ||
7 | |||
8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | 6 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD |
9 | struct task_struct *idle_thread_get(unsigned int cpu); | 7 | struct task_struct *idle_thread_get(unsigned int cpu); |
10 | void idle_thread_set_boot_cpu(void); | 8 | void idle_thread_set_boot_cpu(void); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f9594e368..b73e681df09e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void) | |||
210 | __u32 pending; | 210 | __u32 pending; |
211 | int max_restart = MAX_SOFTIRQ_RESTART; | 211 | int max_restart = MAX_SOFTIRQ_RESTART; |
212 | int cpu; | 212 | int cpu; |
213 | unsigned long old_flags = current->flags; | ||
214 | |||
215 | /* | ||
216 | * Mask out PF_MEMALLOC as the current task context is borrowed for the | ||
217 | * softirq. A softirq handler such as network RX might set PF_MEMALLOC | ||
218 | * again if the socket is related to swap. | ||
219 | */ | ||
220 | current->flags &= ~PF_MEMALLOC; | ||
213 | 221 | ||
214 | pending = local_softirq_pending(); | 222 | pending = local_softirq_pending(); |
215 | account_system_vtime(current); | 223 | account_system_vtime(current); |
@@ -265,6 +273,7 @@ restart: | |||
265 | 273 | ||
266 | account_system_vtime(current); | 274 | account_system_vtime(current); |
267 | __local_bh_enable(SOFTIRQ_OFFSET); | 275 | __local_bh_enable(SOFTIRQ_OFFSET); |
276 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | ||
268 | } | 277 | } |
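The PF_MEMALLOC handling above is the usual save/clear/restore pattern for per-task flag bits. A hedged sketch of what tsk_restore_flags() is assumed to do; its definition is not part of this diff, so the helper below carries an illustrative name rather than claiming to be the kernel's.

/* Restore only the bits named in @flags from the saved copy; anything the
 * softirq handlers set in other bits is left alone. Illustrative, not the
 * kernel helper. */
static inline void restore_task_flags_demo(struct task_struct *task,
					   unsigned long orig_flags,
					   unsigned long flags)
{
	task->flags &= ~flags;			/* drop the selected bits...         */
	task->flags |= orig_flags & flags;	/* ...and put back their saved state */
}

/*
 * Usage, mirroring __do_softirq():
 *
 *	unsigned long old_flags = current->flags;
 *
 *	current->flags &= ~PF_MEMALLOC;		// handlers must not inherit it
 *	... run the softirq handlers, which may set PF_MEMALLOC again ...
 *	restore_task_flags_demo(current, old_flags, PF_MEMALLOC);
 */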
269 | 278 | ||
270 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 279 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
diff --git a/kernel/sys.c b/kernel/sys.c index f0ec44dcd415..241507f23eca 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1788 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
1790 | { | 1790 | { |
1791 | struct vm_area_struct *vma; | ||
1792 | struct file *exe_file; | 1791 | struct file *exe_file; |
1793 | struct dentry *dentry; | 1792 | struct dentry *dentry; |
1794 | int err; | 1793 | int err; |
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1816 | down_write(&mm->mmap_sem); | 1815 | down_write(&mm->mmap_sem); |
1817 | 1816 | ||
1818 | /* | 1817 | /* |
1819 | * Forbid mm->exe_file change if there are mapped other files. | 1818 | * Forbid mm->exe_file change if the old file is still mapped.
1820 | */ | 1819 | */ |
1821 | err = -EBUSY; | 1820 | err = -EBUSY; |
1822 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 1821 | if (mm->exe_file) { |
1823 | if (vma->vm_file && !path_equal(&vma->vm_file->f_path, | 1822 | struct vm_area_struct *vma; |
1824 | &exe_file->f_path)) | 1823 | |
1825 | goto exit_unlock; | 1824 | for (vma = mm->mmap; vma; vma = vma->vm_next) |
1825 | if (vma->vm_file && | ||
1826 | path_equal(&vma->vm_file->f_path, | ||
1827 | &mm->exe_file->f_path)) | ||
1828 | goto exit_unlock; | ||
1826 | } | 1829 | } |
1827 | 1830 | ||
1828 | /* | 1831 | /* |
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1835 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | 1838 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
1836 | goto exit_unlock; | 1839 | goto exit_unlock; |
1837 | 1840 | ||
1841 | err = 0; | ||
1838 | set_mm_exe_file(mm, exe_file); | 1842 | set_mm_exe_file(mm, exe_file); |
1839 | exit_unlock: | 1843 | exit_unlock: |
1840 | up_write(&mm->mmap_sem); | 1844 | up_write(&mm->mmap_sem); |
@@ -2011,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2011 | break; | 2015 | break; |
2012 | } | 2016 | } |
2013 | me->pdeath_signal = arg2; | 2017 | me->pdeath_signal = arg2; |
2014 | error = 0; | ||
2015 | break; | 2018 | break; |
2016 | case PR_GET_PDEATHSIG: | 2019 | case PR_GET_PDEATHSIG: |
2017 | error = put_user(me->pdeath_signal, (int __user *)arg2); | 2020 | error = put_user(me->pdeath_signal, (int __user *)arg2); |
@@ -2025,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2025 | break; | 2028 | break; |
2026 | } | 2029 | } |
2027 | set_dumpable(me->mm, arg2); | 2030 | set_dumpable(me->mm, arg2); |
2028 | error = 0; | ||
2029 | break; | 2031 | break; |
2030 | 2032 | ||
2031 | case PR_SET_UNALIGN: | 2033 | case PR_SET_UNALIGN: |
@@ -2052,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2052 | case PR_SET_TIMING: | 2054 | case PR_SET_TIMING: |
2053 | if (arg2 != PR_TIMING_STATISTICAL) | 2055 | if (arg2 != PR_TIMING_STATISTICAL) |
2054 | error = -EINVAL; | 2056 | error = -EINVAL; |
2055 | else | ||
2056 | error = 0; | ||
2057 | break; | 2057 | break; |
2058 | |||
2059 | case PR_SET_NAME: | 2058 | case PR_SET_NAME: |
2060 | comm[sizeof(me->comm)-1] = 0; | 2059 | comm[sizeof(me->comm)-1] = 0; |
2061 | if (strncpy_from_user(comm, (char __user *)arg2, | 2060 | if (strncpy_from_user(comm, (char __user *)arg2, |
@@ -2063,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2063 | return -EFAULT; | 2062 | return -EFAULT; |
2064 | set_task_comm(me, comm); | 2063 | set_task_comm(me, comm); |
2065 | proc_comm_connector(me); | 2064 | proc_comm_connector(me); |
2066 | return 0; | 2065 | break; |
2067 | case PR_GET_NAME: | 2066 | case PR_GET_NAME: |
2068 | get_task_comm(comm, me); | 2067 | get_task_comm(comm, me); |
2069 | if (copy_to_user((char __user *)arg2, comm, | 2068 | if (copy_to_user((char __user *)arg2, comm, |
2070 | sizeof(comm))) | 2069 | sizeof(comm))) |
2071 | return -EFAULT; | 2070 | return -EFAULT; |
2072 | return 0; | 2071 | break; |
2073 | case PR_GET_ENDIAN: | 2072 | case PR_GET_ENDIAN: |
2074 | error = GET_ENDIAN(me, arg2); | 2073 | error = GET_ENDIAN(me, arg2); |
2075 | break; | 2074 | break; |
2076 | case PR_SET_ENDIAN: | 2075 | case PR_SET_ENDIAN: |
2077 | error = SET_ENDIAN(me, arg2); | 2076 | error = SET_ENDIAN(me, arg2); |
2078 | break; | 2077 | break; |
2079 | |||
2080 | case PR_GET_SECCOMP: | 2078 | case PR_GET_SECCOMP: |
2081 | error = prctl_get_seccomp(); | 2079 | error = prctl_get_seccomp(); |
2082 | break; | 2080 | break; |
@@ -2104,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2104 | current->default_timer_slack_ns; | 2102 | current->default_timer_slack_ns; |
2105 | else | 2103 | else |
2106 | current->timer_slack_ns = arg2; | 2104 | current->timer_slack_ns = arg2; |
2107 | error = 0; | ||
2108 | break; | 2105 | break; |
2109 | case PR_MCE_KILL: | 2106 | case PR_MCE_KILL: |
2110 | if (arg4 | arg5) | 2107 | if (arg4 | arg5) |
@@ -2127,13 +2124,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2127 | else | 2124 | else |
2128 | return -EINVAL; | 2125 | return -EINVAL; |
2129 | break; | 2126 | break; |
2130 | case PR_GET_TID_ADDRESS: | ||
2131 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2132 | break; | ||
2133 | default: | 2127 | default: |
2134 | return -EINVAL; | 2128 | return -EINVAL; |
2135 | } | 2129 | } |
2136 | error = 0; | ||
2137 | break; | 2130 | break; |
2138 | case PR_MCE_KILL_GET: | 2131 | case PR_MCE_KILL_GET: |
2139 | if (arg2 | arg3 | arg4 | arg5) | 2132 | if (arg2 | arg3 | arg4 | arg5) |
@@ -2147,9 +2140,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2147 | case PR_SET_MM: | 2140 | case PR_SET_MM: |
2148 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | 2141 | error = prctl_set_mm(arg2, arg3, arg4, arg5); |
2149 | break; | 2142 | break; |
2143 | case PR_GET_TID_ADDRESS: | ||
2144 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2145 | break; | ||
2150 | case PR_SET_CHILD_SUBREAPER: | 2146 | case PR_SET_CHILD_SUBREAPER: |
2151 | me->signal->is_child_subreaper = !!arg2; | 2147 | me->signal->is_child_subreaper = !!arg2; |
2152 | error = 0; | ||
2153 | break; | 2148 | break; |
2154 | case PR_GET_CHILD_SUBREAPER: | 2149 | case PR_GET_CHILD_SUBREAPER: |
2155 | error = put_user(me->signal->is_child_subreaper, | 2150 | error = put_user(me->signal->is_child_subreaper, |
@@ -2191,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info) | |||
2191 | argv_free(info->argv); | 2186 | argv_free(info->argv); |
2192 | } | 2187 | } |
2193 | 2188 | ||
2194 | /** | 2189 | static int __orderly_poweroff(void) |
2195 | * orderly_poweroff - Trigger an orderly system poweroff | ||
2196 | * @force: force poweroff if command execution fails | ||
2197 | * | ||
2198 | * This may be called from any context to trigger a system shutdown. | ||
2199 | * If the orderly shutdown fails, it will force an immediate shutdown. | ||
2200 | */ | ||
2201 | int orderly_poweroff(bool force) | ||
2202 | { | 2190 | { |
2203 | int argc; | 2191 | int argc; |
2204 | char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); | 2192 | char **argv; |
2205 | static char *envp[] = { | 2193 | static char *envp[] = { |
2206 | "HOME=/", | 2194 | "HOME=/", |
2207 | "PATH=/sbin:/bin:/usr/sbin:/usr/bin", | 2195 | "PATH=/sbin:/bin:/usr/sbin:/usr/bin", |
2208 | NULL | 2196 | NULL |
2209 | }; | 2197 | }; |
2210 | int ret = -ENOMEM; | 2198 | int ret; |
2211 | 2199 | ||
2200 | argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); | ||
2212 | if (argv == NULL) { | 2201 | if (argv == NULL) { |
2213 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | 2202 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", |
2214 | __func__, poweroff_cmd); | 2203 | __func__, poweroff_cmd); |
2215 | goto out; | 2204 | return -ENOMEM; |
2216 | } | 2205 | } |
2217 | 2206 | ||
2218 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, | 2207 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, |
2219 | NULL, argv_cleanup, NULL); | 2208 | NULL, argv_cleanup, NULL); |
2220 | out: | ||
2221 | if (likely(!ret)) | ||
2222 | return 0; | ||
2223 | |||
2224 | if (ret == -ENOMEM) | 2209 | if (ret == -ENOMEM) |
2225 | argv_free(argv); | 2210 | argv_free(argv); |
2226 | 2211 | ||
2227 | if (force) { | 2212 | return ret; |
2213 | } | ||
2214 | |||
2215 | /** | ||
2216 | * orderly_poweroff - Trigger an orderly system poweroff | ||
2217 | * @force: force poweroff if command execution fails | ||
2218 | * | ||
2219 | * This may be called from any context to trigger a system shutdown. | ||
2220 | * If the orderly shutdown fails, it will force an immediate shutdown. | ||
2221 | */ | ||
2222 | int orderly_poweroff(bool force) | ||
2223 | { | ||
2224 | int ret = __orderly_poweroff(); | ||
2225 | |||
2226 | if (ret && force) { | ||
2228 | printk(KERN_WARNING "Failed to start orderly shutdown: " | 2227 | printk(KERN_WARNING "Failed to start orderly shutdown: " |
2229 | "forcing the issue\n"); | 2228 | "forcing the issue\n"); |
2230 | 2229 | ||
2231 | /* I guess this should try to kick off some daemon to | 2230 | /* |
2232 | sync and poweroff asap. Or not even bother syncing | 2231 | * I guess this should try to kick off some daemon to sync and |
2233 | if we're doing an emergency shutdown? */ | 2232 | * poweroff asap. Or not even bother syncing if we're doing an |
2233 | * emergency shutdown? | ||
2234 | */ | ||
2234 | emergency_sync(); | 2235 | emergency_sync(); |
2235 | kernel_power_off(); | 2236 | kernel_power_off(); |
2236 | } | 2237 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ab11879aeb4..87174ef59161 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
31 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> |
32 | #include <linux/kmemcheck.h> | 32 | #include <linux/kmemcheck.h> |
33 | #include <linux/kmemleak.h> | ||
33 | #include <linux/fs.h> | 34 | #include <linux/fs.h> |
34 | #include <linux/init.h> | 35 | #include <linux/init.h> |
35 | #include <linux/kernel.h> | 36 | #include <linux/kernel.h> |
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, | |||
174 | void __user *buffer, size_t *lenp, loff_t *ppos); | 175 | void __user *buffer, size_t *lenp, loff_t *ppos); |
175 | #endif | 176 | #endif |
176 | 177 | ||
178 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | ||
179 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
180 | static int proc_dostring_coredump(struct ctl_table *table, int write, | ||
181 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
182 | |||
177 | #ifdef CONFIG_MAGIC_SYSRQ | 183 | #ifdef CONFIG_MAGIC_SYSRQ |
178 | /* Note: sysrq code uses its own private copy */ | 184 | /* Note: sysrq code uses its own private copy */
179 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | 185 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; |
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = { | |||
410 | .data = core_pattern, | 416 | .data = core_pattern, |
411 | .maxlen = CORENAME_MAX_SIZE, | 417 | .maxlen = CORENAME_MAX_SIZE, |
412 | .mode = 0644, | 418 | .mode = 0644, |
413 | .proc_handler = proc_dostring, | 419 | .proc_handler = proc_dostring_coredump, |
414 | }, | 420 | }, |
415 | { | 421 | { |
416 | .procname = "core_pipe_limit", | 422 | .procname = "core_pipe_limit", |
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = { | |||
1095 | .extra1 = &zero, | 1101 | .extra1 = &zero, |
1096 | }, | 1102 | }, |
1097 | { | 1103 | { |
1098 | .procname = "nr_pdflush_threads", | 1104 | .procname = "nr_pdflush_threads", |
1099 | .data = &nr_pdflush_threads, | 1105 | .mode = 0444 /* read-only */, |
1100 | .maxlen = sizeof nr_pdflush_threads, | 1106 | .proc_handler = pdflush_proc_obsolete, |
1101 | .mode = 0444 /* read-only*/, | ||
1102 | .proc_handler = proc_dointvec, | ||
1103 | }, | 1107 | }, |
1104 | { | 1108 | { |
1105 | .procname = "swappiness", | 1109 | .procname = "swappiness", |
@@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = { | |||
1494 | #endif | 1498 | #endif |
1495 | #endif | 1499 | #endif |
1496 | { | 1500 | { |
1501 | .procname = "protected_symlinks", | ||
1502 | .data = &sysctl_protected_symlinks, | ||
1503 | .maxlen = sizeof(int), | ||
1504 | .mode = 0600, | ||
1505 | .proc_handler = proc_dointvec_minmax, | ||
1506 | .extra1 = &zero, | ||
1507 | .extra2 = &one, | ||
1508 | }, | ||
1509 | { | ||
1510 | .procname = "protected_hardlinks", | ||
1511 | .data = &sysctl_protected_hardlinks, | ||
1512 | .maxlen = sizeof(int), | ||
1513 | .mode = 0600, | ||
1514 | .proc_handler = proc_dointvec_minmax, | ||
1515 | .extra1 = &zero, | ||
1516 | .extra2 = &one, | ||
1517 | }, | ||
1518 | { | ||
1497 | .procname = "suid_dumpable", | 1519 | .procname = "suid_dumpable", |
1498 | .data = &suid_dumpable, | 1520 | .data = &suid_dumpable, |
1499 | .maxlen = sizeof(int), | 1521 | .maxlen = sizeof(int), |
1500 | .mode = 0644, | 1522 | .mode = 0644, |
1501 | .proc_handler = proc_dointvec_minmax, | 1523 | .proc_handler = proc_dointvec_minmax_coredump, |
1502 | .extra1 = &zero, | 1524 | .extra1 = &zero, |
1503 | .extra2 = &two, | 1525 | .extra2 = &two, |
1504 | }, | 1526 | }, |
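The new protected_symlinks and protected_hardlinks entries lean on proc_dointvec_minmax with extra1/extra2 pointing at 0 and 1, so any write outside that boolean range is rejected. A small userspace model of such a clamped integer knob is sketched below; struct knob and knob_write() are invented names for illustration, and the -1 return plays the role of the kernel's -EINVAL.

#include <stdio.h>

/* Simplified model of a ctl_table int entry with extra1/extra2 bounds. */
struct knob {
    const char *name;
    int        *data;
    const int  *min;   /* plays the role of .extra1 */
    const int  *max;   /* plays the role of .extra2 */
};

static int knob_write(struct knob *k, int val)
{
    if ((k->min && val < *k->min) || (k->max && val > *k->max))
        return -1;                 /* out-of-range writes are refused */
    *k->data = val;
    return 0;
}

int main(void)
{
    static const int zero = 0, one = 1;
    int protected_symlinks = 0;
    struct knob k = { "protected_symlinks", &protected_symlinks, &zero, &one };

    printf("write 1 -> %d, value=%d\n", knob_write(&k, 1), protected_symlinks);
    printf("write 5 -> %d, value=%d\n", knob_write(&k, 5), protected_symlinks);
    return 0;
}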
@@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = { | |||
1551 | 1573 | ||
1552 | int __init sysctl_init(void) | 1574 | int __init sysctl_init(void) |
1553 | { | 1575 | { |
1554 | register_sysctl_table(sysctl_base_table); | 1576 | struct ctl_table_header *hdr; |
1577 | |||
1578 | hdr = register_sysctl_table(sysctl_base_table); | ||
1579 | kmemleak_not_leak(hdr); | ||
1555 | return 0; | 1580 | return 0; |
1556 | } | 1581 | } |
1557 | 1582 | ||
@@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
2009 | do_proc_dointvec_minmax_conv, ¶m); | 2034 | do_proc_dointvec_minmax_conv, ¶m); |
2010 | } | 2035 | } |
2011 | 2036 | ||
2037 | static void validate_coredump_safety(void) | ||
2038 | { | ||
2039 | if (suid_dumpable == SUID_DUMPABLE_SAFE && | ||
2040 | core_pattern[0] != '/' && core_pattern[0] != '|') { | ||
2041 | printk(KERN_WARNING "Unsafe core_pattern used with "\ | ||
2042 | "suid_dumpable=2. Pipe handler or fully qualified "\ | ||
2043 | "core dump path required.\n"); | ||
2044 | } | ||
2045 | } | ||
2046 | |||
2047 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | ||
2048 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2049 | { | ||
2050 | int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
2051 | if (!error) | ||
2052 | validate_coredump_safety(); | ||
2053 | return error; | ||
2054 | } | ||
2055 | |||
2056 | static int proc_dostring_coredump(struct ctl_table *table, int write, | ||
2057 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2058 | { | ||
2059 | int error = proc_dostring(table, write, buffer, lenp, ppos); | ||
2060 | if (!error) | ||
2061 | validate_coredump_safety(); | ||
2062 | return error; | ||
2063 | } | ||
2064 | |||
2012 | static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, | 2065 | static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, |
2013 | void __user *buffer, | 2066 | void __user *buffer, |
2014 | size_t *lenp, loff_t *ppos, | 2067 | size_t *lenp, loff_t *ppos, |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694883a1..65bdcf198d4e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = { | |||
147 | { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, | 147 | { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, |
148 | /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ | 148 | /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ |
149 | /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ | 149 | /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ |
150 | { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, | 150 | /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ |
151 | { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, | 151 | { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, |
152 | /* VM_PAGEBUF unused */ | 152 | /* VM_PAGEBUF unused */ |
153 | /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ | 153 | /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ |
diff --git a/kernel/task_work.c b/kernel/task_work.c index 82d1c794066d..91d4e1742a0c 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
@@ -3,82 +3,78 @@ | |||
3 | #include <linux/tracehook.h> | 3 | #include <linux/tracehook.h> |
4 | 4 | ||
5 | int | 5 | int |
6 | task_work_add(struct task_struct *task, struct task_work *twork, bool notify) | 6 | task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) |
7 | { | 7 | { |
8 | struct callback_head *last, *first; | ||
8 | unsigned long flags; | 9 | unsigned long flags; |
9 | int err = -ESRCH; | ||
10 | 10 | ||
11 | #ifndef TIF_NOTIFY_RESUME | ||
12 | if (notify) | ||
13 | return -ENOTSUPP; | ||
14 | #endif | ||
15 | /* | 11 | /* |
16 | * We must not insert the new work if the task has already passed | 12 | * Not inserting the new work if the task has already passed |
17 | * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() | 13 | * exit_task_work() is the responisbility of callers. |
18 | * and check PF_EXITING under pi_lock. | ||
19 | */ | 14 | */ |
20 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 15 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
21 | if (likely(!(task->flags & PF_EXITING))) { | 16 | last = task->task_works; |
22 | hlist_add_head(&twork->hlist, &task->task_works); | 17 | first = last ? last->next : twork; |
23 | err = 0; | 18 | twork->next = first; |
24 | } | 19 | if (last) |
20 | last->next = twork; | ||
21 | task->task_works = twork; | ||
25 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 22 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
26 | 23 | ||
27 | /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ | 24 | /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ |
28 | if (likely(!err) && notify) | 25 | if (notify) |
29 | set_notify_resume(task); | 26 | set_notify_resume(task); |
30 | return err; | 27 | return 0; |
31 | } | 28 | } |
32 | 29 | ||
33 | struct task_work * | 30 | struct callback_head * |
34 | task_work_cancel(struct task_struct *task, task_work_func_t func) | 31 | task_work_cancel(struct task_struct *task, task_work_func_t func) |
35 | { | 32 | { |
36 | unsigned long flags; | 33 | unsigned long flags; |
37 | struct task_work *twork; | 34 | struct callback_head *last, *res = NULL; |
38 | struct hlist_node *pos; | ||
39 | 35 | ||
40 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 36 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
41 | hlist_for_each_entry(twork, pos, &task->task_works, hlist) { | 37 | last = task->task_works; |
42 | if (twork->func == func) { | 38 | if (last) { |
43 | hlist_del(&twork->hlist); | 39 | struct callback_head *q = last, *p = q->next; |
44 | goto found; | 40 | while (1) { |
41 | if (p->func == func) { | ||
42 | q->next = p->next; | ||
43 | if (p == last) | ||
44 | task->task_works = q == p ? NULL : q; | ||
45 | res = p; | ||
46 | break; | ||
47 | } | ||
48 | if (p == last) | ||
49 | break; | ||
50 | q = p; | ||
51 | p = q->next; | ||
45 | } | 52 | } |
46 | } | 53 | } |
47 | twork = NULL; | ||
48 | found: | ||
49 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 54 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
50 | 55 | return res; | |
51 | return twork; | ||
52 | } | 56 | } |
53 | 57 | ||
54 | void task_work_run(void) | 58 | void task_work_run(void) |
55 | { | 59 | { |
56 | struct task_struct *task = current; | 60 | struct task_struct *task = current; |
57 | struct hlist_head task_works; | 61 | struct callback_head *p, *q; |
58 | struct hlist_node *pos; | ||
59 | 62 | ||
60 | raw_spin_lock_irq(&task->pi_lock); | 63 | while (1) { |
61 | hlist_move_list(&task->task_works, &task_works); | 64 | raw_spin_lock_irq(&task->pi_lock); |
62 | raw_spin_unlock_irq(&task->pi_lock); | 65 | p = task->task_works; |
66 | task->task_works = NULL; | ||
67 | raw_spin_unlock_irq(&task->pi_lock); | ||
63 | 68 | ||
64 | if (unlikely(hlist_empty(&task_works))) | 69 | if (unlikely(!p)) |
65 | return; | 70 | return; |
66 | /* | ||
67 | * We use hlist to save the space in task_struct, but we want fifo. | ||
68 | * Find the last entry, the list should be short, then process them | ||
69 | * in reverse order. | ||
70 | */ | ||
71 | for (pos = task_works.first; pos->next; pos = pos->next) | ||
72 | ; | ||
73 | 71 | ||
74 | for (;;) { | 72 | q = p->next; /* head */ |
75 | struct hlist_node **pprev = pos->pprev; | 73 | p->next = NULL; /* cut it */ |
76 | struct task_work *twork = container_of(pos, struct task_work, | 74 | while (q) { |
77 | hlist); | 75 | p = q->next; |
78 | twork->func(twork); | 76 | q->func(q); |
79 | 77 | q = p; | |
80 | if (pprev == &task_works.first) | 78 | } |
81 | break; | ||
82 | pos = container_of(pprev, struct hlist_node, next); | ||
83 | } | 79 | } |
84 | } | 80 | } |
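The task_work rewrite replaces the hlist with a single pointer, task->task_works, that always refers to the last queued callback, whose ->next in turn points back at the first. That gives O(1) append while preserving FIFO run order, and task_work_run() detaches the whole ring under the lock before walking it. Below is a compact userspace model of the same ring (no locking, and struct cb, work_add(), work_run() are invented names for this sketch).

#include <stdio.h>
#include <stddef.h>

struct cb {
    struct cb *next;
    void (*func)(struct cb *);
};

static struct cb *works;        /* points at the LAST queued item, as task->task_works does */

static void work_add(struct cb *twork)
{
    struct cb *last = works;
    twork->next = last ? last->next : twork;   /* new tail points at the head */
    if (last)
        last->next = twork;
    works = twork;
}

static void work_run(void)
{
    struct cb *p = works, *q;
    works = NULL;
    if (!p)
        return;
    q = p->next;                /* head of the ring */
    p->next = NULL;             /* cut the ring so the walk terminates */
    while (q) {
        p = q->next;
        q->func(q);             /* callbacks run in queueing (FIFO) order */
        q = p;
    }
}

static void hello(struct cb *c) { printf("callback %p\n", (void *)c); }

int main(void)
{
    struct cb a = { .func = hello }, b = { .func = hello };
    work_add(&a);
    work_add(&b);
    work_run();                 /* runs a, then b */
    return 0;
}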
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e66046456f4f..d0a32796550f 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
436 | 436 | ||
437 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, | 437 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, |
438 | sizeof(struct cgroupstats)); | 438 | sizeof(struct cgroupstats)); |
439 | if (na == NULL) { | ||
440 | rc = -EMSGSIZE; | ||
441 | goto err; | ||
442 | } | ||
443 | |||
439 | stats = nla_data(na); | 444 | stats = nla_data(na); |
440 | memset(stats, 0, sizeof(*stats)); | 445 | memset(stats, 0, sizeof(*stats)); |
441 | 446 | ||
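The taskstats fix is the usual reserve-then-check pattern: nla_reserve() can fail when the reply skb is too small, so the returned attribute pointer has to be tested before nla_data() is dereferenced, with -EMSGSIZE on the error path. A trivial sketch of that shape, where reserve_attr() is an invented stand-in for nla_reserve():

#include <stdlib.h>
#include <string.h>
#include <errno.h>

struct attr { char payload[64]; };

/* Invented stand-in for nla_reserve(): may return NULL when space runs out. */
static struct attr *reserve_attr(size_t len)
{
    return len <= sizeof(struct attr) ? calloc(1, sizeof(struct attr)) : NULL;
}

static int fill_stats(size_t len)
{
    struct attr *na = reserve_attr(len);
    if (na == NULL)
        return -EMSGSIZE;          /* bail out instead of dereferencing NULL */
    memset(na->payload, 0, sizeof(na->payload));
    free(na);
    return 0;
}

int main(void) { return fill_stats(32) == 0 ? 0 : 1; }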
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154e0408..46da0537c10b 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -37,7 +37,7 @@ | |||
37 | * requested HZ value. It is also not recommended | 37 | * requested HZ value. It is also not recommended |
38 | * for "tick-less" systems. | 38 | * for "tick-less" systems. |
39 | */ | 39 | */ |
40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | 40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) |
41 | 41 | ||
42 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | 42 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier |
43 | * conversion, the .shift value could be zero. However | 43 | * conversion, the .shift value could be zero. However |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..24174b4d669b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); | |||
28 | /* USER_HZ period (usecs): */ | 28 | /* USER_HZ period (usecs): */ |
29 | unsigned long tick_usec = TICK_USEC; | 29 | unsigned long tick_usec = TICK_USEC; |
30 | 30 | ||
31 | /* ACTHZ period (nsecs): */ | 31 | /* SHIFTED_HZ period (nsecs): */ |
32 | unsigned long tick_nsec; | 32 | unsigned long tick_nsec; |
33 | 33 | ||
34 | static u64 tick_length; | 34 | static u64 tick_length; |
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) | |||
409 | time_state = TIME_DEL; | 409 | time_state = TIME_DEL; |
410 | break; | 410 | break; |
411 | case TIME_INS: | 411 | case TIME_INS: |
412 | if (secs % 86400 == 0) { | 412 | if (!(time_status & STA_INS)) |
413 | time_state = TIME_OK; | ||
414 | else if (secs % 86400 == 0) { | ||
413 | leap = -1; | 415 | leap = -1; |
414 | time_state = TIME_OOP; | 416 | time_state = TIME_OOP; |
415 | time_tai++; | 417 | time_tai++; |
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) | |||
418 | } | 420 | } |
419 | break; | 421 | break; |
420 | case TIME_DEL: | 422 | case TIME_DEL: |
421 | if ((secs + 1) % 86400 == 0) { | 423 | if (!(time_status & STA_DEL)) |
424 | time_state = TIME_OK; | ||
425 | else if ((secs + 1) % 86400 == 0) { | ||
422 | leap = 1; | 426 | leap = 1; |
423 | time_tai--; | 427 | time_tai--; |
424 | time_state = TIME_WAIT; | 428 | time_state = TIME_WAIT; |
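The ntp.c change makes second_overflow() re-check STA_INS/STA_DEL on every pass, so clearing the status bit before midnight cancels a pending leap second instead of leaving the state machine stuck in TIME_INS or TIME_DEL. The following userspace model covers just those two transitions; the constant names mirror the kernel's, while leap_step() and the surrounding scaffolding are invented and the time_tai bookkeeping is omitted.

#include <stdio.h>

enum { TIME_OK, TIME_INS, TIME_DEL, TIME_OOP, TIME_WAIT };
#define STA_INS 0x0010
#define STA_DEL 0x0020

static int leap_step(int *state, int status, unsigned long secs)
{
    int leap = 0;

    switch (*state) {
    case TIME_INS:
        if (!(status & STA_INS))
            *state = TIME_OK;              /* insertion cancelled by the caller */
        else if (secs % 86400 == 0) {
            leap = -1;                     /* insert a second at midnight UTC */
            *state = TIME_OOP;
        }
        break;
    case TIME_DEL:
        if (!(status & STA_DEL))
            *state = TIME_OK;              /* deletion cancelled by the caller */
        else if ((secs + 1) % 86400 == 0) {
            leap = 1;                      /* delete the last second of the day */
            *state = TIME_WAIT;
        }
        break;
    }
    return leap;
}

int main(void)
{
    int state = TIME_INS;
    printf("cancelled: leap=%d state=%d\n", leap_step(&state, 0, 86400), state);
    state = TIME_INS;
    printf("applied:   leap=%d state=%d\n", leap_step(&state, STA_INS, 86400), state);
    return 0;
}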
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da70c6db496c..024540f97f74 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void) | |||
105 | /* | 105 | /* |
106 | * NO HZ enabled ? | 106 | * NO HZ enabled ? |
107 | */ | 107 | */ |
108 | static int tick_nohz_enabled __read_mostly = 1; | 108 | int tick_nohz_enabled __read_mostly = 1; |
109 | 109 | ||
110 | /* | 110 | /* |
111 | * Enable / Disable tickless mode | 111 | * Enable / Disable tickless mode |
@@ -271,49 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | |||
271 | } | 271 | } |
272 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 272 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
273 | 273 | ||
274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | 274 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, |
275 | ktime_t now, int cpu) | ||
275 | { | 276 | { |
276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 277 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
277 | ktime_t last_update, expires, now; | 278 | ktime_t last_update, expires, ret = { .tv64 = 0 }; |
279 | unsigned long rcu_delta_jiffies; | ||
278 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 280 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
279 | u64 time_delta; | 281 | u64 time_delta; |
280 | int cpu; | ||
281 | |||
282 | cpu = smp_processor_id(); | ||
283 | ts = &per_cpu(tick_cpu_sched, cpu); | ||
284 | |||
285 | now = tick_nohz_start_idle(cpu, ts); | ||
286 | |||
287 | /* | ||
288 | * If this cpu is offline and it is the one which updates | ||
289 | * jiffies, then give up the assignment and let it be taken by | ||
290 | * the cpu which runs the tick timer next. If we don't drop | ||
291 | * this here the jiffies might be stale and do_timer() never | ||
292 | * invoked. | ||
293 | */ | ||
294 | if (unlikely(!cpu_online(cpu))) { | ||
295 | if (cpu == tick_do_timer_cpu) | ||
296 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | ||
297 | } | ||
298 | |||
299 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | ||
300 | return; | ||
301 | 282 | ||
302 | if (need_resched()) | ||
303 | return; | ||
304 | |||
305 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | ||
306 | static int ratelimit; | ||
307 | |||
308 | if (ratelimit < 10) { | ||
309 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | ||
310 | (unsigned int) local_softirq_pending()); | ||
311 | ratelimit++; | ||
312 | } | ||
313 | return; | ||
314 | } | ||
315 | |||
316 | ts->idle_calls++; | ||
317 | /* Read jiffies and the time when jiffies were updated last */ | 283 | /* Read jiffies and the time when jiffies were updated last */ |
318 | do { | 284 | do { |
319 | seq = read_seqbegin(&xtime_lock); | 285 | seq = read_seqbegin(&xtime_lock); |
@@ -322,7 +288,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
322 | time_delta = timekeeping_max_deferment(); | 288 | time_delta = timekeeping_max_deferment(); |
323 | } while (read_seqretry(&xtime_lock, seq)); | 289 | } while (read_seqretry(&xtime_lock, seq)); |
324 | 290 | ||
325 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 291 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
326 | arch_needs_cpu(cpu)) { | 292 | arch_needs_cpu(cpu)) { |
327 | next_jiffies = last_jiffies + 1; | 293 | next_jiffies = last_jiffies + 1; |
328 | delta_jiffies = 1; | 294 | delta_jiffies = 1; |
@@ -330,6 +296,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
330 | /* Get the next timer wheel timer */ | 296 | /* Get the next timer wheel timer */ |
331 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 297 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
332 | delta_jiffies = next_jiffies - last_jiffies; | 298 | delta_jiffies = next_jiffies - last_jiffies; |
299 | if (rcu_delta_jiffies < delta_jiffies) { | ||
300 | next_jiffies = last_jiffies + rcu_delta_jiffies; | ||
301 | delta_jiffies = rcu_delta_jiffies; | ||
302 | } | ||
333 | } | 303 | } |
334 | /* | 304 | /* |
335 | * Do not stop the tick, if we are only one off | 305 | * Do not stop the tick, if we are only one off |
@@ -392,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
392 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) | 362 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) |
393 | goto out; | 363 | goto out; |
394 | 364 | ||
365 | ret = expires; | ||
366 | |||
395 | /* | 367 | /* |
396 | * nohz_stop_sched_tick can be called several times before | 368 | * nohz_stop_sched_tick can be called several times before |
397 | * the nohz_restart_sched_tick is called. This happens when | 369 | * the nohz_restart_sched_tick is called. This happens when |
@@ -401,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
401 | */ | 373 | */ |
402 | if (!ts->tick_stopped) { | 374 | if (!ts->tick_stopped) { |
403 | select_nohz_load_balancer(1); | 375 | select_nohz_load_balancer(1); |
376 | calc_load_enter_idle(); | ||
404 | 377 | ||
405 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
406 | ts->tick_stopped = 1; | 379 | ts->tick_stopped = 1; |
407 | ts->idle_jiffies = last_jiffies; | ||
408 | } | 380 | } |
409 | 381 | ||
410 | ts->idle_sleeps++; | ||
411 | |||
412 | /* Mark expires */ | ||
413 | ts->idle_expires = expires; | ||
414 | |||
415 | /* | 382 | /* |
416 | * If the expiration time == KTIME_MAX, then | 383 | * If the expiration time == KTIME_MAX, then |
417 | * in this case we simply stop the tick timer. | 384 | * in this case we simply stop the tick timer. |
@@ -442,6 +409,65 @@ out: | |||
442 | ts->next_jiffies = next_jiffies; | 409 | ts->next_jiffies = next_jiffies; |
443 | ts->last_jiffies = last_jiffies; | 410 | ts->last_jiffies = last_jiffies; |
444 | ts->sleep_length = ktime_sub(dev->next_event, now); | 411 | ts->sleep_length = ktime_sub(dev->next_event, now); |
412 | |||
413 | return ret; | ||
414 | } | ||
415 | |||
416 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | ||
417 | { | ||
418 | /* | ||
419 | * If this cpu is offline and it is the one which updates | ||
420 | * jiffies, then give up the assignment and let it be taken by | ||
421 | * the cpu which runs the tick timer next. If we don't drop | ||
422 | * this here the jiffies might be stale and do_timer() never | ||
423 | * invoked. | ||
424 | */ | ||
425 | if (unlikely(!cpu_online(cpu))) { | ||
426 | if (cpu == tick_do_timer_cpu) | ||
427 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | ||
428 | } | ||
429 | |||
430 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | ||
431 | return false; | ||
432 | |||
433 | if (need_resched()) | ||
434 | return false; | ||
435 | |||
436 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | ||
437 | static int ratelimit; | ||
438 | |||
439 | if (ratelimit < 10) { | ||
440 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | ||
441 | (unsigned int) local_softirq_pending()); | ||
442 | ratelimit++; | ||
443 | } | ||
444 | return false; | ||
445 | } | ||
446 | |||
447 | return true; | ||
448 | } | ||
449 | |||
450 | static void __tick_nohz_idle_enter(struct tick_sched *ts) | ||
451 | { | ||
452 | ktime_t now, expires; | ||
453 | int cpu = smp_processor_id(); | ||
454 | |||
455 | now = tick_nohz_start_idle(cpu, ts); | ||
456 | |||
457 | if (can_stop_idle_tick(cpu, ts)) { | ||
458 | int was_stopped = ts->tick_stopped; | ||
459 | |||
460 | ts->idle_calls++; | ||
461 | |||
462 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); | ||
463 | if (expires.tv64 > 0LL) { | ||
464 | ts->idle_sleeps++; | ||
465 | ts->idle_expires = expires; | ||
466 | } | ||
467 | |||
468 | if (!was_stopped && ts->tick_stopped) | ||
469 | ts->idle_jiffies = ts->last_jiffies; | ||
470 | } | ||
445 | } | 471 | } |
446 | 472 | ||
447 | /** | 473 | /** |
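This refactor splits the old monolithic tick_nohz_stop_sched_tick() into a pure predicate, can_stop_idle_tick(), and a worker that returns the programmed expiry, so the caller can keep all the idle statistics (idle_calls, idle_sleeps, idle_expires, idle_jiffies) in one place. A compact sketch of that predicate-plus-worker shape follows; the struct, function names and the fake expiry value are all invented for this model.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct tick_state {
    bool     tick_stopped;
    unsigned idle_calls, idle_sleeps;
    int64_t  idle_expires, last_jiffies, idle_jiffies;
};

static bool can_stop_idle_tick_model(bool cpu_online, bool need_resched)
{
    /* Mirrors the shape of the checks: refuse when the CPU is going away
     * or a reschedule is pending, so stopping the tick would be unsafe. */
    return cpu_online && !need_resched;
}

static int64_t stop_sched_tick_model(struct tick_state *ts)
{
    int64_t expires = 1000000;          /* pretend the next event is 1 ms away */

    if (!ts->tick_stopped)
        ts->tick_stopped = true;
    return expires;                     /* 0 would mean "tick kept running" */
}

static void idle_enter_model(struct tick_state *ts)
{
    if (!can_stop_idle_tick_model(true, false))
        return;

    bool was_stopped = ts->tick_stopped;
    int64_t expires;

    ts->idle_calls++;
    expires = stop_sched_tick_model(ts);
    if (expires > 0) {                  /* only count real sleeps */
        ts->idle_sleeps++;
        ts->idle_expires = expires;
    }
    if (!was_stopped && ts->tick_stopped)
        ts->idle_jiffies = ts->last_jiffies;
}

int main(void)
{
    struct tick_state ts = { .last_jiffies = 42 };
    idle_enter_model(&ts);
    printf("calls=%u sleeps=%u idle_jiffies=%lld\n",
           ts.idle_calls, ts.idle_sleeps, (long long)ts.idle_jiffies);
    return 0;
}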
@@ -479,7 +505,7 @@ void tick_nohz_idle_enter(void) | |||
479 | * update of the idle time accounting in tick_nohz_start_idle(). | 505 | * update of the idle time accounting in tick_nohz_start_idle(). |
480 | */ | 506 | */ |
481 | ts->inidle = 1; | 507 | ts->inidle = 1; |
482 | tick_nohz_stop_sched_tick(ts); | 508 | __tick_nohz_idle_enter(ts); |
483 | 509 | ||
484 | local_irq_enable(); | 510 | local_irq_enable(); |
485 | } | 511 | } |
@@ -499,7 +525,7 @@ void tick_nohz_irq_exit(void) | |||
499 | if (!ts->inidle) | 525 | if (!ts->inidle) |
500 | return; | 526 | return; |
501 | 527 | ||
502 | tick_nohz_stop_sched_tick(ts); | 528 | __tick_nohz_idle_enter(ts); |
503 | } | 529 | } |
504 | 530 | ||
505 | /** | 531 | /** |
@@ -517,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void) | |||
517 | static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | 543 | static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) |
518 | { | 544 | { |
519 | hrtimer_cancel(&ts->sched_timer); | 545 | hrtimer_cancel(&ts->sched_timer); |
520 | hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); | 546 | hrtimer_set_expires(&ts->sched_timer, ts->last_tick); |
521 | 547 | ||
522 | while (1) { | 548 | while (1) { |
523 | /* Forward the time to expire in the future */ | 549 | /* Forward the time to expire in the future */ |
@@ -540,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
540 | } | 566 | } |
541 | } | 567 | } |
542 | 568 | ||
569 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | ||
570 | { | ||
571 | /* Update jiffies first */ | ||
572 | select_nohz_load_balancer(0); | ||
573 | tick_do_update_jiffies64(now); | ||
574 | update_cpu_load_nohz(); | ||
575 | |||
576 | touch_softlockup_watchdog(); | ||
577 | /* | ||
578 | * Cancel the scheduled timer and restore the tick | ||
579 | */ | ||
580 | ts->tick_stopped = 0; | ||
581 | ts->idle_exittime = now; | ||
582 | |||
583 | tick_nohz_restart(ts, now); | ||
584 | } | ||
585 | |||
586 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | ||
587 | { | ||
588 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
589 | unsigned long ticks; | ||
590 | /* | ||
591 | * We stopped the tick in idle. Update process times would miss the | ||
592 | * time we slept as update_process_times does only a 1 tick | ||
593 | * accounting. Enforce that this is accounted to idle ! | ||
594 | */ | ||
595 | ticks = jiffies - ts->idle_jiffies; | ||
596 | /* | ||
597 | * We might be one off. Do not randomly account a huge number of ticks! | ||
598 | */ | ||
599 | if (ticks && ticks < LONG_MAX) | ||
600 | account_idle_ticks(ticks); | ||
601 | #endif | ||
602 | } | ||
603 | |||
543 | /** | 604 | /** |
544 | * tick_nohz_idle_exit - restart the idle tick from the idle task | 605 | * tick_nohz_idle_exit - restart the idle tick from the idle task |
545 | * | 606 | * |
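On the exit side the same treatment extracts tick_nohz_restart_sched_tick() and tick_nohz_account_idle_ticks(); the accounting helper charges the jiffies that elapsed while the tick was stopped to idle, guarding against a wrapped or absurd delta. A one-function sketch of that guard, where account_idle() and account_idle_ticks_model() are invented stand-ins:

#include <stdio.h>
#include <limits.h>

static unsigned long idle_accounted;

static void account_idle(unsigned long ticks) { idle_accounted += ticks; }

/* Mirrors tick_nohz_account_idle_ticks(): charge the slept jiffies to idle,
 * but never a wrapped/absurd value. */
static void account_idle_ticks_model(unsigned long jiffies_now, unsigned long idle_jiffies)
{
    unsigned long ticks = jiffies_now - idle_jiffies;

    if (ticks && ticks < LONG_MAX)
        account_idle(ticks);
}

int main(void)
{
    account_idle_ticks_model(1050, 1000);   /* 50 ticks of idle */
    account_idle_ticks_model(1000, 1000);   /* zero delta is ignored */
    printf("idle ticks accounted: %lu\n", idle_accounted);
    return 0;
}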
@@ -551,9 +612,6 @@ void tick_nohz_idle_exit(void) | |||
551 | { | 612 | { |
552 | int cpu = smp_processor_id(); | 613 | int cpu = smp_processor_id(); |
553 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 614 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
554 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
555 | unsigned long ticks; | ||
556 | #endif | ||
557 | ktime_t now; | 615 | ktime_t now; |
558 | 616 | ||
559 | local_irq_disable(); | 617 | local_irq_disable(); |
@@ -568,39 +626,11 @@ void tick_nohz_idle_exit(void) | |||
568 | if (ts->idle_active) | 626 | if (ts->idle_active) |
569 | tick_nohz_stop_idle(cpu, now); | 627 | tick_nohz_stop_idle(cpu, now); |
570 | 628 | ||
571 | if (!ts->tick_stopped) { | 629 | if (ts->tick_stopped) { |
572 | local_irq_enable(); | 630 | tick_nohz_restart_sched_tick(ts, now); |
573 | return; | 631 | tick_nohz_account_idle_ticks(ts); |
574 | } | 632 | } |
575 | 633 | ||
576 | /* Update jiffies first */ | ||
577 | select_nohz_load_balancer(0); | ||
578 | tick_do_update_jiffies64(now); | ||
579 | update_cpu_load_nohz(); | ||
580 | |||
581 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
582 | /* | ||
583 | * We stopped the tick in idle. Update process times would miss the | ||
584 | * time we slept as update_process_times does only a 1 tick | ||
585 | * accounting. Enforce that this is accounted to idle ! | ||
586 | */ | ||
587 | ticks = jiffies - ts->idle_jiffies; | ||
588 | /* | ||
589 | * We might be one off. Do not randomly account a huge number of ticks! | ||
590 | */ | ||
591 | if (ticks && ticks < LONG_MAX) | ||
592 | account_idle_ticks(ticks); | ||
593 | #endif | ||
594 | |||
595 | touch_softlockup_watchdog(); | ||
596 | /* | ||
597 | * Cancel the scheduled timer and restore the tick | ||
598 | */ | ||
599 | ts->tick_stopped = 0; | ||
600 | ts->idle_exittime = now; | ||
601 | |||
602 | tick_nohz_restart(ts, now); | ||
603 | |||
604 | local_irq_enable(); | 634 | local_irq_enable(); |
605 | } | 635 | } |
606 | 636 | ||
@@ -804,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
804 | */ | 834 | */ |
805 | if (ts->tick_stopped) { | 835 | if (ts->tick_stopped) { |
806 | touch_softlockup_watchdog(); | 836 | touch_softlockup_watchdog(); |
807 | ts->idle_jiffies++; | 837 | if (idle_cpu(cpu)) |
838 | ts->idle_jiffies++; | ||
808 | } | 839 | } |
809 | update_process_times(user_mode(regs)); | 840 | update_process_times(user_mode(regs)); |
810 | profile_tick(CPU_PROFILING); | 841 | profile_tick(CPU_PROFILING); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8a..e16af197a2bc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -24,32 +24,32 @@ | |||
24 | /* Structure holding internal timekeeping values. */ | 24 | /* Structure holding internal timekeeping values. */ |
25 | struct timekeeper { | 25 | struct timekeeper { |
26 | /* Current clocksource used for timekeeping. */ | 26 | /* Current clocksource used for timekeeping. */ |
27 | struct clocksource *clock; | 27 | struct clocksource *clock; |
28 | /* NTP adjusted clock multiplier */ | 28 | /* NTP adjusted clock multiplier */ |
29 | u32 mult; | 29 | u32 mult; |
30 | /* The shift value of the current clocksource. */ | 30 | /* The shift value of the current clocksource. */ |
31 | int shift; | 31 | u32 shift; |
32 | |||
33 | /* Number of clock cycles in one NTP interval. */ | 32 | /* Number of clock cycles in one NTP interval. */ |
34 | cycle_t cycle_interval; | 33 | cycle_t cycle_interval; |
35 | /* Number of clock shifted nano seconds in one NTP interval. */ | 34 | /* Number of clock shifted nano seconds in one NTP interval. */ |
36 | u64 xtime_interval; | 35 | u64 xtime_interval; |
37 | /* shifted nano seconds left over when rounding cycle_interval */ | 36 | /* shifted nano seconds left over when rounding cycle_interval */ |
38 | s64 xtime_remainder; | 37 | s64 xtime_remainder; |
39 | /* Raw nano seconds accumulated per NTP interval. */ | 38 | /* Raw nano seconds accumulated per NTP interval. */ |
40 | u32 raw_interval; | 39 | u32 raw_interval; |
40 | |||
41 | /* Current CLOCK_REALTIME time in seconds */ | ||
42 | u64 xtime_sec; | ||
43 | /* Clock shifted nano seconds */ | ||
44 | u64 xtime_nsec; | ||
41 | 45 | ||
42 | /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ | ||
43 | u64 xtime_nsec; | ||
44 | /* Difference between accumulated time and NTP time in ntp | 46 | /* Difference between accumulated time and NTP time in ntp |
45 | * shifted nano seconds. */ | 47 | * shifted nano seconds. */ |
46 | s64 ntp_error; | 48 | s64 ntp_error; |
47 | /* Shift conversion between clock shifted nano seconds and | 49 | /* Shift conversion between clock shifted nano seconds and |
48 | * ntp shifted nano seconds. */ | 50 | * ntp shifted nano seconds. */ |
49 | int ntp_error_shift; | 51 | u32 ntp_error_shift; |
50 | 52 | ||
51 | /* The current time */ | ||
52 | struct timespec xtime; | ||
53 | /* | 53 | /* |
54 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | 54 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected |
55 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | 55 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged |
@@ -64,14 +64,17 @@ struct timekeeper { | |||
64 | * - wall_to_monotonic is no longer the boot time, getboottime must be | 64 | * - wall_to_monotonic is no longer the boot time, getboottime must be |
65 | * used instead. | 65 | * used instead. |
66 | */ | 66 | */ |
67 | struct timespec wall_to_monotonic; | 67 | struct timespec wall_to_monotonic; |
68 | /* Offset clock monotonic -> clock realtime */ | ||
69 | ktime_t offs_real; | ||
68 | /* time spent in suspend */ | 70 | /* time spent in suspend */ |
69 | struct timespec total_sleep_time; | 71 | struct timespec total_sleep_time; |
72 | /* Offset clock monotonic -> clock boottime */ | ||
73 | ktime_t offs_boot; | ||
70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | 74 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ |
71 | struct timespec raw_time; | 75 | struct timespec raw_time; |
72 | |||
73 | /* Seqlock for all timekeeper values */ | 76 | /* Seqlock for all timekeeper values */ |
74 | seqlock_t lock; | 77 | seqlock_t lock; |
75 | }; | 78 | }; |
76 | 79 | ||
77 | static struct timekeeper timekeeper; | 80 | static struct timekeeper timekeeper; |
@@ -82,11 +85,62 @@ static struct timekeeper timekeeper; | |||
82 | */ | 85 | */ |
83 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | 86 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); |
84 | 87 | ||
85 | |||
86 | /* flag for if timekeeping is suspended */ | 88 | /* flag for if timekeeping is suspended */ |
87 | int __read_mostly timekeeping_suspended; | 89 | int __read_mostly timekeeping_suspended; |
88 | 90 | ||
91 | static inline void tk_normalize_xtime(struct timekeeper *tk) | ||
92 | { | ||
93 | while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { | ||
94 | tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; | ||
95 | tk->xtime_sec++; | ||
96 | } | ||
97 | } | ||
98 | |||
99 | static struct timespec tk_xtime(struct timekeeper *tk) | ||
100 | { | ||
101 | struct timespec ts; | ||
102 | |||
103 | ts.tv_sec = tk->xtime_sec; | ||
104 | ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); | ||
105 | return ts; | ||
106 | } | ||
89 | 107 | ||
108 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) | ||
109 | { | ||
110 | tk->xtime_sec = ts->tv_sec; | ||
111 | tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; | ||
112 | } | ||
113 | |||
114 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) | ||
115 | { | ||
116 | tk->xtime_sec += ts->tv_sec; | ||
117 | tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; | ||
118 | } | ||
119 | |||
120 | static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) | ||
121 | { | ||
122 | struct timespec tmp; | ||
123 | |||
124 | /* | ||
125 | * Verify consistency of: offset_real = -wall_to_monotonic | ||
126 | * before modifying anything | ||
127 | */ | ||
128 | set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, | ||
129 | -tk->wall_to_monotonic.tv_nsec); | ||
130 | WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); | ||
131 | tk->wall_to_monotonic = wtm; | ||
132 | set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); | ||
133 | tk->offs_real = timespec_to_ktime(tmp); | ||
134 | } | ||
135 | |||
136 | static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) | ||
137 | { | ||
138 | /* Verify consistency before modifying */ | ||
139 | WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); | ||
140 | |||
141 | tk->total_sleep_time = t; | ||
142 | tk->offs_boot = timespec_to_ktime(t); | ||
143 | } | ||
90 | 144 | ||
91 | /** | 145 | /** |
92 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 146 | * timekeeper_setup_internals - Set up internals to use clocksource clock. |
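The timekeeping rework replaces struct timespec xtime with a seconds counter plus a clocksource-shifted nanosecond accumulator (xtime_sec, xtime_nsec), so sub-nanosecond remainders survive between updates; tk_normalize_xtime() carries full seconds over and tk_xtime() converts back to a timespec. Below is a self-contained model of that representation; the shift value and the type and function names are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct tk_model {
    uint64_t xtime_sec;   /* whole seconds */
    uint64_t xtime_nsec;  /* nanoseconds << shift, keeps fractional ns */
    uint32_t shift;
};

static void tk_normalize(struct tk_model *tk)
{
    while (tk->xtime_nsec >= (NSEC_PER_SEC << tk->shift)) {
        tk->xtime_nsec -= NSEC_PER_SEC << tk->shift;
        tk->xtime_sec++;
    }
}

static void tk_print(const struct tk_model *tk)
{
    printf("%llu.%09llu\n",
           (unsigned long long)tk->xtime_sec,
           (unsigned long long)(tk->xtime_nsec >> tk->shift));
}

int main(void)
{
    struct tk_model tk = { .xtime_sec = 100, .shift = 8 };

    /* add 1.5 s expressed in shifted nanoseconds */
    tk.xtime_nsec += (uint64_t)1500000000 << tk.shift;
    tk_normalize(&tk);
    tk_print(&tk);    /* prints 101.500000000 */
    return 0;
}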
@@ -98,12 +152,14 @@ int __read_mostly timekeeping_suspended; | |||
98 | * | 152 | * |
99 | * Unless you're the timekeeping code, you should not be using this! | 153 | * Unless you're the timekeeping code, you should not be using this! |
100 | */ | 154 | */ |
101 | static void timekeeper_setup_internals(struct clocksource *clock) | 155 | static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) |
102 | { | 156 | { |
103 | cycle_t interval; | 157 | cycle_t interval; |
104 | u64 tmp, ntpinterval; | 158 | u64 tmp, ntpinterval; |
159 | struct clocksource *old_clock; | ||
105 | 160 | ||
106 | timekeeper.clock = clock; | 161 | old_clock = tk->clock; |
162 | tk->clock = clock; | ||
107 | clock->cycle_last = clock->read(clock); | 163 | clock->cycle_last = clock->read(clock); |
108 | 164 | ||
109 | /* Do the ns -> cycle conversion first, using original mult */ | 165 | /* Do the ns -> cycle conversion first, using original mult */ |
@@ -116,74 +172,89 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
116 | tmp = 1; | 172 | tmp = 1; |
117 | 173 | ||
118 | interval = (cycle_t) tmp; | 174 | interval = (cycle_t) tmp; |
119 | timekeeper.cycle_interval = interval; | 175 | tk->cycle_interval = interval; |
120 | 176 | ||
121 | /* Go back from cycles -> shifted ns */ | 177 | /* Go back from cycles -> shifted ns */ |
122 | timekeeper.xtime_interval = (u64) interval * clock->mult; | 178 | tk->xtime_interval = (u64) interval * clock->mult; |
123 | timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; | 179 | tk->xtime_remainder = ntpinterval - tk->xtime_interval; |
124 | timekeeper.raw_interval = | 180 | tk->raw_interval = |
125 | ((u64) interval * clock->mult) >> clock->shift; | 181 | ((u64) interval * clock->mult) >> clock->shift; |
126 | 182 | ||
127 | timekeeper.xtime_nsec = 0; | 183 | /* if changing clocks, convert xtime_nsec shift units */ |
128 | timekeeper.shift = clock->shift; | 184 | if (old_clock) { |
185 | int shift_change = clock->shift - old_clock->shift; | ||
186 | if (shift_change < 0) | ||
187 | tk->xtime_nsec >>= -shift_change; | ||
188 | else | ||
189 | tk->xtime_nsec <<= shift_change; | ||
190 | } | ||
191 | tk->shift = clock->shift; | ||
129 | 192 | ||
130 | timekeeper.ntp_error = 0; | 193 | tk->ntp_error = 0; |
131 | timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 194 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; |
132 | 195 | ||
133 | /* | 196 | /* |
134 | * The timekeeper keeps its own mult values for the currently | 197 | * The timekeeper keeps its own mult values for the currently |
135 | * active clocksource. These values will be adjusted via NTP | 198 | * active clocksource. These values will be adjusted via NTP |
136 | * to counteract clock drifting. | 199 | * to counteract clock drifting. |
137 | */ | 200 | */ |
138 | timekeeper.mult = clock->mult; | 201 | tk->mult = clock->mult; |
139 | } | 202 | } |
140 | 203 | ||
141 | /* Timekeeper helper functions. */ | 204 | /* Timekeeper helper functions. */ |
142 | static inline s64 timekeeping_get_ns(void) | 205 | static inline s64 timekeeping_get_ns(struct timekeeper *tk) |
143 | { | 206 | { |
144 | cycle_t cycle_now, cycle_delta; | 207 | cycle_t cycle_now, cycle_delta; |
145 | struct clocksource *clock; | 208 | struct clocksource *clock; |
209 | s64 nsec; | ||
146 | 210 | ||
147 | /* read clocksource: */ | 211 | /* read clocksource: */ |
148 | clock = timekeeper.clock; | 212 | clock = tk->clock; |
149 | cycle_now = clock->read(clock); | 213 | cycle_now = clock->read(clock); |
150 | 214 | ||
151 | /* calculate the delta since the last update_wall_time: */ | 215 | /* calculate the delta since the last update_wall_time: */ |
152 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 216 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
153 | 217 | ||
154 | /* return delta convert to nanoseconds using ntp adjusted mult. */ | 218 | nsec = cycle_delta * tk->mult + tk->xtime_nsec; |
155 | return clocksource_cyc2ns(cycle_delta, timekeeper.mult, | 219 | nsec >>= tk->shift; |
156 | timekeeper.shift); | 220 | |
221 | /* If arch requires, add in gettimeoffset() */ | ||
222 | return nsec + arch_gettimeoffset(); | ||
157 | } | 223 | } |
158 | 224 | ||
159 | static inline s64 timekeeping_get_ns_raw(void) | 225 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) |
160 | { | 226 | { |
161 | cycle_t cycle_now, cycle_delta; | 227 | cycle_t cycle_now, cycle_delta; |
162 | struct clocksource *clock; | 228 | struct clocksource *clock; |
229 | s64 nsec; | ||
163 | 230 | ||
164 | /* read clocksource: */ | 231 | /* read clocksource: */ |
165 | clock = timekeeper.clock; | 232 | clock = tk->clock; |
166 | cycle_now = clock->read(clock); | 233 | cycle_now = clock->read(clock); |
167 | 234 | ||
168 | /* calculate the delta since the last update_wall_time: */ | 235 | /* calculate the delta since the last update_wall_time: */ |
169 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 236 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
170 | 237 | ||
171 | /* return delta convert to nanoseconds. */ | 238 | /* convert delta to nanoseconds. */ |
172 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 239 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
240 | |||
241 | /* If arch requires, add in gettimeoffset() */ | ||
242 | return nsec + arch_gettimeoffset(); | ||
173 | } | 243 | } |
174 | 244 | ||
175 | /* must hold write on timekeeper.lock */ | 245 | /* must hold write on timekeeper.lock */ |
176 | static void timekeeping_update(bool clearntp) | 246 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) |
177 | { | 247 | { |
248 | struct timespec xt; | ||
249 | |||
178 | if (clearntp) { | 250 | if (clearntp) { |
179 | timekeeper.ntp_error = 0; | 251 | tk->ntp_error = 0; |
180 | ntp_clear(); | 252 | ntp_clear(); |
181 | } | 253 | } |
182 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, | 254 | xt = tk_xtime(tk); |
183 | timekeeper.clock, timekeeper.mult); | 255 | update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); |
184 | } | 256 | } |
185 | 257 | ||
186 | |||
187 | /** | 258 | /** |
188 | * timekeeping_forward_now - update clock to the current time | 259 | * timekeeping_forward_now - update clock to the current time |
189 | * | 260 | * |
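With the shifted accumulator in place, timekeeping_get_ns() folds the not-yet-accumulated clocksource delta straight into the same fixed-point domain: nsec = (cycle_delta * mult + xtime_nsec) >> shift. A tiny sketch of that conversion, with made-up mult/shift values and an invented function name:

#include <stdio.h>
#include <stdint.h>

/* (cycles * mult) >> shift is the usual clocksource cycles->ns conversion;
 * adding xtime_nsec before the shift keeps the stored fractional remainder. */
static uint64_t get_ns_model(uint64_t cycle_delta, uint32_t mult, uint32_t shift,
                             uint64_t xtime_nsec)
{
    return (cycle_delta * mult + xtime_nsec) >> shift;
}

int main(void)
{
    /* Pretend 1 cycle == 10 ns, encoded as mult = 10 << 8, shift = 8. */
    uint32_t shift = 8, mult = 10u << shift;
    uint64_t xtime_nsec = 128;               /* 0.5 ns of stored remainder */

    printf("%llu ns\n",
           (unsigned long long)get_ns_model(3, mult, shift, xtime_nsec));
    return 0;
}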
@@ -191,27 +262,26 @@ static void timekeeping_update(bool clearntp) | |||
191 | * update_wall_time(). This is useful before significant clock changes, | 262 | * update_wall_time(). This is useful before significant clock changes, |
192 | * as it avoids having to deal with this time offset explicitly. | 263 | * as it avoids having to deal with this time offset explicitly. |
193 | */ | 264 | */ |
194 | static void timekeeping_forward_now(void) | 265 | static void timekeeping_forward_now(struct timekeeper *tk) |
195 | { | 266 | { |
196 | cycle_t cycle_now, cycle_delta; | 267 | cycle_t cycle_now, cycle_delta; |
197 | struct clocksource *clock; | 268 | struct clocksource *clock; |
198 | s64 nsec; | 269 | s64 nsec; |
199 | 270 | ||
200 | clock = timekeeper.clock; | 271 | clock = tk->clock; |
201 | cycle_now = clock->read(clock); | 272 | cycle_now = clock->read(clock); |
202 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 273 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
203 | clock->cycle_last = cycle_now; | 274 | clock->cycle_last = cycle_now; |
204 | 275 | ||
205 | nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, | 276 | tk->xtime_nsec += cycle_delta * tk->mult; |
206 | timekeeper.shift); | ||
207 | 277 | ||
208 | /* If arch requires, add in gettimeoffset() */ | 278 | /* If arch requires, add in gettimeoffset() */ |
209 | nsec += arch_gettimeoffset(); | 279 | tk->xtime_nsec += arch_gettimeoffset() << tk->shift; |
210 | 280 | ||
211 | timespec_add_ns(&timekeeper.xtime, nsec); | 281 | tk_normalize_xtime(tk); |
212 | 282 | ||
213 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 283 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
214 | timespec_add_ns(&timekeeper.raw_time, nsec); | 284 | timespec_add_ns(&tk->raw_time, nsec); |
215 | } | 285 | } |
216 | 286 | ||
217 | /** | 287 | /** |
@@ -222,21 +292,19 @@ static void timekeeping_forward_now(void) | |||
222 | */ | 292 | */ |
223 | void getnstimeofday(struct timespec *ts) | 293 | void getnstimeofday(struct timespec *ts) |
224 | { | 294 | { |
295 | struct timekeeper *tk = &timekeeper; | ||
225 | unsigned long seq; | 296 | unsigned long seq; |
226 | s64 nsecs; | 297 | s64 nsecs = 0; |
227 | 298 | ||
228 | WARN_ON(timekeeping_suspended); | 299 | WARN_ON(timekeeping_suspended); |
229 | 300 | ||
230 | do { | 301 | do { |
231 | seq = read_seqbegin(&timekeeper.lock); | 302 | seq = read_seqbegin(&tk->lock); |
232 | |||
233 | *ts = timekeeper.xtime; | ||
234 | nsecs = timekeeping_get_ns(); | ||
235 | 303 | ||
236 | /* If arch requires, add in gettimeoffset() */ | 304 | ts->tv_sec = tk->xtime_sec; |
237 | nsecs += arch_gettimeoffset(); | 305 | ts->tv_nsec = timekeeping_get_ns(tk); |
238 | 306 | ||
239 | } while (read_seqretry(&timekeeper.lock, seq)); | 307 | } while (read_seqretry(&tk->lock, seq)); |
240 | 308 | ||
241 | timespec_add_ns(ts, nsecs); | 309 | timespec_add_ns(ts, nsecs); |
242 | } | 310 | } |
@@ -244,22 +312,18 @@ EXPORT_SYMBOL(getnstimeofday); | |||
244 | 312 | ||
245 | ktime_t ktime_get(void) | 313 | ktime_t ktime_get(void) |
246 | { | 314 | { |
315 | struct timekeeper *tk = &timekeeper; | ||
247 | unsigned int seq; | 316 | unsigned int seq; |
248 | s64 secs, nsecs; | 317 | s64 secs, nsecs; |
249 | 318 | ||
250 | WARN_ON(timekeeping_suspended); | 319 | WARN_ON(timekeeping_suspended); |
251 | 320 | ||
252 | do { | 321 | do { |
253 | seq = read_seqbegin(&timekeeper.lock); | 322 | seq = read_seqbegin(&tk->lock); |
254 | secs = timekeeper.xtime.tv_sec + | 323 | secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; |
255 | timekeeper.wall_to_monotonic.tv_sec; | 324 | nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; |
256 | nsecs = timekeeper.xtime.tv_nsec + | 325 | |
257 | timekeeper.wall_to_monotonic.tv_nsec; | 326 | } while (read_seqretry(&tk->lock, seq)); |
258 | nsecs += timekeeping_get_ns(); | ||
259 | /* If arch requires, add in gettimeoffset() */ | ||
260 | nsecs += arch_gettimeoffset(); | ||
261 | |||
262 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
263 | /* | 327 | /* |
264 | * Use ktime_set/ktime_add_ns to create a proper ktime on | 328 | * Use ktime_set/ktime_add_ns to create a proper ktime on |
265 | * 32-bit architectures without CONFIG_KTIME_SCALAR. | 329 | * 32-bit architectures without CONFIG_KTIME_SCALAR. |
@@ -278,24 +342,22 @@ EXPORT_SYMBOL_GPL(ktime_get); | |||
278 | */ | 342 | */ |
279 | void ktime_get_ts(struct timespec *ts) | 343 | void ktime_get_ts(struct timespec *ts) |
280 | { | 344 | { |
345 | struct timekeeper *tk = &timekeeper; | ||
281 | struct timespec tomono; | 346 | struct timespec tomono; |
282 | unsigned int seq; | 347 | unsigned int seq; |
283 | s64 nsecs; | ||
284 | 348 | ||
285 | WARN_ON(timekeeping_suspended); | 349 | WARN_ON(timekeeping_suspended); |
286 | 350 | ||
287 | do { | 351 | do { |
288 | seq = read_seqbegin(&timekeeper.lock); | 352 | seq = read_seqbegin(&tk->lock); |
289 | *ts = timekeeper.xtime; | 353 | ts->tv_sec = tk->xtime_sec; |
290 | tomono = timekeeper.wall_to_monotonic; | 354 | ts->tv_nsec = timekeeping_get_ns(tk); |
291 | nsecs = timekeeping_get_ns(); | 355 | tomono = tk->wall_to_monotonic; |
292 | /* If arch requires, add in gettimeoffset() */ | ||
293 | nsecs += arch_gettimeoffset(); | ||
294 | 356 | ||
295 | } while (read_seqretry(&timekeeper.lock, seq)); | 357 | } while (read_seqretry(&tk->lock, seq)); |
296 | 358 | ||
297 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | 359 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, |
298 | ts->tv_nsec + tomono.tv_nsec + nsecs); | 360 | ts->tv_nsec + tomono.tv_nsec); |
299 | } | 361 | } |
300 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 362 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
301 | 363 | ||
@@ -312,28 +374,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); | |||
312 | */ | 374 | */ |
313 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | 375 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) |
314 | { | 376 | { |
377 | struct timekeeper *tk = &timekeeper; | ||
315 | unsigned long seq; | 378 | unsigned long seq; |
316 | s64 nsecs_raw, nsecs_real; | 379 | s64 nsecs_raw, nsecs_real; |
317 | 380 | ||
318 | WARN_ON_ONCE(timekeeping_suspended); | 381 | WARN_ON_ONCE(timekeeping_suspended); |
319 | 382 | ||
320 | do { | 383 | do { |
321 | u32 arch_offset; | 384 | seq = read_seqbegin(&tk->lock); |
322 | 385 | ||
323 | seq = read_seqbegin(&timekeeper.lock); | 386 | *ts_raw = tk->raw_time; |
387 | ts_real->tv_sec = tk->xtime_sec; | ||
388 | ts_real->tv_nsec = 0; | ||
324 | 389 | ||
325 | *ts_raw = timekeeper.raw_time; | 390 | nsecs_raw = timekeeping_get_ns_raw(tk); |
326 | *ts_real = timekeeper.xtime; | 391 | nsecs_real = timekeeping_get_ns(tk); |
327 | 392 | ||
328 | nsecs_raw = timekeeping_get_ns_raw(); | 393 | } while (read_seqretry(&tk->lock, seq)); |
329 | nsecs_real = timekeeping_get_ns(); | ||
330 | |||
331 | /* If arch requires, add in gettimeoffset() */ | ||
332 | arch_offset = arch_gettimeoffset(); | ||
333 | nsecs_raw += arch_offset; | ||
334 | nsecs_real += arch_offset; | ||
335 | |||
336 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
337 | 394 | ||
338 | timespec_add_ns(ts_raw, nsecs_raw); | 395 | timespec_add_ns(ts_raw, nsecs_raw); |
339 | timespec_add_ns(ts_real, nsecs_real); | 396 | timespec_add_ns(ts_real, nsecs_real); |
@@ -366,25 +423,28 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
366 | */ | 423 | */ |
367 | int do_settimeofday(const struct timespec *tv) | 424 | int do_settimeofday(const struct timespec *tv) |
368 | { | 425 | { |
369 | struct timespec ts_delta; | 426 | struct timekeeper *tk = &timekeeper; |
427 | struct timespec ts_delta, xt; | ||
370 | unsigned long flags; | 428 | unsigned long flags; |
371 | 429 | ||
372 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | 430 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) |
373 | return -EINVAL; | 431 | return -EINVAL; |
374 | 432 | ||
375 | write_seqlock_irqsave(&timekeeper.lock, flags); | 433 | write_seqlock_irqsave(&tk->lock, flags); |
434 | |||
435 | timekeeping_forward_now(tk); | ||
376 | 436 | ||
377 | timekeeping_forward_now(); | 437 | xt = tk_xtime(tk); |
438 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; | ||
439 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; | ||
378 | 440 | ||
379 | ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; | 441 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); |
380 | ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; | ||
381 | timekeeper.wall_to_monotonic = | ||
382 | timespec_sub(timekeeper.wall_to_monotonic, ts_delta); | ||
383 | 442 | ||
384 | timekeeper.xtime = *tv; | 443 | tk_set_xtime(tk, tv); |
385 | timekeeping_update(true); | ||
386 | 444 | ||
387 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 445 | timekeeping_update(tk, true); |
446 | |||
447 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
388 | 448 | ||
389 | /* signal hrtimers about time change */ | 449 | /* signal hrtimers about time change */ |
390 | clock_was_set(); | 450 | clock_was_set(); |
@@ -393,7 +453,6 @@ int do_settimeofday(const struct timespec *tv) | |||
393 | } | 453 | } |
394 | EXPORT_SYMBOL(do_settimeofday); | 454 | EXPORT_SYMBOL(do_settimeofday); |
395 | 455 | ||
396 | |||
397 | /** | 456 | /** |
398 | * timekeeping_inject_offset - Adds or subtracts from the current time. | 457 | * timekeeping_inject_offset - Adds or subtracts from the current time. |
399 | * @tv: pointer to the timespec variable containing the offset | 458 | * @tv: pointer to the timespec variable containing the offset |
@@ -402,22 +461,23 @@ EXPORT_SYMBOL(do_settimeofday); | |||
402 | */ | 461 | */ |
403 | int timekeeping_inject_offset(struct timespec *ts) | 462 | int timekeeping_inject_offset(struct timespec *ts) |
404 | { | 463 | { |
464 | struct timekeeper *tk = &timekeeper; | ||
405 | unsigned long flags; | 465 | unsigned long flags; |
406 | 466 | ||
407 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | 467 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) |
408 | return -EINVAL; | 468 | return -EINVAL; |
409 | 469 | ||
410 | write_seqlock_irqsave(&timekeeper.lock, flags); | 470 | write_seqlock_irqsave(&tk->lock, flags); |
411 | 471 | ||
412 | timekeeping_forward_now(); | 472 | timekeeping_forward_now(tk); |
413 | 473 | ||
414 | timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); | ||
415 | timekeeper.wall_to_monotonic = | ||
416 | timespec_sub(timekeeper.wall_to_monotonic, *ts); | ||
417 | 474 | ||
418 | timekeeping_update(true); | 475 | tk_xtime_add(tk, ts); |
476 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); | ||
419 | 477 | ||
420 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 478 | timekeeping_update(tk, true); |
479 | |||
480 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
421 | 481 | ||
422 | /* signal hrtimers about time change */ | 482 | /* signal hrtimers about time change */ |
423 | clock_was_set(); | 483 | clock_was_set(); |
@@ -433,23 +493,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset); | |||
433 | */ | 493 | */ |
434 | static int change_clocksource(void *data) | 494 | static int change_clocksource(void *data) |
435 | { | 495 | { |
496 | struct timekeeper *tk = &timekeeper; | ||
436 | struct clocksource *new, *old; | 497 | struct clocksource *new, *old; |
437 | unsigned long flags; | 498 | unsigned long flags; |
438 | 499 | ||
439 | new = (struct clocksource *) data; | 500 | new = (struct clocksource *) data; |
440 | 501 | ||
441 | write_seqlock_irqsave(&timekeeper.lock, flags); | 502 | write_seqlock_irqsave(&tk->lock, flags); |
442 | 503 | ||
443 | timekeeping_forward_now(); | 504 | timekeeping_forward_now(tk); |
444 | if (!new->enable || new->enable(new) == 0) { | 505 | if (!new->enable || new->enable(new) == 0) { |
445 | old = timekeeper.clock; | 506 | old = tk->clock; |
446 | timekeeper_setup_internals(new); | 507 | tk_setup_internals(tk, new); |
447 | if (old->disable) | 508 | if (old->disable) |
448 | old->disable(old); | 509 | old->disable(old); |
449 | } | 510 | } |
450 | timekeeping_update(true); | 511 | timekeeping_update(tk, true); |
451 | 512 | ||
452 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 513 | write_sequnlock_irqrestore(&tk->lock, flags); |
453 | 514 | ||
454 | return 0; | 515 | return 0; |
455 | } | 516 | } |
@@ -463,7 +524,9 @@ static int change_clocksource(void *data) | |||
463 | */ | 524 | */ |
464 | void timekeeping_notify(struct clocksource *clock) | 525 | void timekeeping_notify(struct clocksource *clock) |
465 | { | 526 | { |
466 | if (timekeeper.clock == clock) | 527 | struct timekeeper *tk = &timekeeper; |
528 | |||
529 | if (tk->clock == clock) | ||
467 | return; | 530 | return; |
468 | stop_machine(change_clocksource, clock, NULL); | 531 | stop_machine(change_clocksource, clock, NULL); |
469 | tick_clock_notify(); | 532 | tick_clock_notify(); |
@@ -492,35 +555,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real); | |||
492 | */ | 555 | */ |
493 | void getrawmonotonic(struct timespec *ts) | 556 | void getrawmonotonic(struct timespec *ts) |
494 | { | 557 | { |
558 | struct timekeeper *tk = &timekeeper; | ||
495 | unsigned long seq; | 559 | unsigned long seq; |
496 | s64 nsecs; | 560 | s64 nsecs; |
497 | 561 | ||
498 | do { | 562 | do { |
499 | seq = read_seqbegin(&timekeeper.lock); | 563 | seq = read_seqbegin(&tk->lock); |
500 | nsecs = timekeeping_get_ns_raw(); | 564 | nsecs = timekeeping_get_ns_raw(tk); |
501 | *ts = timekeeper.raw_time; | 565 | *ts = tk->raw_time; |
502 | 566 | ||
503 | } while (read_seqretry(&timekeeper.lock, seq)); | 567 | } while (read_seqretry(&tk->lock, seq)); |
504 | 568 | ||
505 | timespec_add_ns(ts, nsecs); | 569 | timespec_add_ns(ts, nsecs); |
506 | } | 570 | } |
507 | EXPORT_SYMBOL(getrawmonotonic); | 571 | EXPORT_SYMBOL(getrawmonotonic); |
508 | 572 | ||
509 | |||
510 | /** | 573 | /** |
511 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres | 574 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres |
512 | */ | 575 | */ |
513 | int timekeeping_valid_for_hres(void) | 576 | int timekeeping_valid_for_hres(void) |
514 | { | 577 | { |
578 | struct timekeeper *tk = &timekeeper; | ||
515 | unsigned long seq; | 579 | unsigned long seq; |
516 | int ret; | 580 | int ret; |
517 | 581 | ||
518 | do { | 582 | do { |
519 | seq = read_seqbegin(&timekeeper.lock); | 583 | seq = read_seqbegin(&tk->lock); |
520 | 584 | ||
521 | ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 585 | ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
522 | 586 | ||
523 | } while (read_seqretry(&timekeeper.lock, seq)); | 587 | } while (read_seqretry(&tk->lock, seq)); |
524 | 588 | ||
525 | return ret; | 589 | return ret; |
526 | } | 590 | } |
@@ -530,14 +594,16 @@ int timekeeping_valid_for_hres(void) | |||
530 | */ | 594 | */ |
531 | u64 timekeeping_max_deferment(void) | 595 | u64 timekeeping_max_deferment(void) |
532 | { | 596 | { |
597 | struct timekeeper *tk = &timekeeper; | ||
533 | unsigned long seq; | 598 | unsigned long seq; |
534 | u64 ret; | 599 | u64 ret; |
600 | |||
535 | do { | 601 | do { |
536 | seq = read_seqbegin(&timekeeper.lock); | 602 | seq = read_seqbegin(&tk->lock); |
537 | 603 | ||
538 | ret = timekeeper.clock->max_idle_ns; | 604 | ret = tk->clock->max_idle_ns; |
539 | 605 | ||
540 | } while (read_seqretry(&timekeeper.lock, seq)); | 606 | } while (read_seqretry(&tk->lock, seq)); |
541 | 607 | ||
542 | return ret; | 608 | return ret; |
543 | } | 609 | } |
@@ -577,36 +643,38 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts) | |||
577 | */ | 643 | */ |
578 | void __init timekeeping_init(void) | 644 | void __init timekeeping_init(void) |
579 | { | 645 | { |
646 | struct timekeeper *tk = &timekeeper; | ||
580 | struct clocksource *clock; | 647 | struct clocksource *clock; |
581 | unsigned long flags; | 648 | unsigned long flags; |
582 | struct timespec now, boot; | 649 | struct timespec now, boot, tmp; |
583 | 650 | ||
584 | read_persistent_clock(&now); | 651 | read_persistent_clock(&now); |
585 | read_boot_clock(&boot); | 652 | read_boot_clock(&boot); |
586 | 653 | ||
587 | seqlock_init(&timekeeper.lock); | 654 | seqlock_init(&tk->lock); |
588 | 655 | ||
589 | ntp_init(); | 656 | ntp_init(); |
590 | 657 | ||
591 | write_seqlock_irqsave(&timekeeper.lock, flags); | 658 | write_seqlock_irqsave(&tk->lock, flags); |
592 | clock = clocksource_default_clock(); | 659 | clock = clocksource_default_clock(); |
593 | if (clock->enable) | 660 | if (clock->enable) |
594 | clock->enable(clock); | 661 | clock->enable(clock); |
595 | timekeeper_setup_internals(clock); | 662 | tk_setup_internals(tk, clock); |
596 | 663 | ||
597 | timekeeper.xtime.tv_sec = now.tv_sec; | 664 | tk_set_xtime(tk, &now); |
598 | timekeeper.xtime.tv_nsec = now.tv_nsec; | 665 | tk->raw_time.tv_sec = 0; |
599 | timekeeper.raw_time.tv_sec = 0; | 666 | tk->raw_time.tv_nsec = 0; |
600 | timekeeper.raw_time.tv_nsec = 0; | 667 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
601 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) { | 668 | boot = tk_xtime(tk); |
602 | boot.tv_sec = timekeeper.xtime.tv_sec; | 669 | |
603 | boot.tv_nsec = timekeeper.xtime.tv_nsec; | 670 | set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); |
604 | } | 671 | tk_set_wall_to_mono(tk, tmp); |
605 | set_normalized_timespec(&timekeeper.wall_to_monotonic, | 672 | |
606 | -boot.tv_sec, -boot.tv_nsec); | 673 | tmp.tv_sec = 0; |
607 | timekeeper.total_sleep_time.tv_sec = 0; | 674 | tmp.tv_nsec = 0; |
608 | timekeeper.total_sleep_time.tv_nsec = 0; | 675 | tk_set_sleep_time(tk, tmp); |
609 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 676 | |
677 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
610 | } | 678 | } |
611 | 679 | ||
612 | /* time in seconds when suspend began */ | 680 | /* time in seconds when suspend began */ |
@@ -619,22 +687,19 @@ static struct timespec timekeeping_suspend_time; | |||
619 | * Takes a timespec offset measuring a suspend interval and properly | 687 | * Takes a timespec offset measuring a suspend interval and properly |
620 | * adds the sleep offset to the timekeeping variables. | 688 | * adds the sleep offset to the timekeeping variables. |
621 | */ | 689 | */ |
622 | static void __timekeeping_inject_sleeptime(struct timespec *delta) | 690 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, |
691 | struct timespec *delta) | ||
623 | { | 692 | { |
624 | if (!timespec_valid(delta)) { | 693 | if (!timespec_valid(delta)) { |
625 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | 694 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " |
626 | "sleep delta value!\n"); | 695 | "sleep delta value!\n"); |
627 | return; | 696 | return; |
628 | } | 697 | } |
629 | 698 | tk_xtime_add(tk, delta); | |
630 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); | 699 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); |
631 | timekeeper.wall_to_monotonic = | 700 | tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); |
632 | timespec_sub(timekeeper.wall_to_monotonic, *delta); | ||
633 | timekeeper.total_sleep_time = timespec_add( | ||
634 | timekeeper.total_sleep_time, *delta); | ||
635 | } | 701 | } |
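
__timekeeping_inject_sleeptime() applies the suspend interval three ways: wall time moves forward, wall_to_monotonic moves back by the same amount so CLOCK_MONOTONIC does not jump, and total_sleep_time grows so CLOCK_BOOTTIME does. A whole-second toy model of that bookkeeping (all values invented; the kernel uses timespec_add()/timespec_sub() on timespecs):

#include <stdio.h>

int main(void)
{
	long long xtime = 1000, wall_to_mono = -900, sleep_total = 0;
	long long delta = 50;                    /* slept 50 seconds */

	xtime        += delta;                   /* wall time advances      */
	wall_to_mono -= delta;                   /* monotonic stays put     */
	sleep_total  += delta;                   /* boottime includes sleep */

	printf("monotonic = %lld\n", xtime + wall_to_mono);               /* 100 */
	printf("boottime  = %lld\n", xtime + wall_to_mono + sleep_total); /* 150 */
	return 0;
}
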
636 | 702 | ||
637 | |||
638 | /** | 703 | /** |
639 | * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values | 704 | * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values |
640 | * @delta: pointer to a timespec delta value | 705 | * @delta: pointer to a timespec delta value |
@@ -647,6 +712,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) | |||
647 | */ | 712 | */ |
648 | void timekeeping_inject_sleeptime(struct timespec *delta) | 713 | void timekeeping_inject_sleeptime(struct timespec *delta) |
649 | { | 714 | { |
715 | struct timekeeper *tk = &timekeeper; | ||
650 | unsigned long flags; | 716 | unsigned long flags; |
651 | struct timespec ts; | 717 | struct timespec ts; |
652 | 718 | ||
@@ -655,21 +721,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
655 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | 721 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) |
656 | return; | 722 | return; |
657 | 723 | ||
658 | write_seqlock_irqsave(&timekeeper.lock, flags); | 724 | write_seqlock_irqsave(&tk->lock, flags); |
659 | 725 | ||
660 | timekeeping_forward_now(); | 726 | timekeeping_forward_now(tk); |
661 | 727 | ||
662 | __timekeeping_inject_sleeptime(delta); | 728 | __timekeeping_inject_sleeptime(tk, delta); |
663 | 729 | ||
664 | timekeeping_update(true); | 730 | timekeeping_update(tk, true); |
665 | 731 | ||
666 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 732 | write_sequnlock_irqrestore(&tk->lock, flags); |
667 | 733 | ||
668 | /* signal hrtimers about time change */ | 734 | /* signal hrtimers about time change */ |
669 | clock_was_set(); | 735 | clock_was_set(); |
670 | } | 736 | } |
671 | 737 | ||
672 | |||
673 | /** | 738 | /** |
674 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 739 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
675 | * | 740 | * |
@@ -679,6 +744,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
679 | */ | 744 | */ |
680 | static void timekeeping_resume(void) | 745 | static void timekeeping_resume(void) |
681 | { | 746 | { |
747 | struct timekeeper *tk = &timekeeper; | ||
682 | unsigned long flags; | 748 | unsigned long flags; |
683 | struct timespec ts; | 749 | struct timespec ts; |
684 | 750 | ||
@@ -686,17 +752,18 @@ static void timekeeping_resume(void) | |||
686 | 752 | ||
687 | clocksource_resume(); | 753 | clocksource_resume(); |
688 | 754 | ||
689 | write_seqlock_irqsave(&timekeeper.lock, flags); | 755 | write_seqlock_irqsave(&tk->lock, flags); |
690 | 756 | ||
691 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 757 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
692 | ts = timespec_sub(ts, timekeeping_suspend_time); | 758 | ts = timespec_sub(ts, timekeeping_suspend_time); |
693 | __timekeeping_inject_sleeptime(&ts); | 759 | __timekeeping_inject_sleeptime(tk, &ts); |
694 | } | 760 | } |
695 | /* re-base the last cycle value */ | 761 | /* re-base the last cycle value */ |
696 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 762 | tk->clock->cycle_last = tk->clock->read(tk->clock); |
697 | timekeeper.ntp_error = 0; | 763 | tk->ntp_error = 0; |
698 | timekeeping_suspended = 0; | 764 | timekeeping_suspended = 0; |
699 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 765 | timekeeping_update(tk, false); |
766 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
700 | 767 | ||
701 | touch_softlockup_watchdog(); | 768 | touch_softlockup_watchdog(); |
702 | 769 | ||
@@ -708,14 +775,15 @@ static void timekeeping_resume(void) | |||
708 | 775 | ||
709 | static int timekeeping_suspend(void) | 776 | static int timekeeping_suspend(void) |
710 | { | 777 | { |
778 | struct timekeeper *tk = &timekeeper; | ||
711 | unsigned long flags; | 779 | unsigned long flags; |
712 | struct timespec delta, delta_delta; | 780 | struct timespec delta, delta_delta; |
713 | static struct timespec old_delta; | 781 | static struct timespec old_delta; |
714 | 782 | ||
715 | read_persistent_clock(&timekeeping_suspend_time); | 783 | read_persistent_clock(&timekeeping_suspend_time); |
716 | 784 | ||
717 | write_seqlock_irqsave(&timekeeper.lock, flags); | 785 | write_seqlock_irqsave(&tk->lock, flags); |
718 | timekeeping_forward_now(); | 786 | timekeeping_forward_now(tk); |
719 | timekeeping_suspended = 1; | 787 | timekeeping_suspended = 1; |
720 | 788 | ||
721 | /* | 789 | /* |
@@ -724,7 +792,7 @@ static int timekeeping_suspend(void) | |||
724 | * try to compensate so the difference in system time | 792 | * try to compensate so the difference in system time |
725 | * and persistent_clock time stays close to constant. | 793 | * and persistent_clock time stays close to constant. |
726 | */ | 794 | */ |
727 | delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); | 795 | delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); |
728 | delta_delta = timespec_sub(delta, old_delta); | 796 | delta_delta = timespec_sub(delta, old_delta); |
729 | if (abs(delta_delta.tv_sec) >= 2) { | 797 | if (abs(delta_delta.tv_sec) >= 2) { |
730 | /* | 798 | /* |
@@ -737,7 +805,7 @@ static int timekeeping_suspend(void) | |||
737 | timekeeping_suspend_time = | 805 | timekeeping_suspend_time = |
738 | timespec_add(timekeeping_suspend_time, delta_delta); | 806 | timespec_add(timekeeping_suspend_time, delta_delta); |
739 | } | 807 | } |
740 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 808 | write_sequnlock_irqrestore(&tk->lock, flags); |
741 | 809 | ||
742 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 810 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
743 | clocksource_suspend(); | 811 | clocksource_suspend(); |
@@ -763,7 +831,8 @@ device_initcall(timekeeping_init_ops); | |||
763 | * If the error is already larger, we look ahead even further | 831 | * If the error is already larger, we look ahead even further |
764 | * to compensate for late or lost adjustments. | 832 | * to compensate for late or lost adjustments. |
765 | */ | 833 | */ |
766 | static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | 834 | static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, |
835 | s64 error, s64 *interval, | ||
767 | s64 *offset) | 836 | s64 *offset) |
768 | { | 837 | { |
769 | s64 tick_error, i; | 838 | s64 tick_error, i; |
@@ -779,7 +848,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | |||
779 | * here. This is tuned so that an error of about 1 msec is adjusted | 848 | * here. This is tuned so that an error of about 1 msec is adjusted |
780 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | 849 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). |
781 | */ | 850 | */ |
782 | error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); | 851 | error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); |
783 | error2 = abs(error2); | 852 | error2 = abs(error2); |
784 | for (look_ahead = 0; error2 > 0; look_ahead++) | 853 | for (look_ahead = 0; error2 > 0; look_ahead++) |
785 | error2 >>= 2; | 854 | error2 >>= 2; |
@@ -788,8 +857,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | |||
788 | * Now calculate the error in (1 << look_ahead) ticks, but first | 857 | * Now calculate the error in (1 << look_ahead) ticks, but first |
789 | * remove the single look ahead already included in the error. | 858 | * remove the single look ahead already included in the error. |
790 | */ | 859 | */ |
791 | tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); | 860 | tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); |
792 | tick_error -= timekeeper.xtime_interval >> 1; | 861 | tick_error -= tk->xtime_interval >> 1; |
793 | error = ((error - tick_error) >> look_ahead) + tick_error; | 862 | error = ((error - tick_error) >> look_ahead) + tick_error; |
794 | 863 | ||
795 | /* Finally calculate the adjustment shift value. */ | 864 | /* Finally calculate the adjustment shift value. */ |
@@ -814,9 +883,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | |||
814 | * this is optimized for the most common adjustments of -1,0,1, | 883 | * this is optimized for the most common adjustments of -1,0,1, |
815 | * for other values we can do a bit more work. | 884 | * for other values we can do a bit more work. |
816 | */ | 885 | */ |
817 | static void timekeeping_adjust(s64 offset) | 886 | static void timekeeping_adjust(struct timekeeper *tk, s64 offset) |
818 | { | 887 | { |
819 | s64 error, interval = timekeeper.cycle_interval; | 888 | s64 error, interval = tk->cycle_interval; |
820 | int adj; | 889 | int adj; |
821 | 890 | ||
822 | /* | 891 | /* |
@@ -832,7 +901,7 @@ static void timekeeping_adjust(s64 offset) | |||
832 | * | 901 | * |
833 | * Note: It does not "save" on aggravation when reading the code. | 902 | * Note: It does not "save" on aggravation when reading the code. |
834 | */ | 903 | */ |
835 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); | 904 | error = tk->ntp_error >> (tk->ntp_error_shift - 1); |
836 | if (error > interval) { | 905 | if (error > interval) { |
837 | /* | 906 | /* |
838 | * We now divide error by 4(via shift), which checks if | 907 | * We now divide error by 4(via shift), which checks if |
@@ -847,34 +916,36 @@ static void timekeeping_adjust(s64 offset) | |||
847 | * the error. This causes the likely below to be unlikely. | 916 | * the error. This causes the likely below to be unlikely. |
848 | * | 917 | * |
849 | * The proper fix is to avoid rounding up by using | 918 | * The proper fix is to avoid rounding up by using |
850 | * the high precision timekeeper.xtime_nsec instead of | 919 | * the high precision tk->xtime_nsec instead of |
851 | * xtime.tv_nsec everywhere. Fixing this will take some | 920 | * xtime.tv_nsec everywhere. Fixing this will take some |
852 | * time. | 921 | * time. |
853 | */ | 922 | */ |
854 | if (likely(error <= interval)) | 923 | if (likely(error <= interval)) |
855 | adj = 1; | 924 | adj = 1; |
856 | else | 925 | else |
857 | adj = timekeeping_bigadjust(error, &interval, &offset); | 926 | adj = timekeeping_bigadjust(tk, error, &interval, &offset); |
858 | } else if (error < -interval) { | 927 | } else { |
859 | /* See comment above, this is just switched for the negative */ | 928 | if (error < -interval) { |
860 | error >>= 2; | 929 | /* See comment above, this is just switched for the negative */ |
861 | if (likely(error >= -interval)) { | 930 | error >>= 2; |
862 | adj = -1; | 931 | if (likely(error >= -interval)) { |
863 | interval = -interval; | 932 | adj = -1; |
864 | offset = -offset; | 933 | interval = -interval; |
865 | } else | 934 | offset = -offset; |
866 | adj = timekeeping_bigadjust(error, &interval, &offset); | 935 | } else { |
867 | } else /* No adjustment needed */ | 936 | adj = timekeeping_bigadjust(tk, error, &interval, &offset); |
868 | return; | 937 | } |
938 | } else { | ||
939 | goto out_adjust; | ||
940 | } | ||
941 | } | ||
869 | 942 | ||
870 | if (unlikely(timekeeper.clock->maxadj && | 943 | if (unlikely(tk->clock->maxadj && |
871 | (timekeeper.mult + adj > | 944 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { |
872 | timekeeper.clock->mult + timekeeper.clock->maxadj))) { | ||
873 | printk_once(KERN_WARNING | 945 | printk_once(KERN_WARNING |
874 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 946 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
875 | timekeeper.clock->name, (long)timekeeper.mult + adj, | 947 | tk->clock->name, (long)tk->mult + adj, |
876 | (long)timekeeper.clock->mult + | 948 | (long)tk->clock->mult + tk->clock->maxadj); |
877 | timekeeper.clock->maxadj); | ||
878 | } | 949 | } |
879 | /* | 950 | /* |
880 | * So the following can be confusing. | 951 | * So the following can be confusing. |
@@ -925,13 +996,68 @@ static void timekeeping_adjust(s64 offset) | |||
925 | * | 996 | * |
926 | * XXX - TODO: Doc ntp_error calculation. | 997 | * XXX - TODO: Doc ntp_error calculation. |
927 | */ | 998 | */ |
928 | timekeeper.mult += adj; | 999 | tk->mult += adj; |
929 | timekeeper.xtime_interval += interval; | 1000 | tk->xtime_interval += interval; |
930 | timekeeper.xtime_nsec -= offset; | 1001 | tk->xtime_nsec -= offset; |
931 | timekeeper.ntp_error -= (interval - offset) << | 1002 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; |
932 | timekeeper.ntp_error_shift; | 1003 | |
1004 | out_adjust: | ||
1005 | /* | ||
1006 | * It may be possible that when we entered this function, xtime_nsec | ||
1007 | * was very small. Further, if we're slightly speeding the clocksource | ||
1008 | * in the code above, its possible the required corrective factor to | ||
1009 | * xtime_nsec could cause it to underflow. | ||
1010 | * | ||
1011 | * Now, since we already accumulated the second, we cannot simply roll | ||
1012 | * the accumulated second back, since the NTP subsystem has been | ||
1013 | * notified via second_overflow. So instead we push xtime_nsec forward | ||
1014 | * by the amount we underflowed, and add that amount into the error. | ||
1015 | * | ||
1016 | * We'll correct this error next time through this function, when | ||
1017 | * xtime_nsec is not as small. | ||
1018 | */ | ||
1019 | if (unlikely((s64)tk->xtime_nsec < 0)) { | ||
1020 | s64 neg = -(s64)tk->xtime_nsec; | ||
1021 | tk->xtime_nsec = 0; | ||
1022 | tk->ntp_error += neg << tk->ntp_error_shift; | ||
1023 | } | ||
1024 | |||
933 | } | 1025 | } |
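
The out_adjust fix-up above clamps a negative shifted-nanosecond accumulator and folds the deficit into ntp_error so it is repaid on a later tick. A toy model with invented numbers (the real fields hold nanoseconds scaled by the clocksource shift):

#include <stdio.h>

int main(void)
{
	long long xtime_nsec      = 100;   /* shifted ns, very small after accumulation */
	long long ntp_error       = 0;
	int       ntp_error_shift = 4;     /* illustrative value only */
	long long correction      = 300;   /* applied by the adjustment code above */

	xtime_nsec -= correction;          /* would leave -200 */
	if (xtime_nsec < 0) {
		long long neg = -xtime_nsec;
		xtime_nsec = 0;                      /* clamp */
		ntp_error += neg << ntp_error_shift; /* repay the deficit later */
	}
	printf("xtime_nsec=%lld ntp_error=%lld\n", xtime_nsec, ntp_error);
	return 0;
}
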
934 | 1026 | ||
1027 | /** | ||
1028 | * accumulate_nsecs_to_secs - Accumulates nsecs into secs | ||
1029 | * | ||
1030 | * Helper function that accumulates the nsecs greater than a second | ||
1031 | * from the xtime_nsec field to the xtime_secs field. | ||
1032 | * It also calls into the NTP code to handle leapsecond processing. | ||
1033 | * | ||
1034 | */ | ||
1035 | static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) | ||
1036 | { | ||
1037 | u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; | ||
1038 | |||
1039 | while (tk->xtime_nsec >= nsecps) { | ||
1040 | int leap; | ||
1041 | |||
1042 | tk->xtime_nsec -= nsecps; | ||
1043 | tk->xtime_sec++; | ||
1044 | |||
1045 | /* Figure out if it's a leap sec and apply if needed */ | ||
1046 | leap = second_overflow(tk->xtime_sec); | ||
1047 | if (unlikely(leap)) { | ||
1048 | struct timespec ts; | ||
1049 | |||
1050 | tk->xtime_sec += leap; | ||
1051 | |||
1052 | ts.tv_sec = leap; | ||
1053 | ts.tv_nsec = 0; | ||
1054 | tk_set_wall_to_mono(tk, | ||
1055 | timespec_sub(tk->wall_to_monotonic, ts)); | ||
1056 | |||
1057 | clock_was_set_delayed(); | ||
1058 | } | ||
1059 | } | ||
1060 | } | ||
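
accumulate_nsecs_to_secs() works in shifted nanoseconds: xtime_nsec holds nanoseconds scaled by 2^shift, so one full second is NSEC_PER_SEC << shift. A sketch of the whole-second peel-off with the leap-second handling left out (the shift value is invented):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned int shift = 8;                        /* example clock shift */
	unsigned long long nsecps = NSEC_PER_SEC << shift;
	unsigned long long xtime_nsec = 2 * nsecps + 12345;
	unsigned long long xtime_sec = 100;

	while (xtime_nsec >= nsecps) {                 /* peel off whole seconds */
		xtime_nsec -= nsecps;
		xtime_sec++;
	}
	printf("sec=%llu remaining(shifted ns)=%llu\n", xtime_sec, xtime_nsec);
	return 0;                                      /* sec=102, remainder=12345 */
}
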
935 | 1061 | ||
936 | /** | 1062 | /** |
937 | * logarithmic_accumulation - shifted accumulation of cycles | 1063 | * logarithmic_accumulation - shifted accumulation of cycles |
@@ -942,49 +1068,40 @@ static void timekeeping_adjust(s64 offset) | |||
942 | * | 1068 | * |
943 | * Returns the unconsumed cycles. | 1069 | * Returns the unconsumed cycles. |
944 | */ | 1070 | */ |
945 | static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | 1071 | static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, |
1072 | u32 shift) | ||
946 | { | 1073 | { |
947 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; | ||
948 | u64 raw_nsecs; | 1074 | u64 raw_nsecs; |
949 | 1075 | ||
950 | /* If the offset is smaller than a shifted interval, do nothing */ | 1076 | /* If the offset is smaller than a shifted interval, do nothing */ |
951 | if (offset < timekeeper.cycle_interval<<shift) | 1077 | if (offset < tk->cycle_interval<<shift) |
952 | return offset; | 1078 | return offset; |
953 | 1079 | ||
954 | /* Accumulate one shifted interval */ | 1080 | /* Accumulate one shifted interval */ |
955 | offset -= timekeeper.cycle_interval << shift; | 1081 | offset -= tk->cycle_interval << shift; |
956 | timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; | 1082 | tk->clock->cycle_last += tk->cycle_interval << shift; |
957 | 1083 | ||
958 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; | 1084 | tk->xtime_nsec += tk->xtime_interval << shift; |
959 | while (timekeeper.xtime_nsec >= nsecps) { | 1085 | accumulate_nsecs_to_secs(tk); |
960 | int leap; | ||
961 | timekeeper.xtime_nsec -= nsecps; | ||
962 | timekeeper.xtime.tv_sec++; | ||
963 | leap = second_overflow(timekeeper.xtime.tv_sec); | ||
964 | timekeeper.xtime.tv_sec += leap; | ||
965 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
966 | } | ||
967 | 1086 | ||
968 | /* Accumulate raw time */ | 1087 | /* Accumulate raw time */ |
969 | raw_nsecs = timekeeper.raw_interval << shift; | 1088 | raw_nsecs = tk->raw_interval << shift; |
970 | raw_nsecs += timekeeper.raw_time.tv_nsec; | 1089 | raw_nsecs += tk->raw_time.tv_nsec; |
971 | if (raw_nsecs >= NSEC_PER_SEC) { | 1090 | if (raw_nsecs >= NSEC_PER_SEC) { |
972 | u64 raw_secs = raw_nsecs; | 1091 | u64 raw_secs = raw_nsecs; |
973 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); | 1092 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); |
974 | timekeeper.raw_time.tv_sec += raw_secs; | 1093 | tk->raw_time.tv_sec += raw_secs; |
975 | } | 1094 | } |
976 | timekeeper.raw_time.tv_nsec = raw_nsecs; | 1095 | tk->raw_time.tv_nsec = raw_nsecs; |
977 | 1096 | ||
978 | /* Accumulate error between NTP and clock interval */ | 1097 | /* Accumulate error between NTP and clock interval */ |
979 | timekeeper.ntp_error += ntp_tick_length() << shift; | 1098 | tk->ntp_error += ntp_tick_length() << shift; |
980 | timekeeper.ntp_error -= | 1099 | tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << |
981 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << | 1100 | (tk->ntp_error_shift + shift); |
982 | (timekeeper.ntp_error_shift + shift); | ||
983 | 1101 | ||
984 | return offset; | 1102 | return offset; |
985 | } | 1103 | } |
986 | 1104 | ||
987 | |||
988 | /** | 1105 | /** |
989 | * update_wall_time - Uses the current clocksource to increment the wall time | 1106 | * update_wall_time - Uses the current clocksource to increment the wall time |
990 | * | 1107 | * |
@@ -992,25 +1109,25 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
992 | static void update_wall_time(void) | 1109 | static void update_wall_time(void) |
993 | { | 1110 | { |
994 | struct clocksource *clock; | 1111 | struct clocksource *clock; |
1112 | struct timekeeper *tk = &timekeeper; | ||
995 | cycle_t offset; | 1113 | cycle_t offset; |
996 | int shift = 0, maxshift; | 1114 | int shift = 0, maxshift; |
997 | unsigned long flags; | 1115 | unsigned long flags; |
1116 | s64 remainder; | ||
998 | 1117 | ||
999 | write_seqlock_irqsave(&timekeeper.lock, flags); | 1118 | write_seqlock_irqsave(&tk->lock, flags); |
1000 | 1119 | ||
1001 | /* Make sure we're fully resumed: */ | 1120 | /* Make sure we're fully resumed: */ |
1002 | if (unlikely(timekeeping_suspended)) | 1121 | if (unlikely(timekeeping_suspended)) |
1003 | goto out; | 1122 | goto out; |
1004 | 1123 | ||
1005 | clock = timekeeper.clock; | 1124 | clock = tk->clock; |
1006 | 1125 | ||
1007 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1126 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
1008 | offset = timekeeper.cycle_interval; | 1127 | offset = tk->cycle_interval; |
1009 | #else | 1128 | #else |
1010 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 1129 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; |
1011 | #endif | 1130 | #endif |
1012 | timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << | ||
1013 | timekeeper.shift; | ||
1014 | 1131 | ||
1015 | /* | 1132 | /* |
1016 | * With NO_HZ we may have to accumulate many cycle_intervals | 1133 | * With NO_HZ we may have to accumulate many cycle_intervals |
@@ -1020,71 +1137,45 @@ static void update_wall_time(void) | |||
1020 | * chunk in one go, and then try to consume the next smaller | 1137 | * chunk in one go, and then try to consume the next smaller |
1021 | * doubled multiple. | 1138 | * doubled multiple. |
1022 | */ | 1139 | */ |
1023 | shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); | 1140 | shift = ilog2(offset) - ilog2(tk->cycle_interval); |
1024 | shift = max(0, shift); | 1141 | shift = max(0, shift); |
1025 | /* Bound shift to one less than what overflows tick_length */ | 1142 | /* Bound shift to one less than what overflows tick_length */ |
1026 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; | 1143 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; |
1027 | shift = min(shift, maxshift); | 1144 | shift = min(shift, maxshift); |
1028 | while (offset >= timekeeper.cycle_interval) { | 1145 | while (offset >= tk->cycle_interval) { |
1029 | offset = logarithmic_accumulation(offset, shift); | 1146 | offset = logarithmic_accumulation(tk, offset, shift); |
1030 | if(offset < timekeeper.cycle_interval<<shift) | 1147 | if (offset < tk->cycle_interval<<shift) |
1031 | shift--; | 1148 | shift--; |
1032 | } | 1149 | } |
1033 | 1150 | ||
1034 | /* correct the clock when NTP error is too big */ | 1151 | /* correct the clock when NTP error is too big */ |
1035 | timekeeping_adjust(offset); | 1152 | timekeeping_adjust(tk, offset); |
1036 | |||
1037 | /* | ||
1038 | * Since in the loop above, we accumulate any amount of time | ||
1039 | * in xtime_nsec over a second into xtime.tv_sec, its possible for | ||
1040 | * xtime_nsec to be fairly small after the loop. Further, if we're | ||
1041 | * slightly speeding the clocksource up in timekeeping_adjust(), | ||
1042 | * its possible the required corrective factor to xtime_nsec could | ||
1043 | * cause it to underflow. | ||
1044 | * | ||
1045 | * Now, we cannot simply roll the accumulated second back, since | ||
1046 | * the NTP subsystem has been notified via second_overflow. So | ||
1047 | * instead we push xtime_nsec forward by the amount we underflowed, | ||
1048 | * and add that amount into the error. | ||
1049 | * | ||
1050 | * We'll correct this error next time through this function, when | ||
1051 | * xtime_nsec is not as small. | ||
1052 | */ | ||
1053 | if (unlikely((s64)timekeeper.xtime_nsec < 0)) { | ||
1054 | s64 neg = -(s64)timekeeper.xtime_nsec; | ||
1055 | timekeeper.xtime_nsec = 0; | ||
1056 | timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; | ||
1057 | } | ||
1058 | 1153 | ||
1059 | 1154 | ||
1060 | /* | 1155 | /* |
1061 | * Store full nanoseconds into xtime after rounding it up and | 1156 | * Store only full nanoseconds into xtime_nsec after rounding |
1062 | * add the remainder to the error difference. | 1157 | * it up and add the remainder to the error difference. |
1063 | */ | 1158 | * XXX - This is necessary to avoid small 1ns inconsistencies caused |
1064 | timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> | 1159 | * by truncating the remainder in vsyscalls. However, it causes |
1065 | timekeeper.shift) + 1; | 1160 | * additional work to be done in timekeeping_adjust(). Once |
1066 | timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << | 1161 | * the vsyscall implementations are converted to use xtime_nsec |
1067 | timekeeper.shift; | 1162 | * (shifted nanoseconds), this can be killed. |
1068 | timekeeper.ntp_error += timekeeper.xtime_nsec << | 1163 | */ |
1069 | timekeeper.ntp_error_shift; | 1164 | remainder = tk->xtime_nsec & ((1 << tk->shift) - 1); |
1165 | tk->xtime_nsec -= remainder; | ||
1166 | tk->xtime_nsec += 1 << tk->shift; | ||
1167 | tk->ntp_error += remainder << tk->ntp_error_shift; | ||
1070 | 1168 | ||
1071 | /* | 1169 | /* |
1072 | * Finally, make sure that after the rounding | 1170 | * Finally, make sure that after the rounding |
1073 | * xtime.tv_nsec isn't larger than NSEC_PER_SEC | 1171 | * xtime_nsec isn't larger than NSEC_PER_SEC |
1074 | */ | 1172 | */ |
1075 | if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { | 1173 | accumulate_nsecs_to_secs(tk); |
1076 | int leap; | ||
1077 | timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; | ||
1078 | timekeeper.xtime.tv_sec++; | ||
1079 | leap = second_overflow(timekeeper.xtime.tv_sec); | ||
1080 | timekeeper.xtime.tv_sec += leap; | ||
1081 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
1082 | } | ||
1083 | 1174 | ||
1084 | timekeeping_update(false); | 1175 | timekeeping_update(tk, false); |
1085 | 1176 | ||
1086 | out: | 1177 | out: |
1087 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 1178 | write_sequnlock_irqrestore(&tk->lock, flags); |
1088 | 1179 | ||
1089 | } | 1180 | } |
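
Under NO_HZ, update_wall_time() may have many tick intervals pending, so it accumulates in power-of-two chunks: the starting shift comes from ilog2(offset) - ilog2(cycle_interval) and drops as the remaining offset shrinks. A worked sketch with invented numbers (the maxshift bound against ntp_tick_length() is omitted):

#include <stdio.h>

static int ilog2_u64(unsigned long long v)
{
	int l = -1;
	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

int main(void)
{
	unsigned long long cycle_interval = 1000;   /* cycles per tick, invented */
	unsigned long long offset = 70000;          /* cycles pending since last update */
	int shift = ilog2_u64(offset) - ilog2_u64(cycle_interval);
	int loops = 0;

	if (shift < 0)
		shift = 0;
	while (offset >= cycle_interval) {
		if (offset >= cycle_interval << shift)
			offset -= cycle_interval << shift;  /* consume one big chunk */
		loops++;
		if (offset < cycle_interval << shift)
			shift--;                            /* halve the chunk size */
	}
	printf("remaining offset %llu after %d iterations\n", offset, loops);
	return 0;
}
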
1090 | 1181 | ||
@@ -1101,18 +1192,18 @@ out: | |||
1101 | */ | 1192 | */ |
1102 | void getboottime(struct timespec *ts) | 1193 | void getboottime(struct timespec *ts) |
1103 | { | 1194 | { |
1195 | struct timekeeper *tk = &timekeeper; | ||
1104 | struct timespec boottime = { | 1196 | struct timespec boottime = { |
1105 | .tv_sec = timekeeper.wall_to_monotonic.tv_sec + | 1197 | .tv_sec = tk->wall_to_monotonic.tv_sec + |
1106 | timekeeper.total_sleep_time.tv_sec, | 1198 | tk->total_sleep_time.tv_sec, |
1107 | .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + | 1199 | .tv_nsec = tk->wall_to_monotonic.tv_nsec + |
1108 | timekeeper.total_sleep_time.tv_nsec | 1200 | tk->total_sleep_time.tv_nsec |
1109 | }; | 1201 | }; |
1110 | 1202 | ||
1111 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); | 1203 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); |
1112 | } | 1204 | } |
1113 | EXPORT_SYMBOL_GPL(getboottime); | 1205 | EXPORT_SYMBOL_GPL(getboottime); |
1114 | 1206 | ||
1115 | |||
1116 | /** | 1207 | /** |
1117 | * get_monotonic_boottime - Returns monotonic time since boot | 1208 | * get_monotonic_boottime - Returns monotonic time since boot |
1118 | * @ts: pointer to the timespec to be set | 1209 | * @ts: pointer to the timespec to be set |
@@ -1124,23 +1215,23 @@ EXPORT_SYMBOL_GPL(getboottime); | |||
1124 | */ | 1215 | */ |
1125 | void get_monotonic_boottime(struct timespec *ts) | 1216 | void get_monotonic_boottime(struct timespec *ts) |
1126 | { | 1217 | { |
1218 | struct timekeeper *tk = &timekeeper; | ||
1127 | struct timespec tomono, sleep; | 1219 | struct timespec tomono, sleep; |
1128 | unsigned int seq; | 1220 | unsigned int seq; |
1129 | s64 nsecs; | ||
1130 | 1221 | ||
1131 | WARN_ON(timekeeping_suspended); | 1222 | WARN_ON(timekeeping_suspended); |
1132 | 1223 | ||
1133 | do { | 1224 | do { |
1134 | seq = read_seqbegin(&timekeeper.lock); | 1225 | seq = read_seqbegin(&tk->lock); |
1135 | *ts = timekeeper.xtime; | 1226 | ts->tv_sec = tk->xtime_sec; |
1136 | tomono = timekeeper.wall_to_monotonic; | 1227 | ts->tv_nsec = timekeeping_get_ns(tk); |
1137 | sleep = timekeeper.total_sleep_time; | 1228 | tomono = tk->wall_to_monotonic; |
1138 | nsecs = timekeeping_get_ns(); | 1229 | sleep = tk->total_sleep_time; |
1139 | 1230 | ||
1140 | } while (read_seqretry(&timekeeper.lock, seq)); | 1231 | } while (read_seqretry(&tk->lock, seq)); |
1141 | 1232 | ||
1142 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, | 1233 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, |
1143 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); | 1234 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); |
1144 | } | 1235 | } |
1145 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); | 1236 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); |
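
get_monotonic_boottime() sums three timespecs and relies on set_normalized_timespec() to fold nanosecond overflow back into whole seconds. A simplified stand-in for that normalization (the numbers are invented and set_normalized() below is a toy, not the kernel helper):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { long tv_sec; long tv_nsec; };

static void set_normalized(struct ts *out, long sec, long nsec)
{
	while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; sec++; }
	while (nsec < 0)             { nsec += NSEC_PER_SEC; sec--; }
	out->tv_sec = sec;
	out->tv_nsec = nsec;
}

int main(void)
{
	struct ts wall   = { 1345200000, 900000000 };
	struct ts tomono = { -1345199000, 300000000 };
	struct ts sleep  = { 40, 0 };
	struct ts boot;

	set_normalized(&boot, wall.tv_sec + tomono.tv_sec + sleep.tv_sec,
		       wall.tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
	printf("boottime = %ld.%09ld\n", boot.tv_sec, boot.tv_nsec);  /* 1041.200000000 */
	return 0;
}
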
1146 | 1237 | ||
@@ -1167,31 +1258,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); | |||
1167 | */ | 1258 | */ |
1168 | void monotonic_to_bootbased(struct timespec *ts) | 1259 | void monotonic_to_bootbased(struct timespec *ts) |
1169 | { | 1260 | { |
1170 | *ts = timespec_add(*ts, timekeeper.total_sleep_time); | 1261 | struct timekeeper *tk = &timekeeper; |
1262 | |||
1263 | *ts = timespec_add(*ts, tk->total_sleep_time); | ||
1171 | } | 1264 | } |
1172 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | 1265 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); |
1173 | 1266 | ||
1174 | unsigned long get_seconds(void) | 1267 | unsigned long get_seconds(void) |
1175 | { | 1268 | { |
1176 | return timekeeper.xtime.tv_sec; | 1269 | struct timekeeper *tk = &timekeeper; |
1270 | |||
1271 | return tk->xtime_sec; | ||
1177 | } | 1272 | } |
1178 | EXPORT_SYMBOL(get_seconds); | 1273 | EXPORT_SYMBOL(get_seconds); |
1179 | 1274 | ||
1180 | struct timespec __current_kernel_time(void) | 1275 | struct timespec __current_kernel_time(void) |
1181 | { | 1276 | { |
1182 | return timekeeper.xtime; | 1277 | struct timekeeper *tk = &timekeeper; |
1278 | |||
1279 | return tk_xtime(tk); | ||
1183 | } | 1280 | } |
1184 | 1281 | ||
1185 | struct timespec current_kernel_time(void) | 1282 | struct timespec current_kernel_time(void) |
1186 | { | 1283 | { |
1284 | struct timekeeper *tk = &timekeeper; | ||
1187 | struct timespec now; | 1285 | struct timespec now; |
1188 | unsigned long seq; | 1286 | unsigned long seq; |
1189 | 1287 | ||
1190 | do { | 1288 | do { |
1191 | seq = read_seqbegin(&timekeeper.lock); | 1289 | seq = read_seqbegin(&tk->lock); |
1192 | 1290 | ||
1193 | now = timekeeper.xtime; | 1291 | now = tk_xtime(tk); |
1194 | } while (read_seqretry(&timekeeper.lock, seq)); | 1292 | } while (read_seqretry(&tk->lock, seq)); |
1195 | 1293 | ||
1196 | return now; | 1294 | return now; |
1197 | } | 1295 | } |
@@ -1199,15 +1297,16 @@ EXPORT_SYMBOL(current_kernel_time); | |||
1199 | 1297 | ||
1200 | struct timespec get_monotonic_coarse(void) | 1298 | struct timespec get_monotonic_coarse(void) |
1201 | { | 1299 | { |
1300 | struct timekeeper *tk = &timekeeper; | ||
1202 | struct timespec now, mono; | 1301 | struct timespec now, mono; |
1203 | unsigned long seq; | 1302 | unsigned long seq; |
1204 | 1303 | ||
1205 | do { | 1304 | do { |
1206 | seq = read_seqbegin(&timekeeper.lock); | 1305 | seq = read_seqbegin(&tk->lock); |
1207 | 1306 | ||
1208 | now = timekeeper.xtime; | 1307 | now = tk_xtime(tk); |
1209 | mono = timekeeper.wall_to_monotonic; | 1308 | mono = tk->wall_to_monotonic; |
1210 | } while (read_seqretry(&timekeeper.lock, seq)); | 1309 | } while (read_seqretry(&tk->lock, seq)); |
1211 | 1310 | ||
1212 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, | 1311 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, |
1213 | now.tv_nsec + mono.tv_nsec); | 1312 | now.tv_nsec + mono.tv_nsec); |
@@ -1236,34 +1335,67 @@ void do_timer(unsigned long ticks) | |||
1236 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | 1335 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, |
1237 | struct timespec *wtom, struct timespec *sleep) | 1336 | struct timespec *wtom, struct timespec *sleep) |
1238 | { | 1337 | { |
1338 | struct timekeeper *tk = &timekeeper; | ||
1239 | unsigned long seq; | 1339 | unsigned long seq; |
1240 | 1340 | ||
1241 | do { | 1341 | do { |
1242 | seq = read_seqbegin(&timekeeper.lock); | 1342 | seq = read_seqbegin(&tk->lock); |
1243 | *xtim = timekeeper.xtime; | 1343 | *xtim = tk_xtime(tk); |
1244 | *wtom = timekeeper.wall_to_monotonic; | 1344 | *wtom = tk->wall_to_monotonic; |
1245 | *sleep = timekeeper.total_sleep_time; | 1345 | *sleep = tk->total_sleep_time; |
1246 | } while (read_seqretry(&timekeeper.lock, seq)); | 1346 | } while (read_seqretry(&tk->lock, seq)); |
1247 | } | 1347 | } |
1248 | 1348 | ||
1349 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1350 | /** | ||
1351 | * ktime_get_update_offsets - hrtimer helper | ||
1352 | * @offs_real: pointer to storage for monotonic -> realtime offset | ||
1353 | * @offs_boot: pointer to storage for monotonic -> boottime offset | ||
1354 | * | ||
1355 | * Returns current monotonic time and updates the offsets | ||
1356 | * Called from hrtimer_interrupt() or retrigger_next_event() | ||
1357 | */ | ||
1358 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | ||
1359 | { | ||
1360 | struct timekeeper *tk = &timekeeper; | ||
1361 | ktime_t now; | ||
1362 | unsigned int seq; | ||
1363 | u64 secs, nsecs; | ||
1364 | |||
1365 | do { | ||
1366 | seq = read_seqbegin(&tk->lock); | ||
1367 | |||
1368 | secs = tk->xtime_sec; | ||
1369 | nsecs = timekeeping_get_ns(tk); | ||
1370 | |||
1371 | *offs_real = tk->offs_real; | ||
1372 | *offs_boot = tk->offs_boot; | ||
1373 | } while (read_seqretry(&tk->lock, seq)); | ||
1374 | |||
1375 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | ||
1376 | now = ktime_sub(now, *offs_real); | ||
1377 | return now; | ||
1378 | } | ||
1379 | #endif | ||
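
ktime_get_update_offsets() hands the hrtimer code a monotonic "now" plus the offsets needed to derive the other clock bases. A plain-nanosecond sketch of that relationship, on the assumption that realtime = monotonic + offs_real and boottime = monotonic + offs_boot (all values invented; the kernel uses ktime_t helpers):

#include <stdio.h>

int main(void)
{
	long long real_now  = 1345200000123456789LL;  /* xtime expressed in ns */
	long long offs_real = 1345199900000000000LL;  /* monotonic -> realtime */
	long long offs_boot =          40000000000LL; /* monotonic -> boottime (sleep) */

	long long mono_now = real_now - offs_real;    /* what the helper returns */

	printf("monotonic %lld ns\n", mono_now);
	printf("realtime  %lld ns\n", mono_now + offs_real);
	printf("boottime  %lld ns\n", mono_now + offs_boot);
	return 0;
}
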
1380 | |||
1249 | /** | 1381 | /** |
1250 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | 1382 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format |
1251 | */ | 1383 | */ |
1252 | ktime_t ktime_get_monotonic_offset(void) | 1384 | ktime_t ktime_get_monotonic_offset(void) |
1253 | { | 1385 | { |
1386 | struct timekeeper *tk = &timekeeper; | ||
1254 | unsigned long seq; | 1387 | unsigned long seq; |
1255 | struct timespec wtom; | 1388 | struct timespec wtom; |
1256 | 1389 | ||
1257 | do { | 1390 | do { |
1258 | seq = read_seqbegin(&timekeeper.lock); | 1391 | seq = read_seqbegin(&tk->lock); |
1259 | wtom = timekeeper.wall_to_monotonic; | 1392 | wtom = tk->wall_to_monotonic; |
1260 | } while (read_seqretry(&timekeeper.lock, seq)); | 1393 | } while (read_seqretry(&tk->lock, seq)); |
1261 | 1394 | ||
1262 | return timespec_to_ktime(wtom); | 1395 | return timespec_to_ktime(wtom); |
1263 | } | 1396 | } |
1264 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | 1397 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); |
1265 | 1398 | ||
1266 | |||
1267 | /** | 1399 | /** |
1268 | * xtime_update() - advances the timekeeping infrastructure | 1400 | * xtime_update() - advances the timekeeping infrastructure |
1269 | * @ticks: number of ticks that have elapsed since the last call. | 1401 | * @ticks: number of ticks that have elapsed since the last call. |
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455549f4..af5a7e9f164b 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
167 | { | 167 | { |
168 | struct tick_sched *ts = tick_get_tick_sched(cpu); | 168 | struct tick_sched *ts = tick_get_tick_sched(cpu); |
169 | P(nohz_mode); | 169 | P(nohz_mode); |
170 | P_ns(idle_tick); | 170 | P_ns(last_tick); |
171 | P(tick_stopped); | 171 | P(tick_stopped); |
172 | P(idle_jiffies); | 172 | P(idle_jiffies); |
173 | P(idle_calls); | 173 | P(idle_calls); |
@@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) | |||
259 | u64 now = ktime_to_ns(ktime_get()); | 259 | u64 now = ktime_to_ns(ktime_get()); |
260 | int cpu; | 260 | int cpu; |
261 | 261 | ||
262 | SEQ_printf(m, "Timer List Version: v0.6\n"); | 262 | SEQ_printf(m, "Timer List Version: v0.7\n"); |
263 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | 263 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); |
264 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | 264 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); |
265 | 265 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e0db43..a61c09374eba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -77,6 +77,7 @@ struct tvec_base { | |||
77 | struct timer_list *running_timer; | 77 | struct timer_list *running_timer; |
78 | unsigned long timer_jiffies; | 78 | unsigned long timer_jiffies; |
79 | unsigned long next_timer; | 79 | unsigned long next_timer; |
80 | unsigned long active_timers; | ||
80 | struct tvec_root tv1; | 81 | struct tvec_root tv1; |
81 | struct tvec tv2; | 82 | struct tvec tv2; |
82 | struct tvec tv3; | 83 | struct tvec tv3; |
@@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) | |||
330 | } | 331 | } |
331 | EXPORT_SYMBOL_GPL(set_timer_slack); | 332 | EXPORT_SYMBOL_GPL(set_timer_slack); |
332 | 333 | ||
333 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 334 | static void |
335 | __internal_add_timer(struct tvec_base *base, struct timer_list *timer) | ||
334 | { | 336 | { |
335 | unsigned long expires = timer->expires; | 337 | unsigned long expires = timer->expires; |
336 | unsigned long idx = expires - base->timer_jiffies; | 338 | unsigned long idx = expires - base->timer_jiffies; |
@@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | |||
372 | list_add_tail(&timer->entry, vec); | 374 | list_add_tail(&timer->entry, vec); |
373 | } | 375 | } |
374 | 376 | ||
377 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | ||
378 | { | ||
379 | __internal_add_timer(base, timer); | ||
380 | /* | ||
381 | * Update base->active_timers and base->next_timer | ||
382 | */ | ||
383 | if (!tbase_get_deferrable(timer->base)) { | ||
384 | if (time_before(timer->expires, base->next_timer)) | ||
385 | base->next_timer = timer->expires; | ||
386 | base->active_timers++; | ||
387 | } | ||
388 | } | ||
389 | |||
375 | #ifdef CONFIG_TIMER_STATS | 390 | #ifdef CONFIG_TIMER_STATS |
376 | void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) | 391 | void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) |
377 | { | 392 | { |
@@ -654,8 +669,7 @@ void init_timer_deferrable_key(struct timer_list *timer, | |||
654 | } | 669 | } |
655 | EXPORT_SYMBOL(init_timer_deferrable_key); | 670 | EXPORT_SYMBOL(init_timer_deferrable_key); |
656 | 671 | ||
657 | static inline void detach_timer(struct timer_list *timer, | 672 | static inline void detach_timer(struct timer_list *timer, bool clear_pending) |
658 | int clear_pending) | ||
659 | { | 673 | { |
660 | struct list_head *entry = &timer->entry; | 674 | struct list_head *entry = &timer->entry; |
661 | 675 | ||
@@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer, | |||
667 | entry->prev = LIST_POISON2; | 681 | entry->prev = LIST_POISON2; |
668 | } | 682 | } |
669 | 683 | ||
684 | static inline void | ||
685 | detach_expired_timer(struct timer_list *timer, struct tvec_base *base) | ||
686 | { | ||
687 | detach_timer(timer, true); | ||
688 | if (!tbase_get_deferrable(timer->base)) | ||
689 | timer->base->active_timers--; | ||
690 | } | ||
691 | |||
692 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | ||
693 | bool clear_pending) | ||
694 | { | ||
695 | if (!timer_pending(timer)) | ||
696 | return 0; | ||
697 | |||
698 | detach_timer(timer, clear_pending); | ||
699 | if (!tbase_get_deferrable(timer->base)) { | ||
700 | timer->base->active_timers--; | ||
701 | if (timer->expires == base->next_timer) | ||
702 | base->next_timer = base->timer_jiffies; | ||
703 | } | ||
704 | return 1; | ||
705 | } | ||
706 | |||
670 | /* | 707 | /* |
671 | * We are using hashed locking: holding per_cpu(tvec_bases).lock | 708 | * We are using hashed locking: holding per_cpu(tvec_bases).lock |
672 | * means that all timers which are tied to this base via timer->base are | 709 | * means that all timers which are tied to this base via timer->base are |
@@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
712 | 749 | ||
713 | base = lock_timer_base(timer, &flags); | 750 | base = lock_timer_base(timer, &flags); |
714 | 751 | ||
715 | if (timer_pending(timer)) { | 752 | ret = detach_if_pending(timer, base, false); |
716 | detach_timer(timer, 0); | 753 | if (!ret && pending_only) |
717 | if (timer->expires == base->next_timer && | 754 | goto out_unlock; |
718 | !tbase_get_deferrable(timer->base)) | ||
719 | base->next_timer = base->timer_jiffies; | ||
720 | ret = 1; | ||
721 | } else { | ||
722 | if (pending_only) | ||
723 | goto out_unlock; | ||
724 | } | ||
725 | 755 | ||
726 | debug_activate(timer, expires); | 756 | debug_activate(timer, expires); |
727 | 757 | ||
@@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
752 | } | 782 | } |
753 | 783 | ||
754 | timer->expires = expires; | 784 | timer->expires = expires; |
755 | if (time_before(timer->expires, base->next_timer) && | ||
756 | !tbase_get_deferrable(timer->base)) | ||
757 | base->next_timer = timer->expires; | ||
758 | internal_add_timer(base, timer); | 785 | internal_add_timer(base, timer); |
759 | 786 | ||
760 | out_unlock: | 787 | out_unlock: |
@@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
920 | spin_lock_irqsave(&base->lock, flags); | 947 | spin_lock_irqsave(&base->lock, flags); |
921 | timer_set_base(timer, base); | 948 | timer_set_base(timer, base); |
922 | debug_activate(timer, timer->expires); | 949 | debug_activate(timer, timer->expires); |
923 | if (time_before(timer->expires, base->next_timer) && | ||
924 | !tbase_get_deferrable(timer->base)) | ||
925 | base->next_timer = timer->expires; | ||
926 | internal_add_timer(base, timer); | 950 | internal_add_timer(base, timer); |
927 | /* | 951 | /* |
928 | * Check whether the other CPU is idle and needs to be | 952 | * Check whether the other CPU is idle and needs to be |
@@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer) | |||
959 | timer_stats_timer_clear_start_info(timer); | 983 | timer_stats_timer_clear_start_info(timer); |
960 | if (timer_pending(timer)) { | 984 | if (timer_pending(timer)) { |
961 | base = lock_timer_base(timer, &flags); | 985 | base = lock_timer_base(timer, &flags); |
962 | if (timer_pending(timer)) { | 986 | ret = detach_if_pending(timer, base, true); |
963 | detach_timer(timer, 1); | ||
964 | if (timer->expires == base->next_timer && | ||
965 | !tbase_get_deferrable(timer->base)) | ||
966 | base->next_timer = base->timer_jiffies; | ||
967 | ret = 1; | ||
968 | } | ||
969 | spin_unlock_irqrestore(&base->lock, flags); | 987 | spin_unlock_irqrestore(&base->lock, flags); |
970 | } | 988 | } |
971 | 989 | ||
@@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
990 | 1008 | ||
991 | base = lock_timer_base(timer, &flags); | 1009 | base = lock_timer_base(timer, &flags); |
992 | 1010 | ||
993 | if (base->running_timer == timer) | 1011 | if (base->running_timer != timer) { |
994 | goto out; | 1012 | timer_stats_timer_clear_start_info(timer); |
995 | 1013 | ret = detach_if_pending(timer, base, true); | |
996 | timer_stats_timer_clear_start_info(timer); | ||
997 | ret = 0; | ||
998 | if (timer_pending(timer)) { | ||
999 | detach_timer(timer, 1); | ||
1000 | if (timer->expires == base->next_timer && | ||
1001 | !tbase_get_deferrable(timer->base)) | ||
1002 | base->next_timer = base->timer_jiffies; | ||
1003 | ret = 1; | ||
1004 | } | 1014 | } |
1005 | out: | ||
1006 | spin_unlock_irqrestore(&base->lock, flags); | 1015 | spin_unlock_irqrestore(&base->lock, flags); |
1007 | 1016 | ||
1008 | return ret; | 1017 | return ret; |
@@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) | |||
1089 | */ | 1098 | */ |
1090 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { | 1099 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { |
1091 | BUG_ON(tbase_get_base(timer->base) != base); | 1100 | BUG_ON(tbase_get_base(timer->base) != base); |
1092 | internal_add_timer(base, timer); | 1101 | /* No accounting, while moving them */ |
1102 | __internal_add_timer(base, timer); | ||
1093 | } | 1103 | } |
1094 | 1104 | ||
1095 | return index; | 1105 | return index; |
@@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1178 | timer_stats_account_timer(timer); | 1188 | timer_stats_account_timer(timer); |
1179 | 1189 | ||
1180 | base->running_timer = timer; | 1190 | base->running_timer = timer; |
1181 | detach_timer(timer, 1); | 1191 | detach_expired_timer(timer, base); |
1182 | 1192 | ||
1183 | spin_unlock_irq(&base->lock); | 1193 | spin_unlock_irq(&base->lock); |
1184 | call_timer_fn(timer, fn, data); | 1194 | call_timer_fn(timer, fn, data); |
@@ -1316,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
1316 | unsigned long get_next_timer_interrupt(unsigned long now) | 1326 | unsigned long get_next_timer_interrupt(unsigned long now) |
1317 | { | 1327 | { |
1318 | struct tvec_base *base = __this_cpu_read(tvec_bases); | 1328 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
1319 | unsigned long expires; | 1329 | unsigned long expires = now + NEXT_TIMER_MAX_DELTA; |
1320 | 1330 | ||
1321 | /* | 1331 | /* |
1322 | * Pretend that there is no timer pending if the cpu is offline. | 1332 | * Pretend that there is no timer pending if the cpu is offline. |
1323 | * Possible pending timers will be migrated later to an active cpu. | 1333 | * Possible pending timers will be migrated later to an active cpu. |
1324 | */ | 1334 | */ |
1325 | if (cpu_is_offline(smp_processor_id())) | 1335 | if (cpu_is_offline(smp_processor_id())) |
1326 | return now + NEXT_TIMER_MAX_DELTA; | 1336 | return expires; |
1337 | |||
1327 | spin_lock(&base->lock); | 1338 | spin_lock(&base->lock); |
1328 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1339 | if (base->active_timers) { |
1329 | base->next_timer = __next_timer_interrupt(base); | 1340 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
1330 | expires = base->next_timer; | 1341 | base->next_timer = __next_timer_interrupt(base); |
1342 | expires = base->next_timer; | ||
1343 | } | ||
1331 | spin_unlock(&base->lock); | 1344 | spin_unlock(&base->lock); |
1332 | 1345 | ||
1333 | if (time_before_eq(expires, now)) | 1346 | if (time_before_eq(expires, now)) |
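
With base->active_timers in place, a CPU whose pending timers are all deferrable can skip scanning the wheel entirely and report nothing due for NEXT_TIMER_MAX_DELTA jiffies, letting a NO_HZ idle CPU sleep longer. A toy sketch of that fast path (scan_wheel() and the numbers are stand-ins, not kernel code):

#include <stdio.h>

#define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1)   /* illustrative bound */

static unsigned long scan_wheel(unsigned long now)
{
	return now + 5;   /* pretend the earliest non-deferrable timer is 5 jiffies out */
}

static unsigned long next_timer_interrupt(unsigned long now, unsigned long active_timers)
{
	unsigned long expires = now + NEXT_TIMER_MAX_DELTA;

	if (active_timers)   /* only scan when a non-deferrable timer is pending */
		expires = scan_wheel(now);
	return expires;
}

int main(void)
{
	printf("only deferrable timers pending: +%lu jiffies\n",
	       next_timer_interrupt(1000, 0) - 1000);
	printf("one active timer pending:       +%lu jiffies\n",
	       next_timer_interrupt(1000, 1) - 1000);
	return 0;
}
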
@@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu) | |||
1704 | 1717 | ||
1705 | base->timer_jiffies = jiffies; | 1718 | base->timer_jiffies = jiffies; |
1706 | base->next_timer = base->timer_jiffies; | 1719 | base->next_timer = base->timer_jiffies; |
1720 | base->active_timers = 0; | ||
1707 | return 0; | 1721 | return 0; |
1708 | } | 1722 | } |
1709 | 1723 | ||
@@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea | |||
1714 | 1728 | ||
1715 | while (!list_empty(head)) { | 1729 | while (!list_empty(head)) { |
1716 | timer = list_first_entry(head, struct timer_list, entry); | 1730 | timer = list_first_entry(head, struct timer_list, entry); |
1717 | detach_timer(timer, 0); | 1731 | /* We ignore the accounting on the dying cpu */ |
1732 | detach_timer(timer, false); | ||
1718 | timer_set_base(timer, new_base); | 1733 | timer_set_base(timer, new_base); |
1719 | if (time_before(timer->expires, new_base->next_timer) && | ||
1720 | !tbase_get_deferrable(timer->base)) | ||
1721 | new_base->next_timer = timer->expires; | ||
1722 | internal_add_timer(new_base, timer); | 1734 | internal_add_timer(new_base, timer); |
1723 | } | 1735 | } |
1724 | } | 1736 | } |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663d86c8..b4f20fba09fc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, | |||
312 | 312 | ||
313 | static int __register_ftrace_function(struct ftrace_ops *ops) | 313 | static int __register_ftrace_function(struct ftrace_ops *ops) |
314 | { | 314 | { |
315 | if (ftrace_disabled) | 315 | if (unlikely(ftrace_disabled)) |
316 | return -ENODEV; | 316 | return -ENODEV; |
317 | 317 | ||
318 | if (FTRACE_WARN_ON(ops == &global_ops)) | 318 | if (FTRACE_WARN_ON(ops == &global_ops)) |
@@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops) | |||
4299 | 4299 | ||
4300 | mutex_lock(&ftrace_lock); | 4300 | mutex_lock(&ftrace_lock); |
4301 | 4301 | ||
4302 | if (unlikely(ftrace_disabled)) | ||
4303 | goto out_unlock; | ||
4304 | |||
4305 | ret = __register_ftrace_function(ops); | 4302 | ret = __register_ftrace_function(ops); |
4306 | if (!ret) | 4303 | if (!ret) |
4307 | ret = ftrace_startup(ops, 0); | 4304 | ret = ftrace_startup(ops, 0); |
4308 | 4305 | ||
4309 | |||
4310 | out_unlock: | ||
4311 | mutex_unlock(&ftrace_lock); | 4306 | mutex_unlock(&ftrace_lock); |
4307 | |||
4312 | return ret; | 4308 | return ret; |
4313 | } | 4309 | } |
4314 | EXPORT_SYMBOL_GPL(register_ftrace_function); | 4310 | EXPORT_SYMBOL_GPL(register_ftrace_function); |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..49491fa7daa2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) | |||
1075 | rb_init_page(bpage->page); | 1075 | rb_init_page(bpage->page); |
1076 | 1076 | ||
1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
1078 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1078 | 1079 | ||
1079 | ret = rb_allocate_pages(cpu_buffer, nr_pages); | 1080 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
1080 | if (ret < 0) | 1081 | if (ret < 0) |
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | |||
1346 | * If something was added to this page, it was full | 1347 | * If something was added to this page, it was full |
1347 | * since it is not the tail page. So we deduct the | 1348 | * since it is not the tail page. So we deduct the |
1348 | * bytes consumed in ring buffer from here. | 1349 | * bytes consumed in ring buffer from here. |
1349 | * No need to update overruns, since this page is | 1350 | * Increment overrun to account for the lost events. |
1350 | * deleted from ring buffer and its entries are | ||
1351 | * already accounted for. | ||
1352 | */ | 1351 | */ |
1352 | local_add(page_entries, &cpu_buffer->overrun); | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | 1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); |
1354 | } | 1354 | } |
1355 | 1355 | ||
@@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
3239 | if (cpu_buffer->commit_page == cpu_buffer->reader_page) | 3239 | if (cpu_buffer->commit_page == cpu_buffer->reader_page) |
3240 | goto out; | 3240 | goto out; |
3241 | 3241 | ||
3242 | /* Don't bother swapping if the ring buffer is empty */ | ||
3243 | if (rb_num_of_entries(cpu_buffer) == 0) | ||
3244 | goto out; | ||
3245 | |||
3242 | /* | 3246 | /* |
3243 | * Reset the reader page to size zero. | 3247 | * Reset the reader page to size zero. |
3244 | */ | 3248 | */ |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..5c38c81496ce 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
371 | void tracing_off(void) | 371 | void tracing_off(void) |
372 | { | 372 | { |
373 | if (global_trace.buffer) | 373 | if (global_trace.buffer) |
374 | ring_buffer_record_on(global_trace.buffer); | 374 | ring_buffer_record_off(global_trace.buffer); |
375 | /* | 375 | /* |
376 | * This flag is only looked at when buffers haven't been | 376 | * This flag is only looked at when buffers haven't been |
377 | * allocated yet. We don't really care about the race | 377 | * allocated yet. We don't really care about the race |
@@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) | |||
830 | current_trace = saved_tracer; | 830 | current_trace = saved_tracer; |
831 | if (ret) { | 831 | if (ret) { |
832 | printk(KERN_CONT "FAILED!\n"); | 832 | printk(KERN_CONT "FAILED!\n"); |
833 | /* Add the warning after printing 'FAILED' */ | ||
834 | WARN_ON(1); | ||
833 | goto out; | 835 | goto out; |
834 | } | 836 | } |
835 | /* Only reset on passing, to avoid touching corrupted buffers */ | 837 | /* Only reset on passing, to avoid touching corrupted buffers */ |
@@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); | |||
1708 | 1710 | ||
1709 | static void trace_iterator_increment(struct trace_iterator *iter) | 1711 | static void trace_iterator_increment(struct trace_iterator *iter) |
1710 | { | 1712 | { |
1713 | struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); | ||
1714 | |||
1711 | iter->idx++; | 1715 | iter->idx++; |
1712 | if (iter->buffer_iter[iter->cpu]) | 1716 | if (buf_iter) |
1713 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); | 1717 | ring_buffer_read(buf_iter, NULL); |
1714 | } | 1718 | } |
1715 | 1719 | ||
1716 | static struct trace_entry * | 1720 | static struct trace_entry * |
@@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1718 | unsigned long *lost_events) | 1722 | unsigned long *lost_events) |
1719 | { | 1723 | { |
1720 | struct ring_buffer_event *event; | 1724 | struct ring_buffer_event *event; |
1721 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; | 1725 | struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); |
1722 | 1726 | ||
1723 | if (buf_iter) | 1727 | if (buf_iter) |
1724 | event = ring_buffer_iter_peek(buf_iter, ts); | 1728 | event = ring_buffer_iter_peek(buf_iter, ts); |
@@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
1856 | 1860 | ||
1857 | tr->data[cpu]->skipped_entries = 0; | 1861 | tr->data[cpu]->skipped_entries = 0; |
1858 | 1862 | ||
1859 | if (!iter->buffer_iter[cpu]) | 1863 | buf_iter = trace_buffer_iter(iter, cpu); |
1864 | if (!buf_iter) | ||
1860 | return; | 1865 | return; |
1861 | 1866 | ||
1862 | buf_iter = iter->buffer_iter[cpu]; | ||
1863 | ring_buffer_iter_reset(buf_iter); | 1867 | ring_buffer_iter_reset(buf_iter); |
1864 | 1868 | ||
1865 | /* | 1869 | /* |
@@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) | |||
2205 | 2209 | ||
2206 | int trace_empty(struct trace_iterator *iter) | 2210 | int trace_empty(struct trace_iterator *iter) |
2207 | { | 2211 | { |
2212 | struct ring_buffer_iter *buf_iter; | ||
2208 | int cpu; | 2213 | int cpu; |
2209 | 2214 | ||
2210 | /* If we are looking at one CPU buffer, only check that one */ | 2215 | /* If we are looking at one CPU buffer, only check that one */ |
2211 | if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { | 2216 | if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { |
2212 | cpu = iter->cpu_file; | 2217 | cpu = iter->cpu_file; |
2213 | if (iter->buffer_iter[cpu]) { | 2218 | buf_iter = trace_buffer_iter(iter, cpu); |
2214 | if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) | 2219 | if (buf_iter) { |
2220 | if (!ring_buffer_iter_empty(buf_iter)) | ||
2215 | return 0; | 2221 | return 0; |
2216 | } else { | 2222 | } else { |
2217 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) | 2223 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) |
@@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) | |||
2221 | } | 2227 | } |
2222 | 2228 | ||
2223 | for_each_tracing_cpu(cpu) { | 2229 | for_each_tracing_cpu(cpu) { |
2224 | if (iter->buffer_iter[cpu]) { | 2230 | buf_iter = trace_buffer_iter(iter, cpu); |
2225 | if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) | 2231 | if (buf_iter) { |
2232 | if (!ring_buffer_iter_empty(buf_iter)) | ||
2226 | return 0; | 2233 | return 0; |
2227 | } else { | 2234 | } else { |
2228 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) | 2235 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) |
@@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2381 | if (!iter) | 2388 | if (!iter) |
2382 | return ERR_PTR(-ENOMEM); | 2389 | return ERR_PTR(-ENOMEM); |
2383 | 2390 | ||
2391 | iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), | ||
2392 | GFP_KERNEL); | ||
2393 | if (!iter->buffer_iter) | ||
2394 | goto release; | ||
2395 | |||
2384 | /* | 2396 | /* |
2385 | * We make a copy of the current tracer to avoid concurrent | 2397 | * We make a copy of the current tracer to avoid concurrent |
2386 | * changes on it while we are reading. | 2398 | * changes on it while we are reading. |
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2441 | fail: | 2453 | fail: |
2442 | mutex_unlock(&trace_types_lock); | 2454 | mutex_unlock(&trace_types_lock); |
2443 | kfree(iter->trace); | 2455 | kfree(iter->trace); |
2456 | kfree(iter->buffer_iter); | ||
2457 | release: | ||
2444 | seq_release_private(inode, file); | 2458 | seq_release_private(inode, file); |
2445 | return ERR_PTR(-ENOMEM); | 2459 | return ERR_PTR(-ENOMEM); |
2446 | } | 2460 | } |
@@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2481 | mutex_destroy(&iter->mutex); | 2495 | mutex_destroy(&iter->mutex); |
2482 | free_cpumask_var(iter->started); | 2496 | free_cpumask_var(iter->started); |
2483 | kfree(iter->trace); | 2497 | kfree(iter->trace); |
2498 | kfree(iter->buffer_iter); | ||
2484 | seq_release_private(inode, file); | 2499 | seq_release_private(inode, file); |
2485 | return 0; | 2500 | return 0; |
2486 | } | 2501 | } |
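The hunks above size iter->buffer_iter dynamically at open time and free it both on the error path and on release. A minimal userspace sketch of the same allocate-or-bail / free-on-both-paths shape (illustrative names, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct iter {
	void **buffer_iter;	/* one slot per possible CPU, slots may stay NULL */
	int nr_cpus;
};

static struct iter *iter_open(int nr_cpus)
{
	struct iter *it = calloc(1, sizeof(*it));
	if (!it)
		return NULL;

	/* size the array for every possible CPU up front */
	it->buffer_iter = calloc(nr_cpus, sizeof(*it->buffer_iter));
	if (!it->buffer_iter) {
		free(it);		/* error path mirrors the release path */
		return NULL;
	}
	it->nr_cpus = nr_cpus;
	return it;
}

static void iter_release(struct iter *it)
{
	if (!it)
		return;
	free(it->buffer_iter);		/* freed on release as well as on error */
	free(it);
}

int main(void)
{
	struct iter *it = iter_open(4);
	if (!it)
		return 1;
	printf("allocated %d iterator slots\n", it->nr_cpus);
	iter_release(it);
	return 0;
}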
@@ -3172,10 +3187,10 @@ static int tracing_set_tracer(const char *buf) | |||
3172 | } | 3187 | } |
3173 | destroy_trace_option_files(topts); | 3188 | destroy_trace_option_files(topts); |
3174 | 3189 | ||
3175 | current_trace = t; | 3190 | current_trace = &nop_trace; |
3176 | 3191 | ||
3177 | topts = create_trace_option_files(current_trace); | 3192 | topts = create_trace_option_files(t); |
3178 | if (current_trace->use_max_tr) { | 3193 | if (t->use_max_tr) { |
3179 | int cpu; | 3194 | int cpu; |
3180 | /* we need to make per cpu buffer sizes equivalent */ | 3195 | /* we need to make per cpu buffer sizes equivalent */ |
3181 | for_each_tracing_cpu(cpu) { | 3196 | for_each_tracing_cpu(cpu) { |
@@ -3195,6 +3210,7 @@ static int tracing_set_tracer(const char *buf) | |||
3195 | goto out; | 3210 | goto out; |
3196 | } | 3211 | } |
3197 | 3212 | ||
3213 | current_trace = t; | ||
3198 | trace_branch_enable(tr); | 3214 | trace_branch_enable(tr); |
3199 | out: | 3215 | out: |
3200 | mutex_unlock(&trace_types_lock); | 3216 | mutex_unlock(&trace_types_lock); |
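The reordering above parks current_trace on the nop tracer while the max-tr buffers are resized and only commits the new tracer once every resize has succeeded. A small sketch of that commit-on-success idiom under a lock, with invented names:

#include <pthread.h>
#include <stdio.h>

struct tracer { const char *name; };

static struct tracer nop_tracer  = { "nop" };
static struct tracer ring_tracer = { "ring" };

static struct tracer *current_tracer = &nop_tracer;
static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;

/* pretend to resize per-CPU buffers; may fail */
static int resize_buffers(int nr_cpus)
{
	return nr_cpus > 0 ? 0 : -1;
}

static int set_tracer(struct tracer *t, int nr_cpus)
{
	int ret;

	pthread_mutex_lock(&trace_lock);
	current_tracer = &nop_tracer;	/* park on a safe tracer first */

	ret = resize_buffers(nr_cpus);
	if (ret)
		goto out;		/* failure leaves the nop tracer active */

	current_tracer = t;		/* commit only after resizing succeeded */
out:
	pthread_mutex_unlock(&trace_lock);
	return ret;
}

int main(void)
{
	set_tracer(&ring_tracer, 4);
	printf("current tracer: %s\n", current_tracer->name);
	return 0;
}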
@@ -3609,6 +3625,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3609 | .pages = pages_def, | 3625 | .pages = pages_def, |
3610 | .partial = partial_def, | 3626 | .partial = partial_def, |
3611 | .nr_pages = 0, /* This gets updated below. */ | 3627 | .nr_pages = 0, /* This gets updated below. */ |
3628 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
3612 | .flags = flags, | 3629 | .flags = flags, |
3613 | .ops = &tracing_pipe_buf_ops, | 3630 | .ops = &tracing_pipe_buf_ops, |
3614 | .spd_release = tracing_spd_release_pipe, | 3631 | .spd_release = tracing_spd_release_pipe, |
@@ -3680,7 +3697,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3680 | 3697 | ||
3681 | ret = splice_to_pipe(pipe, &spd); | 3698 | ret = splice_to_pipe(pipe, &spd); |
3682 | out: | 3699 | out: |
3683 | splice_shrink_spd(pipe, &spd); | 3700 | splice_shrink_spd(&spd); |
3684 | return ret; | 3701 | return ret; |
3685 | 3702 | ||
3686 | out_err: | 3703 | out_err: |
@@ -4231,6 +4248,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4231 | struct splice_pipe_desc spd = { | 4248 | struct splice_pipe_desc spd = { |
4232 | .pages = pages_def, | 4249 | .pages = pages_def, |
4233 | .partial = partial_def, | 4250 | .partial = partial_def, |
4251 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
4234 | .flags = flags, | 4252 | .flags = flags, |
4235 | .ops = &buffer_pipe_buf_ops, | 4253 | .ops = &buffer_pipe_buf_ops, |
4236 | .spd_release = buffer_spd_release, | 4254 | .spd_release = buffer_spd_release, |
@@ -4318,7 +4336,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4318 | } | 4336 | } |
4319 | 4337 | ||
4320 | ret = splice_to_pipe(pipe, &spd); | 4338 | ret = splice_to_pipe(pipe, &spd); |
4321 | splice_shrink_spd(pipe, &spd); | 4339 | splice_shrink_spd(&spd); |
4322 | out: | 4340 | out: |
4323 | return ret; | 4341 | return ret; |
4324 | } | 4342 | } |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220d2de0..55e1f7f0db12 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -317,6 +317,14 @@ struct tracer { | |||
317 | 317 | ||
318 | #define TRACE_PIPE_ALL_CPU -1 | 318 | #define TRACE_PIPE_ALL_CPU -1 |
319 | 319 | ||
320 | static inline struct ring_buffer_iter * | ||
321 | trace_buffer_iter(struct trace_iterator *iter, int cpu) | ||
322 | { | ||
323 | if (iter->buffer_iter && iter->buffer_iter[cpu]) | ||
324 | return iter->buffer_iter[cpu]; | ||
325 | return NULL; | ||
326 | } | ||
327 | |||
320 | int tracer_init(struct tracer *t, struct trace_array *tr); | 328 | int tracer_init(struct tracer *t, struct trace_array *tr); |
321 | int tracing_is_enabled(void); | 329 | int tracing_is_enabled(void); |
322 | void trace_wake_up(void); | 330 | void trace_wake_up(void); |
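The new trace_buffer_iter() helper folds the "array may be missing, slot may be NULL" test that callers used to open-code into one accessor. A stand-alone sketch of the same NULL-safe accessor shape (the types here are made up):

#include <stdio.h>
#include <stdlib.h>

struct rb_iter { int cpu; };

struct trace_it {
	struct rb_iter **buffer_iter;	/* may be NULL when not allocated */
};

/* return the per-CPU iterator only if both the array and the slot exist */
static struct rb_iter *buf_iter_get(struct trace_it *it, int cpu)
{
	if (it->buffer_iter && it->buffer_iter[cpu])
		return it->buffer_iter[cpu];
	return NULL;
}

int main(void)
{
	struct trace_it it = { .buffer_iter = NULL };
	struct rb_iter slot = { .cpu = 1 };

	printf("before alloc: %p\n", (void *)buf_iter_get(&it, 1));	/* NULL */

	it.buffer_iter = calloc(2, sizeof(*it.buffer_iter));
	it.buffer_iter[1] = &slot;
	printf("after alloc:  %p\n", (void *)buf_iter_get(&it, 1));	/* &slot */

	free(it.buffer_iter);
	return 0;
}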
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f6..8a6d2ee2086c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) | |||
281 | 281 | ||
282 | head = this_cpu_ptr(event_function.perf_events); | 282 | head = this_cpu_ptr(event_function.perf_events); |
283 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, | 283 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, |
284 | 1, ®s, head); | 284 | 1, ®s, head, NULL); |
285 | 285 | ||
286 | #undef ENTRY_SIZE | 286 | #undef ENTRY_SIZE |
287 | } | 287 | } |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db09..a426f410c060 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
14 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
15 | #include <linux/ftrace.h> | 15 | #include <linux/ftrace.h> |
16 | #include <linux/pstore.h> | ||
16 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
17 | 18 | ||
18 | #include "trace.h" | 19 | #include "trace.h" |
@@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
74 | preempt_enable_notrace(); | 75 | preempt_enable_notrace(); |
75 | } | 76 | } |
76 | 77 | ||
78 | /* Our two options */ | ||
79 | enum { | ||
80 | TRACE_FUNC_OPT_STACK = 0x1, | ||
81 | TRACE_FUNC_OPT_PSTORE = 0x2, | ||
82 | }; | ||
83 | |||
84 | static struct tracer_flags func_flags; | ||
85 | |||
77 | static void | 86 | static void |
78 | function_trace_call(unsigned long ip, unsigned long parent_ip) | 87 | function_trace_call(unsigned long ip, unsigned long parent_ip) |
79 | { | 88 | { |
@@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) | |||
97 | disabled = atomic_inc_return(&data->disabled); | 106 | disabled = atomic_inc_return(&data->disabled); |
98 | 107 | ||
99 | if (likely(disabled == 1)) { | 108 | if (likely(disabled == 1)) { |
109 | /* | ||
110 | * So far tracing doesn't support multiple buffers, so | ||
111 | * we make an explicit call for now. | ||
112 | */ | ||
113 | if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) | ||
114 | pstore_ftrace_call(ip, parent_ip); | ||
100 | pc = preempt_count(); | 115 | pc = preempt_count(); |
101 | trace_function(tr, ip, parent_ip, flags, pc); | 116 | trace_function(tr, ip, parent_ip, flags, pc); |
102 | } | 117 | } |
@@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly = | |||
158 | .flags = FTRACE_OPS_FL_GLOBAL, | 173 | .flags = FTRACE_OPS_FL_GLOBAL, |
159 | }; | 174 | }; |
160 | 175 | ||
161 | /* Our two options */ | ||
162 | enum { | ||
163 | TRACE_FUNC_OPT_STACK = 0x1, | ||
164 | }; | ||
165 | |||
166 | static struct tracer_opt func_opts[] = { | 176 | static struct tracer_opt func_opts[] = { |
167 | #ifdef CONFIG_STACKTRACE | 177 | #ifdef CONFIG_STACKTRACE |
168 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, | 178 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, |
169 | #endif | 179 | #endif |
180 | #ifdef CONFIG_PSTORE_FTRACE | ||
181 | { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, | ||
182 | #endif | ||
170 | { } /* Always set a last empty entry */ | 183 | { } /* Always set a last empty entry */ |
171 | }; | 184 | }; |
172 | 185 | ||
@@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void) | |||
204 | 217 | ||
205 | static int func_set_flag(u32 old_flags, u32 bit, int set) | 218 | static int func_set_flag(u32 old_flags, u32 bit, int set) |
206 | { | 219 | { |
207 | if (bit == TRACE_FUNC_OPT_STACK) { | 220 | switch (bit) { |
221 | case TRACE_FUNC_OPT_STACK: | ||
208 | /* do nothing if already set */ | 222 | /* do nothing if already set */ |
209 | if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) | 223 | if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) |
210 | return 0; | 224 | break; |
211 | 225 | ||
212 | if (set) { | 226 | if (set) { |
213 | unregister_ftrace_function(&trace_ops); | 227 | unregister_ftrace_function(&trace_ops); |
@@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) | |||
217 | register_ftrace_function(&trace_ops); | 231 | register_ftrace_function(&trace_ops); |
218 | } | 232 | } |
219 | 233 | ||
220 | return 0; | 234 | break; |
235 | case TRACE_FUNC_OPT_PSTORE: | ||
236 | break; | ||
237 | default: | ||
238 | return -EINVAL; | ||
221 | } | 239 | } |
222 | 240 | ||
223 | return -EINVAL; | 241 | return 0; |
224 | } | 242 | } |
225 | 243 | ||
226 | static struct tracer function_trace __read_mostly = | 244 | static struct tracer function_trace __read_mostly = |
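func_set_flag() becomes a switch so that new options such as the pstore one, which needs no re-registration, share the common success return while unknown bits still yield -EINVAL. A compact sketch of that dispatch shape with illustrative flag names:

#include <stdio.h>

enum {
	OPT_STACK  = 0x1,
	OPT_PSTORE = 0x2,
};

static unsigned int flags;

static int set_flag(unsigned int bit, int set)
{
	switch (bit) {
	case OPT_STACK:
		/* do nothing if the bit already has the requested value */
		if (!!set == !!(flags & OPT_STACK))
			break;
		if (set)
			flags |= OPT_STACK;
		else
			flags &= ~OPT_STACK;
		break;
	case OPT_PSTORE:
		/* checked at trace time, nothing to switch here */
		break;
	default:
		return -22;	/* -EINVAL */
	}
	return 0;
}

int main(void)
{
	printf("%d %d %d\n", set_flag(OPT_STACK, 1),
	       set_flag(OPT_PSTORE, 1), set_flag(0x8, 1));
	return 0;
}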
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d8..ce27c8ba8d31 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
538 | next = &data->ret; | 538 | next = &data->ret; |
539 | } else { | 539 | } else { |
540 | 540 | ||
541 | ring_iter = iter->buffer_iter[iter->cpu]; | 541 | ring_iter = trace_buffer_iter(iter, iter->cpu); |
542 | 542 | ||
543 | /* First peek to compare current entry and the next one */ | 543 | /* First peek to compare current entry and the next one */ |
544 | if (ring_iter) | 544 | if (ring_iter) |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5699fe..1a2117043bb1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
1002 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1002 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1003 | 1003 | ||
1004 | head = this_cpu_ptr(call->perf_events); | 1004 | head = this_cpu_ptr(call->perf_events); |
1005 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | 1005 | perf_trace_buf_submit(entry, size, rctx, |
1006 | entry->ip, 1, regs, head, NULL); | ||
1006 | } | 1007 | } |
1007 | 1008 | ||
1008 | /* Kretprobe profile handler */ | 1009 | /* Kretprobe profile handler */ |
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1033 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1034 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1034 | 1035 | ||
1035 | head = this_cpu_ptr(call->perf_events); | 1036 | head = this_cpu_ptr(call->perf_events); |
1036 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); | 1037 | perf_trace_buf_submit(entry, size, rctx, |
1038 | entry->ret_ip, 1, regs, head, NULL); | ||
1037 | } | 1039 | } |
1038 | #endif /* CONFIG_PERF_EVENTS */ | 1040 | #endif /* CONFIG_PERF_EVENTS */ |
1039 | 1041 | ||
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0e76c5..123b189c732c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -1325,4 +1325,4 @@ __init static int init_events(void) | |||
1325 | 1325 | ||
1326 | return 0; | 1326 | return 0; |
1327 | } | 1327 | } |
1328 | device_initcall(init_events); | 1328 | early_initcall(init_events); |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc73369099..60e4d7875672 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
532 | (unsigned long *)&rec->args); | 532 | (unsigned long *)&rec->args); |
533 | 533 | ||
534 | head = this_cpu_ptr(sys_data->enter_event->perf_events); | 534 | head = this_cpu_ptr(sys_data->enter_event->perf_events); |
535 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); | 535 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
536 | } | 536 | } |
537 | 537 | ||
538 | int perf_sysenter_enable(struct ftrace_event_call *call) | 538 | int perf_sysenter_enable(struct ftrace_event_call *call) |
@@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
608 | rec->ret = syscall_get_return_value(current, regs); | 608 | rec->ret = syscall_get_return_value(current, regs); |
609 | 609 | ||
610 | head = this_cpu_ptr(sys_data->exit_event->perf_events); | 610 | head = this_cpu_ptr(sys_data->exit_event->perf_events); |
611 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); | 611 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
612 | } | 612 | } |
613 | 613 | ||
614 | int perf_sysexit_enable(struct ftrace_event_call *call) | 614 | int perf_sysexit_enable(struct ftrace_event_call *call) |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac68549e..03003cd7dd96 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | |||
670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); |
671 | 671 | ||
672 | head = this_cpu_ptr(call->perf_events); | 672 | head = this_cpu_ptr(call->perf_events); |
673 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | 673 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); |
674 | 674 | ||
675 | out: | 675 | out: |
676 | preempt_enable(); | 676 | preempt_enable(); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -372,6 +372,13 @@ static int watchdog(void *unused) | |||
372 | 372 | ||
373 | 373 | ||
374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
375 | /* | ||
376 | * People like the simple clean cpu node info on boot. | ||
377 | * Reduce the watchdog noise by only printing messages | ||
378 | * that are different from what cpu0 displayed. | ||
379 | */ | ||
380 | static unsigned long cpu0_err; | ||
381 | |||
375 | static int watchdog_nmi_enable(int cpu) | 382 | static int watchdog_nmi_enable(int cpu) |
376 | { | 383 | { |
377 | struct perf_event_attr *wd_attr; | 384 | struct perf_event_attr *wd_attr; |
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) | |||
390 | 397 | ||
391 | /* Try to register using hardware perf events */ | 398 | /* Try to register using hardware perf events */ |
392 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 399 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
400 | |||
401 | /* save cpu0 error for future comparison */ | ||
402 | if (cpu == 0 && IS_ERR(event)) | ||
403 | cpu0_err = PTR_ERR(event); | ||
404 | |||
393 | if (!IS_ERR(event)) { | 405 | if (!IS_ERR(event)) { |
394 | pr_info("enabled, takes one hw-pmu counter.\n"); | 406 | /* only print for cpu0 or different than cpu0 */ |
407 | if (cpu == 0 || cpu0_err) | ||
408 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
395 | goto out_save; | 409 | goto out_save; |
396 | } | 410 | } |
397 | 411 | ||
412 | /* skip displaying the same error again */ | ||
413 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
414 | return PTR_ERR(event); | ||
398 | 415 | ||
399 | /* vary the KERN level based on the returned errno */ | 416 | /* vary the KERN level based on the returned errno */ |
400 | if (PTR_ERR(event) == -EOPNOTSUPP) | 417 | if (PTR_ERR(event) == -EOPNOTSUPP) |
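The watchdog change records the error (if any) that CPU 0 hit while creating its perf counter and suppresses identical messages from the remaining CPUs, so the boot log carries one line instead of one per CPU. A userspace sketch of that de-duplication (the failing setup function is invented):

#include <stdio.h>

static long cpu0_err;	/* 0 = success on cpu0, else its error code */

/* pretend to create a hardware counter; fails the same way on every CPU */
static long create_counter(int cpu)
{
	(void)cpu;
	return -95;	/* -EOPNOTSUPP, say */
}

static long enable_on_cpu(int cpu)
{
	long err = create_counter(cpu);

	if (cpu == 0)
		cpu0_err = err;		/* remember cpu0's outcome */

	if (!err) {
		/* print on cpu0, or when a later CPU succeeds where cpu0 failed */
		if (cpu == 0 || cpu0_err)
			printf("cpu%d: enabled\n", cpu);
		return 0;
	}

	/* every other CPU failing the same way stays silent */
	if (cpu > 0 && err == cpu0_err)
		return err;

	printf("cpu%d: failed with %ld\n", cpu, err);
	return err;
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		enable_on_cpu(cpu);
	return 0;
}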
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a3128dc67df..692d97628a10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -45,32 +45,41 @@ | |||
45 | #include "workqueue_sched.h" | 45 | #include "workqueue_sched.h" |
46 | 46 | ||
47 | enum { | 47 | enum { |
48 | /* global_cwq flags */ | 48 | /* |
49 | GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | 49 | * global_cwq flags |
50 | GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ | 50 | * |
51 | GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | 51 | * A bound gcwq is either associated or disassociated with its CPU. |
52 | GCWQ_FREEZING = 1 << 3, /* freeze in progress */ | 52 | * While associated (!DISASSOCIATED), all workers are bound to the |
53 | GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ | 53 | * CPU and none has %WORKER_UNBOUND set and concurrency management |
54 | * is in effect. | ||
55 | * | ||
56 | * While DISASSOCIATED, the cpu may be offline and all workers have | ||
57 | * %WORKER_UNBOUND set and concurrency management disabled, and may | ||
58 | * be executing on any CPU. The gcwq behaves as an unbound one. | ||
59 | * | ||
60 | * Note that DISASSOCIATED can be flipped only while holding | ||
61 | * managership of all pools on the gcwq to avoid changing binding | ||
62 | * state while create_worker() is in progress. | ||
63 | */ | ||
64 | GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ | ||
65 | GCWQ_FREEZING = 1 << 1, /* freeze in progress */ | ||
66 | |||
67 | /* pool flags */ | ||
68 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | ||
54 | 69 | ||
55 | /* worker flags */ | 70 | /* worker flags */ |
56 | WORKER_STARTED = 1 << 0, /* started */ | 71 | WORKER_STARTED = 1 << 0, /* started */ |
57 | WORKER_DIE = 1 << 1, /* die die die */ | 72 | WORKER_DIE = 1 << 1, /* die die die */ |
58 | WORKER_IDLE = 1 << 2, /* is idle */ | 73 | WORKER_IDLE = 1 << 2, /* is idle */ |
59 | WORKER_PREP = 1 << 3, /* preparing to run works */ | 74 | WORKER_PREP = 1 << 3, /* preparing to run works */ |
60 | WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ | ||
61 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ | 75 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ |
62 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | 76 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ |
63 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | 77 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ |
64 | 78 | ||
65 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | | 79 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | |
66 | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, | 80 | WORKER_CPU_INTENSIVE, |
67 | 81 | ||
68 | /* gcwq->trustee_state */ | 82 | NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ |
69 | TRUSTEE_START = 0, /* start */ | ||
70 | TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ | ||
71 | TRUSTEE_BUTCHER = 2, /* butcher workers */ | ||
72 | TRUSTEE_RELEASE = 3, /* release workers */ | ||
73 | TRUSTEE_DONE = 4, /* trustee is done */ | ||
74 | 83 | ||
75 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ | 84 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ |
76 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, | 85 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, |
@@ -84,13 +93,13 @@ enum { | |||
84 | (min two ticks) */ | 93 | (min two ticks) */ |
85 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | 94 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ |
86 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */ | 95 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */ |
87 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | ||
88 | 96 | ||
89 | /* | 97 | /* |
90 | * Rescue workers are used only on emergencies and shared by | 98 | * Rescue workers are used only on emergencies and shared by |
91 | * all cpus. Give -20. | 99 | * all cpus. Give -20. |
92 | */ | 100 | */ |
93 | RESCUER_NICE_LEVEL = -20, | 101 | RESCUER_NICE_LEVEL = -20, |
102 | HIGHPRI_NICE_LEVEL = -20, | ||
94 | }; | 103 | }; |
95 | 104 | ||
96 | /* | 105 | /* |
@@ -115,6 +124,8 @@ enum { | |||
115 | */ | 124 | */ |
116 | 125 | ||
117 | struct global_cwq; | 126 | struct global_cwq; |
127 | struct worker_pool; | ||
128 | struct idle_rebind; | ||
118 | 129 | ||
119 | /* | 130 | /* |
120 | * The poor guys doing the actual heavy lifting. All on-duty workers | 131 | * The poor guys doing the actual heavy lifting. All on-duty workers |
@@ -131,12 +142,31 @@ struct worker { | |||
131 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ | 142 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ |
132 | struct list_head scheduled; /* L: scheduled works */ | 143 | struct list_head scheduled; /* L: scheduled works */ |
133 | struct task_struct *task; /* I: worker task */ | 144 | struct task_struct *task; /* I: worker task */ |
134 | struct global_cwq *gcwq; /* I: the associated gcwq */ | 145 | struct worker_pool *pool; /* I: the associated pool */ |
135 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | 146 | /* 64 bytes boundary on 64bit, 32 on 32bit */ |
136 | unsigned long last_active; /* L: last active timestamp */ | 147 | unsigned long last_active; /* L: last active timestamp */ |
137 | unsigned int flags; /* X: flags */ | 148 | unsigned int flags; /* X: flags */ |
138 | int id; /* I: worker id */ | 149 | int id; /* I: worker id */ |
139 | struct work_struct rebind_work; /* L: rebind worker to cpu */ | 150 | |
151 | /* for rebinding worker to CPU */ | ||
152 | struct idle_rebind *idle_rebind; /* L: for idle worker */ | ||
153 | struct work_struct rebind_work; /* L: for busy worker */ | ||
154 | }; | ||
155 | |||
156 | struct worker_pool { | ||
157 | struct global_cwq *gcwq; /* I: the owning gcwq */ | ||
158 | unsigned int flags; /* X: flags */ | ||
159 | |||
160 | struct list_head worklist; /* L: list of pending works */ | ||
161 | int nr_workers; /* L: total number of workers */ | ||
162 | int nr_idle; /* L: currently idle ones */ | ||
163 | |||
164 | struct list_head idle_list; /* X: list of idle workers */ | ||
165 | struct timer_list idle_timer; /* L: worker idle timeout */ | ||
166 | struct timer_list mayday_timer; /* L: SOS timer for workers */ | ||
167 | |||
168 | struct mutex manager_mutex; /* mutex manager should hold */ | ||
169 | struct ida worker_ida; /* L: for worker IDs */ | ||
140 | }; | 170 | }; |
141 | 171 | ||
142 | /* | 172 | /* |
@@ -146,27 +176,16 @@ struct worker { | |||
146 | */ | 176 | */ |
147 | struct global_cwq { | 177 | struct global_cwq { |
148 | spinlock_t lock; /* the gcwq lock */ | 178 | spinlock_t lock; /* the gcwq lock */ |
149 | struct list_head worklist; /* L: list of pending works */ | ||
150 | unsigned int cpu; /* I: the associated cpu */ | 179 | unsigned int cpu; /* I: the associated cpu */ |
151 | unsigned int flags; /* L: GCWQ_* flags */ | 180 | unsigned int flags; /* L: GCWQ_* flags */ |
152 | 181 | ||
153 | int nr_workers; /* L: total number of workers */ | 182 | /* workers are chained either in busy_hash or pool idle_list */ |
154 | int nr_idle; /* L: currently idle ones */ | ||
155 | |||
156 | /* workers are chained either in the idle_list or busy_hash */ | ||
157 | struct list_head idle_list; /* X: list of idle workers */ | ||
158 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | 183 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; |
159 | /* L: hash of busy workers */ | 184 | /* L: hash of busy workers */ |
160 | 185 | ||
161 | struct timer_list idle_timer; /* L: worker idle timeout */ | 186 | struct worker_pool pools[2]; /* normal and highpri pools */ |
162 | struct timer_list mayday_timer; /* L: SOS timer for dworkers */ | ||
163 | |||
164 | struct ida worker_ida; /* L: for worker IDs */ | ||
165 | 187 | ||
166 | struct task_struct *trustee; /* L: for gcwq shutdown */ | 188 | wait_queue_head_t rebind_hold; /* rebind hold wait */ |
167 | unsigned int trustee_state; /* L: trustee state */ | ||
168 | wait_queue_head_t trustee_wait; /* trustee wait */ | ||
169 | struct worker *first_idle; /* L: first idle worker */ | ||
170 | } ____cacheline_aligned_in_smp; | 189 | } ____cacheline_aligned_in_smp; |
171 | 190 | ||
172 | /* | 191 | /* |
@@ -175,7 +194,7 @@ struct global_cwq { | |||
175 | * aligned at two's power of the number of flag bits. | 194 | * aligned at two's power of the number of flag bits. |
176 | */ | 195 | */ |
177 | struct cpu_workqueue_struct { | 196 | struct cpu_workqueue_struct { |
178 | struct global_cwq *gcwq; /* I: the associated gcwq */ | 197 | struct worker_pool *pool; /* I: the associated pool */ |
179 | struct workqueue_struct *wq; /* I: the owning workqueue */ | 198 | struct workqueue_struct *wq; /* I: the owning workqueue */ |
180 | int work_color; /* L: current color */ | 199 | int work_color; /* L: current color */ |
181 | int flush_color; /* L: flushing color */ | 200 | int flush_color; /* L: flushing color */ |
@@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); | |||
264 | #define CREATE_TRACE_POINTS | 283 | #define CREATE_TRACE_POINTS |
265 | #include <trace/events/workqueue.h> | 284 | #include <trace/events/workqueue.h> |
266 | 285 | ||
286 | #define for_each_worker_pool(pool, gcwq) \ | ||
287 | for ((pool) = &(gcwq)->pools[0]; \ | ||
288 | (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) | ||
289 | |||
267 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | 290 | #define for_each_busy_worker(worker, i, pos, gcwq) \ |
268 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | 291 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ |
269 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) | 292 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) |
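for_each_worker_pool() walks the fixed pools[] array embedded in each gcwq (normal and highpri). A stand-alone sketch of the same pointer-bounded iteration macro:

#include <stdio.h>

#define NR_POOLS 2

struct pool { int nr_workers; };

struct gcwq {
	struct pool pools[NR_POOLS];	/* normal and highpri */
};

/* iterate from &pools[0] up to, but not including, &pools[NR_POOLS] */
#define for_each_pool(p, g) \
	for ((p) = &(g)->pools[0]; (p) < &(g)->pools[NR_POOLS]; (p)++)

int main(void)
{
	struct gcwq g = { .pools = { { .nr_workers = 3 }, { .nr_workers = 1 } } };
	struct pool *p;

	for_each_pool(p, &g)
		printf("pool %ld: %d workers\n", (long)(p - g.pools), p->nr_workers);
	return 0;
}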
@@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ | |||
444 | * try_to_wake_up(). Put it in a separate cacheline. | 467 | * try_to_wake_up(). Put it in a separate cacheline. |
445 | */ | 468 | */ |
446 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); | 469 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); |
447 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); | 470 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); |
448 | 471 | ||
449 | /* | 472 | /* |
450 | * Global cpu workqueue and nr_running counter for unbound gcwq. The | 473 | * Global cpu workqueue and nr_running counter for unbound gcwq. The |
@@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); | |||
452 | * workers have WORKER_UNBOUND set. | 475 | * workers have WORKER_UNBOUND set. |
453 | */ | 476 | */ |
454 | static struct global_cwq unbound_global_cwq; | 477 | static struct global_cwq unbound_global_cwq; |
455 | static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ | 478 | static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { |
479 | [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ | ||
480 | }; | ||
456 | 481 | ||
457 | static int worker_thread(void *__worker); | 482 | static int worker_thread(void *__worker); |
458 | 483 | ||
484 | static int worker_pool_pri(struct worker_pool *pool) | ||
485 | { | ||
486 | return pool - pool->gcwq->pools; | ||
487 | } | ||
488 | |||
459 | static struct global_cwq *get_gcwq(unsigned int cpu) | 489 | static struct global_cwq *get_gcwq(unsigned int cpu) |
460 | { | 490 | { |
461 | if (cpu != WORK_CPU_UNBOUND) | 491 | if (cpu != WORK_CPU_UNBOUND) |
@@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu) | |||
464 | return &unbound_global_cwq; | 494 | return &unbound_global_cwq; |
465 | } | 495 | } |
466 | 496 | ||
467 | static atomic_t *get_gcwq_nr_running(unsigned int cpu) | 497 | static atomic_t *get_pool_nr_running(struct worker_pool *pool) |
468 | { | 498 | { |
499 | int cpu = pool->gcwq->cpu; | ||
500 | int idx = worker_pool_pri(pool); | ||
501 | |||
469 | if (cpu != WORK_CPU_UNBOUND) | 502 | if (cpu != WORK_CPU_UNBOUND) |
470 | return &per_cpu(gcwq_nr_running, cpu); | 503 | return &per_cpu(pool_nr_running, cpu)[idx]; |
471 | else | 504 | else |
472 | return &unbound_gcwq_nr_running; | 505 | return &unbound_pool_nr_running[idx]; |
473 | } | 506 | } |
474 | 507 | ||
475 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, | 508 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, |
@@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) | |||
555 | 588 | ||
556 | if (data & WORK_STRUCT_CWQ) | 589 | if (data & WORK_STRUCT_CWQ) |
557 | return ((struct cpu_workqueue_struct *) | 590 | return ((struct cpu_workqueue_struct *) |
558 | (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; | 591 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; |
559 | 592 | ||
560 | cpu = data >> WORK_STRUCT_FLAG_BITS; | 593 | cpu = data >> WORK_STRUCT_FLAG_BITS; |
561 | if (cpu == WORK_CPU_NONE) | 594 | if (cpu == WORK_CPU_NONE) |
@@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) | |||
566 | } | 599 | } |
567 | 600 | ||
568 | /* | 601 | /* |
569 | * Policy functions. These define the policies on how the global | 602 | * Policy functions. These define the policies on how the global worker |
570 | * worker pool is managed. Unless noted otherwise, these functions | 603 | * pools are managed. Unless noted otherwise, these functions assume that |
571 | * assume that they're being called with gcwq->lock held. | 604 | * they're being called with gcwq->lock held. |
572 | */ | 605 | */ |
573 | 606 | ||
574 | static bool __need_more_worker(struct global_cwq *gcwq) | 607 | static bool __need_more_worker(struct worker_pool *pool) |
575 | { | 608 | { |
576 | return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || | 609 | return !atomic_read(get_pool_nr_running(pool)); |
577 | gcwq->flags & GCWQ_HIGHPRI_PENDING; | ||
578 | } | 610 | } |
579 | 611 | ||
580 | /* | 612 | /* |
581 | * Need to wake up a worker? Called from anything but currently | 613 | * Need to wake up a worker? Called from anything but currently |
582 | * running workers. | 614 | * running workers. |
615 | * | ||
616 | * Note that, because unbound workers never contribute to nr_running, this | ||
617 | * function will always return %true for unbound gcwq as long as the | ||
618 | * worklist isn't empty. | ||
583 | */ | 619 | */ |
584 | static bool need_more_worker(struct global_cwq *gcwq) | 620 | static bool need_more_worker(struct worker_pool *pool) |
585 | { | 621 | { |
586 | return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); | 622 | return !list_empty(&pool->worklist) && __need_more_worker(pool); |
587 | } | 623 | } |
588 | 624 | ||
589 | /* Can I start working? Called from busy but !running workers. */ | 625 | /* Can I start working? Called from busy but !running workers. */ |
590 | static bool may_start_working(struct global_cwq *gcwq) | 626 | static bool may_start_working(struct worker_pool *pool) |
591 | { | 627 | { |
592 | return gcwq->nr_idle; | 628 | return pool->nr_idle; |
593 | } | 629 | } |
594 | 630 | ||
595 | /* Do I need to keep working? Called from currently running workers. */ | 631 | /* Do I need to keep working? Called from currently running workers. */ |
596 | static bool keep_working(struct global_cwq *gcwq) | 632 | static bool keep_working(struct worker_pool *pool) |
597 | { | 633 | { |
598 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | 634 | atomic_t *nr_running = get_pool_nr_running(pool); |
599 | 635 | ||
600 | return !list_empty(&gcwq->worklist) && | 636 | return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; |
601 | (atomic_read(nr_running) <= 1 || | ||
602 | gcwq->flags & GCWQ_HIGHPRI_PENDING); | ||
603 | } | 637 | } |
604 | 638 | ||
605 | /* Do we need a new worker? Called from manager. */ | 639 | /* Do we need a new worker? Called from manager. */ |
606 | static bool need_to_create_worker(struct global_cwq *gcwq) | 640 | static bool need_to_create_worker(struct worker_pool *pool) |
607 | { | 641 | { |
608 | return need_more_worker(gcwq) && !may_start_working(gcwq); | 642 | return need_more_worker(pool) && !may_start_working(pool); |
609 | } | 643 | } |
610 | 644 | ||
611 | /* Do I need to be the manager? */ | 645 | /* Do I need to be the manager? */ |
612 | static bool need_to_manage_workers(struct global_cwq *gcwq) | 646 | static bool need_to_manage_workers(struct worker_pool *pool) |
613 | { | 647 | { |
614 | return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; | 648 | return need_to_create_worker(pool) || |
649 | (pool->flags & POOL_MANAGE_WORKERS); | ||
615 | } | 650 | } |
616 | 651 | ||
617 | /* Do we have too many workers and should some go away? */ | 652 | /* Do we have too many workers and should some go away? */ |
618 | static bool too_many_workers(struct global_cwq *gcwq) | 653 | static bool too_many_workers(struct worker_pool *pool) |
619 | { | 654 | { |
620 | bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; | 655 | bool managing = mutex_is_locked(&pool->manager_mutex); |
621 | int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ | 656 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
622 | int nr_busy = gcwq->nr_workers - nr_idle; | 657 | int nr_busy = pool->nr_workers - nr_idle; |
623 | 658 | ||
624 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; | 659 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; |
625 | } | 660 | } |
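too_many_workers() allows roughly one idle worker per MAX_IDLE_WORKERS_RATIO busy ones, with a floor of two idle workers. A tiny sketch of the predicate; the ratio value below is illustrative, not taken from this diff:

#include <stdio.h>
#include <stdbool.h>

#define MAX_IDLE_WORKERS_RATIO 4	/* illustrative value */

static bool too_many_workers(int nr_idle, int nr_workers)
{
	int nr_busy = nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

int main(void)
{
	/* 3 idle of 7 total: (3 - 2) * 4 = 4 >= 4 busy, so trim */
	printf("%d\n", too_many_workers(3, 7));
	/* 3 idle of 8 total: 4 >= 5 is false, so keep */
	printf("%d\n", too_many_workers(3, 8));
	return 0;
}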
@@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq) | |||
629 | */ | 664 | */ |
630 | 665 | ||
631 | /* Return the first worker. Safe with preemption disabled */ | 666 | /* Return the first worker. Safe with preemption disabled */ |
632 | static struct worker *first_worker(struct global_cwq *gcwq) | 667 | static struct worker *first_worker(struct worker_pool *pool) |
633 | { | 668 | { |
634 | if (unlikely(list_empty(&gcwq->idle_list))) | 669 | if (unlikely(list_empty(&pool->idle_list))) |
635 | return NULL; | 670 | return NULL; |
636 | 671 | ||
637 | return list_first_entry(&gcwq->idle_list, struct worker, entry); | 672 | return list_first_entry(&pool->idle_list, struct worker, entry); |
638 | } | 673 | } |
639 | 674 | ||
640 | /** | 675 | /** |
641 | * wake_up_worker - wake up an idle worker | 676 | * wake_up_worker - wake up an idle worker |
642 | * @gcwq: gcwq to wake worker for | 677 | * @pool: worker pool to wake worker from |
643 | * | 678 | * |
644 | * Wake up the first idle worker of @gcwq. | 679 | * Wake up the first idle worker of @pool. |
645 | * | 680 | * |
646 | * CONTEXT: | 681 | * CONTEXT: |
647 | * spin_lock_irq(gcwq->lock). | 682 | * spin_lock_irq(gcwq->lock). |
648 | */ | 683 | */ |
649 | static void wake_up_worker(struct global_cwq *gcwq) | 684 | static void wake_up_worker(struct worker_pool *pool) |
650 | { | 685 | { |
651 | struct worker *worker = first_worker(gcwq); | 686 | struct worker *worker = first_worker(pool); |
652 | 687 | ||
653 | if (likely(worker)) | 688 | if (likely(worker)) |
654 | wake_up_process(worker->task); | 689 | wake_up_process(worker->task); |
@@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
670 | struct worker *worker = kthread_data(task); | 705 | struct worker *worker = kthread_data(task); |
671 | 706 | ||
672 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 707 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
673 | atomic_inc(get_gcwq_nr_running(cpu)); | 708 | atomic_inc(get_pool_nr_running(worker->pool)); |
674 | } | 709 | } |
675 | 710 | ||
676 | /** | 711 | /** |
@@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
692 | unsigned int cpu) | 727 | unsigned int cpu) |
693 | { | 728 | { |
694 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | 729 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; |
695 | struct global_cwq *gcwq = get_gcwq(cpu); | 730 | struct worker_pool *pool = worker->pool; |
696 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | 731 | atomic_t *nr_running = get_pool_nr_running(pool); |
697 | 732 | ||
698 | if (worker->flags & WORKER_NOT_RUNNING) | 733 | if (worker->flags & WORKER_NOT_RUNNING) |
699 | return NULL; | 734 | return NULL; |
@@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
706 | * worklist not empty test sequence is in insert_work(). | 741 | * worklist not empty test sequence is in insert_work(). |
707 | * Please read comment there. | 742 | * Please read comment there. |
708 | * | 743 | * |
709 | * NOT_RUNNING is clear. This means that trustee is not in | 744 | * NOT_RUNNING is clear. This means that we're bound to and |
710 | * charge and we're running on the local cpu w/ rq lock held | 745 | * running on the local cpu w/ rq lock held and preemption |
711 | * and preemption disabled, which in turn means that none else | 746 | * disabled, which in turn means that none else could be |
712 | * could be manipulating idle_list, so dereferencing idle_list | 747 | * manipulating idle_list, so dereferencing idle_list without gcwq |
713 | * without gcwq lock is safe. | 748 | * lock is safe. |
714 | */ | 749 | */ |
715 | if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) | 750 | if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) |
716 | to_wakeup = first_worker(gcwq); | 751 | to_wakeup = first_worker(pool); |
717 | return to_wakeup ? to_wakeup->task : NULL; | 752 | return to_wakeup ? to_wakeup->task : NULL; |
718 | } | 753 | } |
719 | 754 | ||
@@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
733 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, | 768 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, |
734 | bool wakeup) | 769 | bool wakeup) |
735 | { | 770 | { |
736 | struct global_cwq *gcwq = worker->gcwq; | 771 | struct worker_pool *pool = worker->pool; |
737 | 772 | ||
738 | WARN_ON_ONCE(worker->task != current); | 773 | WARN_ON_ONCE(worker->task != current); |
739 | 774 | ||
@@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, | |||
744 | */ | 779 | */ |
745 | if ((flags & WORKER_NOT_RUNNING) && | 780 | if ((flags & WORKER_NOT_RUNNING) && |
746 | !(worker->flags & WORKER_NOT_RUNNING)) { | 781 | !(worker->flags & WORKER_NOT_RUNNING)) { |
747 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | 782 | atomic_t *nr_running = get_pool_nr_running(pool); |
748 | 783 | ||
749 | if (wakeup) { | 784 | if (wakeup) { |
750 | if (atomic_dec_and_test(nr_running) && | 785 | if (atomic_dec_and_test(nr_running) && |
751 | !list_empty(&gcwq->worklist)) | 786 | !list_empty(&pool->worklist)) |
752 | wake_up_worker(gcwq); | 787 | wake_up_worker(pool); |
753 | } else | 788 | } else |
754 | atomic_dec(nr_running); | 789 | atomic_dec(nr_running); |
755 | } | 790 | } |
@@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, | |||
769 | */ | 804 | */ |
770 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | 805 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) |
771 | { | 806 | { |
772 | struct global_cwq *gcwq = worker->gcwq; | 807 | struct worker_pool *pool = worker->pool; |
773 | unsigned int oflags = worker->flags; | 808 | unsigned int oflags = worker->flags; |
774 | 809 | ||
775 | WARN_ON_ONCE(worker->task != current); | 810 | WARN_ON_ONCE(worker->task != current); |
@@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
783 | */ | 818 | */ |
784 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | 819 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) |
785 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 820 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
786 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | 821 | atomic_inc(get_pool_nr_running(pool)); |
787 | } | 822 | } |
788 | 823 | ||
789 | /** | 824 | /** |
@@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | |||
867 | } | 902 | } |
868 | 903 | ||
869 | /** | 904 | /** |
870 | * gcwq_determine_ins_pos - find insertion position | ||
871 | * @gcwq: gcwq of interest | ||
872 | * @cwq: cwq a work is being queued for | ||
873 | * | ||
874 | * A work for @cwq is about to be queued on @gcwq, determine insertion | ||
875 | * position for the work. If @cwq is for HIGHPRI wq, the work is | ||
876 | * queued at the head of the queue but in FIFO order with respect to | ||
877 | * other HIGHPRI works; otherwise, at the end of the queue. This | ||
878 | * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that | ||
879 | * there are HIGHPRI works pending. | ||
880 | * | ||
881 | * CONTEXT: | ||
882 | * spin_lock_irq(gcwq->lock). | ||
883 | * | ||
884 | * RETURNS: | ||
885 | * Pointer to inserstion position. | ||
886 | */ | ||
887 | static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, | ||
888 | struct cpu_workqueue_struct *cwq) | ||
889 | { | ||
890 | struct work_struct *twork; | ||
891 | |||
892 | if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) | ||
893 | return &gcwq->worklist; | ||
894 | |||
895 | list_for_each_entry(twork, &gcwq->worklist, entry) { | ||
896 | struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); | ||
897 | |||
898 | if (!(tcwq->wq->flags & WQ_HIGHPRI)) | ||
899 | break; | ||
900 | } | ||
901 | |||
902 | gcwq->flags |= GCWQ_HIGHPRI_PENDING; | ||
903 | return &twork->entry; | ||
904 | } | ||
905 | |||
906 | /** | ||
907 | * insert_work - insert a work into gcwq | 905 | * insert_work - insert a work into gcwq |
908 | * @cwq: cwq @work belongs to | 906 | * @cwq: cwq @work belongs to |
909 | * @work: work to insert | 907 | * @work: work to insert |
@@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, | |||
920 | struct work_struct *work, struct list_head *head, | 918 | struct work_struct *work, struct list_head *head, |
921 | unsigned int extra_flags) | 919 | unsigned int extra_flags) |
922 | { | 920 | { |
923 | struct global_cwq *gcwq = cwq->gcwq; | 921 | struct worker_pool *pool = cwq->pool; |
924 | 922 | ||
925 | /* we own @work, set data and link */ | 923 | /* we own @work, set data and link */ |
926 | set_work_cwq(work, cwq, extra_flags); | 924 | set_work_cwq(work, cwq, extra_flags); |
@@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq, | |||
940 | */ | 938 | */ |
941 | smp_mb(); | 939 | smp_mb(); |
942 | 940 | ||
943 | if (__need_more_worker(gcwq)) | 941 | if (__need_more_worker(pool)) |
944 | wake_up_worker(gcwq); | 942 | wake_up_worker(pool); |
945 | } | 943 | } |
946 | 944 | ||
947 | /* | 945 | /* |
@@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1043 | if (likely(cwq->nr_active < cwq->max_active)) { | 1041 | if (likely(cwq->nr_active < cwq->max_active)) { |
1044 | trace_workqueue_activate_work(work); | 1042 | trace_workqueue_activate_work(work); |
1045 | cwq->nr_active++; | 1043 | cwq->nr_active++; |
1046 | worklist = gcwq_determine_ins_pos(gcwq, cwq); | 1044 | worklist = &cwq->pool->worklist; |
1047 | } else { | 1045 | } else { |
1048 | work_flags |= WORK_STRUCT_DELAYED; | 1046 | work_flags |= WORK_STRUCT_DELAYED; |
1049 | worklist = &cwq->delayed_works; | 1047 | worklist = &cwq->delayed_works; |
@@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); | |||
1192 | */ | 1190 | */ |
1193 | static void worker_enter_idle(struct worker *worker) | 1191 | static void worker_enter_idle(struct worker *worker) |
1194 | { | 1192 | { |
1195 | struct global_cwq *gcwq = worker->gcwq; | 1193 | struct worker_pool *pool = worker->pool; |
1194 | struct global_cwq *gcwq = pool->gcwq; | ||
1196 | 1195 | ||
1197 | BUG_ON(worker->flags & WORKER_IDLE); | 1196 | BUG_ON(worker->flags & WORKER_IDLE); |
1198 | BUG_ON(!list_empty(&worker->entry) && | 1197 | BUG_ON(!list_empty(&worker->entry) && |
@@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker) | |||
1200 | 1199 | ||
1201 | /* can't use worker_set_flags(), also called from start_worker() */ | 1200 | /* can't use worker_set_flags(), also called from start_worker() */ |
1202 | worker->flags |= WORKER_IDLE; | 1201 | worker->flags |= WORKER_IDLE; |
1203 | gcwq->nr_idle++; | 1202 | pool->nr_idle++; |
1204 | worker->last_active = jiffies; | 1203 | worker->last_active = jiffies; |
1205 | 1204 | ||
1206 | /* idle_list is LIFO */ | 1205 | /* idle_list is LIFO */ |
1207 | list_add(&worker->entry, &gcwq->idle_list); | 1206 | list_add(&worker->entry, &pool->idle_list); |
1208 | 1207 | ||
1209 | if (likely(!(worker->flags & WORKER_ROGUE))) { | 1208 | if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) |
1210 | if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) | 1209 | mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); |
1211 | mod_timer(&gcwq->idle_timer, | ||
1212 | jiffies + IDLE_WORKER_TIMEOUT); | ||
1213 | } else | ||
1214 | wake_up_all(&gcwq->trustee_wait); | ||
1215 | 1210 | ||
1216 | /* | 1211 | /* |
1217 | * Sanity check nr_running. Because trustee releases gcwq->lock | 1212 | * Sanity check nr_running. Because gcwq_unbind_fn() releases |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | 1213 | * gcwq->lock between setting %WORKER_UNBOUND and zapping |
1219 | * warning may trigger spuriously. Check iff trustee is idle. | 1214 | * nr_running, the warning may trigger spuriously. Check iff |
1215 | * unbind is not in progress. | ||
1220 | */ | 1216 | */ |
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | 1217 | WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && |
1222 | gcwq->nr_workers == gcwq->nr_idle && | 1218 | pool->nr_workers == pool->nr_idle && |
1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1219 | atomic_read(get_pool_nr_running(pool))); |
1224 | } | 1220 | } |
1225 | 1221 | ||
1226 | /** | 1222 | /** |
@@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker) | |||
1234 | */ | 1230 | */ |
1235 | static void worker_leave_idle(struct worker *worker) | 1231 | static void worker_leave_idle(struct worker *worker) |
1236 | { | 1232 | { |
1237 | struct global_cwq *gcwq = worker->gcwq; | 1233 | struct worker_pool *pool = worker->pool; |
1238 | 1234 | ||
1239 | BUG_ON(!(worker->flags & WORKER_IDLE)); | 1235 | BUG_ON(!(worker->flags & WORKER_IDLE)); |
1240 | worker_clr_flags(worker, WORKER_IDLE); | 1236 | worker_clr_flags(worker, WORKER_IDLE); |
1241 | gcwq->nr_idle--; | 1237 | pool->nr_idle--; |
1242 | list_del_init(&worker->entry); | 1238 | list_del_init(&worker->entry); |
1243 | } | 1239 | } |
1244 | 1240 | ||
@@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker) | |||
1258 | * verbatim as it's best effort and blocking and gcwq may be | 1254 | * verbatim as it's best effort and blocking and gcwq may be |
1259 | * [dis]associated in the meantime. | 1255 | * [dis]associated in the meantime. |
1260 | * | 1256 | * |
1261 | * This function tries set_cpus_allowed() and locks gcwq and verifies | 1257 | * This function tries set_cpus_allowed() and locks gcwq and verifies the |
1262 | * the binding against GCWQ_DISASSOCIATED which is set during | 1258 | * binding against %GCWQ_DISASSOCIATED which is set during |
1263 | * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters | 1259 | * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker |
1264 | * idle state or fetches works without dropping lock, it can guarantee | 1260 | * enters idle state or fetches works without dropping lock, it can |
1265 | * the scheduling requirement described in the first paragraph. | 1261 | * guarantee the scheduling requirement described in the first paragraph. |
1266 | * | 1262 | * |
1267 | * CONTEXT: | 1263 | * CONTEXT: |
1268 | * Might sleep. Called without any lock but returns with gcwq->lock | 1264 | * Might sleep. Called without any lock but returns with gcwq->lock |
@@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker) | |||
1275 | static bool worker_maybe_bind_and_lock(struct worker *worker) | 1271 | static bool worker_maybe_bind_and_lock(struct worker *worker) |
1276 | __acquires(&gcwq->lock) | 1272 | __acquires(&gcwq->lock) |
1277 | { | 1273 | { |
1278 | struct global_cwq *gcwq = worker->gcwq; | 1274 | struct global_cwq *gcwq = worker->pool->gcwq; |
1279 | struct task_struct *task = worker->task; | 1275 | struct task_struct *task = worker->task; |
1280 | 1276 | ||
1281 | while (true) { | 1277 | while (true) { |
@@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock) | |||
1308 | } | 1304 | } |
1309 | } | 1305 | } |
1310 | 1306 | ||
1307 | struct idle_rebind { | ||
1308 | int cnt; /* # workers to be rebound */ | ||
1309 | struct completion done; /* all workers rebound */ | ||
1310 | }; | ||
1311 | |||
1312 | /* | ||
1313 | * Rebind an idle @worker to its CPU. During CPU onlining, this has to | ||
1314 | * happen synchronously for idle workers. worker_thread() will test | ||
1315 | * %WORKER_REBIND before leaving idle and call this function. | ||
1316 | */ | ||
1317 | static void idle_worker_rebind(struct worker *worker) | ||
1318 | { | ||
1319 | struct global_cwq *gcwq = worker->pool->gcwq; | ||
1320 | |||
1321 | /* CPU must be online at this point */ | ||
1322 | WARN_ON(!worker_maybe_bind_and_lock(worker)); | ||
1323 | if (!--worker->idle_rebind->cnt) | ||
1324 | complete(&worker->idle_rebind->done); | ||
1325 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
1326 | |||
1327 | /* we did our part, wait for rebind_workers() to finish up */ | ||
1328 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); | ||
1329 | } | ||
1330 | |||
1311 | /* | 1331 | /* |
1312 | * Function for worker->rebind_work used to rebind rogue busy workers | 1332 | * Function for @worker->rebind.work used to rebind unbound busy workers to |
1313 | * to the associated cpu which is coming back online. This is | 1333 | * the associated cpu which is coming back online. This is scheduled by |
1314 | * scheduled by cpu up but can race with other cpu hotplug operations | 1334 | * cpu up but can race with other cpu hotplug operations and may be |
1315 | * and may be executed twice without intervening cpu down. | 1335 | * executed twice without intervening cpu down. |
1316 | */ | 1336 | */ |
1317 | static void worker_rebind_fn(struct work_struct *work) | 1337 | static void busy_worker_rebind_fn(struct work_struct *work) |
1318 | { | 1338 | { |
1319 | struct worker *worker = container_of(work, struct worker, rebind_work); | 1339 | struct worker *worker = container_of(work, struct worker, rebind_work); |
1320 | struct global_cwq *gcwq = worker->gcwq; | 1340 | struct global_cwq *gcwq = worker->pool->gcwq; |
1321 | 1341 | ||
1322 | if (worker_maybe_bind_and_lock(worker)) | 1342 | if (worker_maybe_bind_and_lock(worker)) |
1323 | worker_clr_flags(worker, WORKER_REBIND); | 1343 | worker_clr_flags(worker, WORKER_REBIND); |
@@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work) | |||
1325 | spin_unlock_irq(&gcwq->lock); | 1345 | spin_unlock_irq(&gcwq->lock); |
1326 | } | 1346 | } |
1327 | 1347 | ||
1348 | /** | ||
1349 | * rebind_workers - rebind all workers of a gcwq to the associated CPU | ||
1350 | * @gcwq: gcwq of interest | ||
1351 | * | ||
1352 | * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding | ||
1353 | * is different for idle and busy ones. | ||
1354 | * | ||
1355 | * The idle ones should be rebound synchronously and idle rebinding should | ||
1356 | * be complete before any worker starts executing work items with | ||
1357 | * concurrency management enabled; otherwise, scheduler may oops trying to | ||
1358 | * wake up non-local idle worker from wq_worker_sleeping(). | ||
1359 | * | ||
1360 | * This is achieved by repeatedly requesting rebinding until all idle | ||
1361 | * workers are known to have been rebound under @gcwq->lock and holding all | ||
1362 | * idle workers from becoming busy until idle rebinding is complete. | ||
1363 | * | ||
1364 | * Once idle workers are rebound, busy workers can be rebound as they | ||
1365 | * finish executing their current work items. Queueing the rebind work at | ||
1366 | * the head of their scheduled lists is enough. Note that nr_running will | ||
1367 | * be properly bumped as busy workers rebind. | ||
1368 | * | ||
1369 | * On return, all workers are guaranteed to either be bound or have rebind | ||
1370 | * work item scheduled. | ||
1371 | */ | ||
1372 | static void rebind_workers(struct global_cwq *gcwq) | ||
1373 | __releases(&gcwq->lock) __acquires(&gcwq->lock) | ||
1374 | { | ||
1375 | struct idle_rebind idle_rebind; | ||
1376 | struct worker_pool *pool; | ||
1377 | struct worker *worker; | ||
1378 | struct hlist_node *pos; | ||
1379 | int i; | ||
1380 | |||
1381 | lockdep_assert_held(&gcwq->lock); | ||
1382 | |||
1383 | for_each_worker_pool(pool, gcwq) | ||
1384 | lockdep_assert_held(&pool->manager_mutex); | ||
1385 | |||
1386 | /* | ||
1387 | * Rebind idle workers. Interlocked both ways. We wait for | ||
1388 | * workers to rebind via @idle_rebind.done. Workers will wait for | ||
1389 | * us to finish up by watching %WORKER_REBIND. | ||
1390 | */ | ||
1391 | init_completion(&idle_rebind.done); | ||
1392 | retry: | ||
1393 | idle_rebind.cnt = 1; | ||
1394 | INIT_COMPLETION(idle_rebind.done); | ||
1395 | |||
1396 | /* set REBIND and kick idle ones, we'll wait for these later */ | ||
1397 | for_each_worker_pool(pool, gcwq) { | ||
1398 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
1399 | if (worker->flags & WORKER_REBIND) | ||
1400 | continue; | ||
1401 | |||
1402 | /* morph UNBOUND to REBIND */ | ||
1403 | worker->flags &= ~WORKER_UNBOUND; | ||
1404 | worker->flags |= WORKER_REBIND; | ||
1405 | |||
1406 | idle_rebind.cnt++; | ||
1407 | worker->idle_rebind = &idle_rebind; | ||
1408 | |||
1409 | /* worker_thread() will call idle_worker_rebind() */ | ||
1410 | wake_up_process(worker->task); | ||
1411 | } | ||
1412 | } | ||
1413 | |||
1414 | if (--idle_rebind.cnt) { | ||
1415 | spin_unlock_irq(&gcwq->lock); | ||
1416 | wait_for_completion(&idle_rebind.done); | ||
1417 | spin_lock_irq(&gcwq->lock); | ||
1418 | /* busy ones might have become idle while waiting, retry */ | ||
1419 | goto retry; | ||
1420 | } | ||
1421 | |||
1422 | /* | ||
1423 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
1424 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
1425 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
1426 | * because these workers are still guaranteed to be idle. | ||
1427 | */ | ||
1428 | for_each_worker_pool(pool, gcwq) | ||
1429 | list_for_each_entry(worker, &pool->idle_list, entry) | ||
1430 | worker->flags &= ~WORKER_REBIND; | ||
1431 | |||
1432 | wake_up_all(&gcwq->rebind_hold); | ||
1433 | |||
1434 | /* rebind busy workers */ | ||
1435 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
1436 | struct work_struct *rebind_work = &worker->rebind_work; | ||
1437 | |||
1438 | /* morph UNBOUND to REBIND */ | ||
1439 | worker->flags &= ~WORKER_UNBOUND; | ||
1440 | worker->flags |= WORKER_REBIND; | ||
1441 | |||
1442 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
1443 | work_data_bits(rebind_work))) | ||
1444 | continue; | ||
1445 | |||
1446 | /* wq doesn't matter, use the default one */ | ||
1447 | debug_work_activate(rebind_work); | ||
1448 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
1449 | worker->scheduled.next, | ||
1450 | work_color_to_flags(WORK_NO_COLOR)); | ||
1451 | } | ||
1452 | } | ||
1453 | |||
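rebind_workers() counts the idle workers it kicks, waits on a completion until each has reported back, and retries if more workers went idle in the meantime. A userspace sketch of the count-and-wait half of that handshake, using a condition variable in place of the kernel completion (all names invented; the retry loop is omitted):

#include <pthread.h>
#include <stdio.h>

struct idle_rebind {
	int cnt;			/* 1 for the manager + one per kicked worker */
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static void *worker(void *arg)
{
	struct idle_rebind *r = arg;

	/* ... rebind self here ... */
	pthread_mutex_lock(&r->lock);
	if (!--r->cnt)			/* last worker to finish signals the manager */
		pthread_cond_signal(&r->done);
	pthread_mutex_unlock(&r->lock);
	return NULL;
}

int main(void)
{
	struct idle_rebind r = { .cnt = 1 };
	pthread_t tid[3];

	pthread_mutex_init(&r.lock, NULL);
	pthread_cond_init(&r.done, NULL);

	for (int i = 0; i < 3; i++) {
		pthread_mutex_lock(&r.lock);
		r.cnt++;		/* one count per kicked worker */
		pthread_mutex_unlock(&r.lock);
		pthread_create(&tid[i], NULL, worker, &r);
	}

	pthread_mutex_lock(&r.lock);
	if (--r.cnt)			/* drop the manager's own reference */
		while (r.cnt)
			pthread_cond_wait(&r.done, &r.lock);
	pthread_mutex_unlock(&r.lock);
	printf("all idle workers rebound\n");

	for (int i = 0; i < 3; i++)
		pthread_join(tid[i], NULL);
	return 0;
}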
1328 | static struct worker *alloc_worker(void) | 1454 | static struct worker *alloc_worker(void) |
1329 | { | 1455 | { |
1330 | struct worker *worker; | 1456 | struct worker *worker; |
@@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void) | |||
1333 | if (worker) { | 1459 | if (worker) { |
1334 | INIT_LIST_HEAD(&worker->entry); | 1460 | INIT_LIST_HEAD(&worker->entry); |
1335 | INIT_LIST_HEAD(&worker->scheduled); | 1461 | INIT_LIST_HEAD(&worker->scheduled); |
1336 | INIT_WORK(&worker->rebind_work, worker_rebind_fn); | 1462 | INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); |
1337 | /* on creation a worker is in !idle && prep state */ | 1463 | /* on creation a worker is in !idle && prep state */ |
1338 | worker->flags = WORKER_PREP; | 1464 | worker->flags = WORKER_PREP; |
1339 | } | 1465 | } |
@@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void) | |||
1342 | 1468 | ||
1343 | /** | 1469 | /** |
1344 | * create_worker - create a new workqueue worker | 1470 | * create_worker - create a new workqueue worker |
1345 | * @gcwq: gcwq the new worker will belong to | 1471 | * @pool: pool the new worker will belong to |
1346 | * @bind: whether to set affinity to @cpu or not | ||
1347 | * | 1472 | * |
1348 | * Create a new worker which is bound to @gcwq. The returned worker | 1473 | * Create a new worker which is bound to @pool. The returned worker |
1349 | * can be started by calling start_worker() or destroyed using | 1474 | * can be started by calling start_worker() or destroyed using |
1350 | * destroy_worker(). | 1475 | * destroy_worker(). |
1351 | * | 1476 | * |
@@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void) | |||
1355 | * RETURNS: | 1480 | * RETURNS: |
1356 | * Pointer to the newly created worker. | 1481 | * Pointer to the newly created worker. |
1357 | */ | 1482 | */ |
1358 | static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | 1483 | static struct worker *create_worker(struct worker_pool *pool) |
1359 | { | 1484 | { |
1360 | bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; | 1485 | struct global_cwq *gcwq = pool->gcwq; |
1486 | const char *pri = worker_pool_pri(pool) ? "H" : ""; | ||
1361 | struct worker *worker = NULL; | 1487 | struct worker *worker = NULL; |
1362 | int id = -1; | 1488 | int id = -1; |
1363 | 1489 | ||
1364 | spin_lock_irq(&gcwq->lock); | 1490 | spin_lock_irq(&gcwq->lock); |
1365 | while (ida_get_new(&gcwq->worker_ida, &id)) { | 1491 | while (ida_get_new(&pool->worker_ida, &id)) { |
1366 | spin_unlock_irq(&gcwq->lock); | 1492 | spin_unlock_irq(&gcwq->lock); |
1367 | if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) | 1493 | if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) |
1368 | goto fail; | 1494 | goto fail; |
1369 | spin_lock_irq(&gcwq->lock); | 1495 | spin_lock_irq(&gcwq->lock); |
1370 | } | 1496 | } |
@@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | |||
1374 | if (!worker) | 1500 | if (!worker) |
1375 | goto fail; | 1501 | goto fail; |
1376 | 1502 | ||
1377 | worker->gcwq = gcwq; | 1503 | worker->pool = pool; |
1378 | worker->id = id; | 1504 | worker->id = id; |
1379 | 1505 | ||
1380 | if (!on_unbound_cpu) | 1506 | if (gcwq->cpu != WORK_CPU_UNBOUND) |
1381 | worker->task = kthread_create_on_node(worker_thread, | 1507 | worker->task = kthread_create_on_node(worker_thread, |
1382 | worker, | 1508 | worker, cpu_to_node(gcwq->cpu), |
1383 | cpu_to_node(gcwq->cpu), | 1509 | "kworker/%u:%d%s", gcwq->cpu, id, pri); |
1384 | "kworker/%u:%d", gcwq->cpu, id); | ||
1385 | else | 1510 | else |
1386 | worker->task = kthread_create(worker_thread, worker, | 1511 | worker->task = kthread_create(worker_thread, worker, |
1387 | "kworker/u:%d", id); | 1512 | "kworker/u:%d%s", id, pri); |
1388 | if (IS_ERR(worker->task)) | 1513 | if (IS_ERR(worker->task)) |
1389 | goto fail; | 1514 | goto fail; |
1390 | 1515 | ||
1516 | if (worker_pool_pri(pool)) | ||
1517 | set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); | ||
1518 | |||
1391 | /* | 1519 | /* |
1392 | * A rogue worker will become a regular one if CPU comes | 1520 | * Determine CPU binding of the new worker depending on |
1393 | * online later on. Make sure every worker has | 1521 | * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the |
1394 | * PF_THREAD_BOUND set. | 1522 | * flag remains stable across this function. See the comments |
1523 | * above the flag definition for details. | ||
1524 | * | ||
1525 | * As an unbound worker may later become a regular one if CPU comes | ||
1526 | * online, make sure every worker has %PF_THREAD_BOUND set. | ||
1395 | */ | 1527 | */ |
1396 | if (bind && !on_unbound_cpu) | 1528 | if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { |
1397 | kthread_bind(worker->task, gcwq->cpu); | 1529 | kthread_bind(worker->task, gcwq->cpu); |
1398 | else { | 1530 | } else { |
1399 | worker->task->flags |= PF_THREAD_BOUND; | 1531 | worker->task->flags |= PF_THREAD_BOUND; |
1400 | if (on_unbound_cpu) | 1532 | worker->flags |= WORKER_UNBOUND; |
1401 | worker->flags |= WORKER_UNBOUND; | ||
1402 | } | 1533 | } |
1403 | 1534 | ||
1404 | return worker; | 1535 | return worker; |
1405 | fail: | 1536 | fail: |
1406 | if (id >= 0) { | 1537 | if (id >= 0) { |
1407 | spin_lock_irq(&gcwq->lock); | 1538 | spin_lock_irq(&gcwq->lock); |
1408 | ida_remove(&gcwq->worker_ida, id); | 1539 | ida_remove(&pool->worker_ida, id); |
1409 | spin_unlock_irq(&gcwq->lock); | 1540 | spin_unlock_irq(&gcwq->lock); |
1410 | } | 1541 | } |
1411 | kfree(worker); | 1542 | kfree(worker); |
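The new comment in create_worker() ties the binding decision to %GCWQ_DISASSOCIATED: an associated gcwq gets its worker bound to the pool's CPU via kthread_bind(), while a disassociated one leaves the worker floating with WORKER_UNBOUND set. A rough userspace analogue of that branch, with pthread_setaffinity_np() standing in for kthread_bind(); the helper names are invented for the sketch and error handling is trimmed.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static void *worker_fn(void *arg)
{
	printf("worker running on cpu %d\n", sched_getcpu());
	return NULL;
}

/* create a worker thread; pin it to @cpu unless the pool is "disassociated" */
static int create_bound_worker(pthread_t *tid, int cpu, bool disassociated)
{
	int err = pthread_create(tid, NULL, worker_fn, NULL);

	if (err)
		return err;

	if (!disassociated) {
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(cpu, &set);
		/* unlike kthread_bind(), this pins the thread after it already started */
		err = pthread_setaffinity_np(*tid, sizeof(set), &set);
	}
	/* disassociated: leave the thread unbound, like WORKER_UNBOUND */
	return err;
}

int main(void)
{
	pthread_t tid;

	create_bound_worker(&tid, 0, false);
	pthread_join(tid, NULL);
	return 0;
}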
@@ -1424,7 +1555,7 @@ fail: | |||
1424 | static void start_worker(struct worker *worker) | 1555 | static void start_worker(struct worker *worker) |
1425 | { | 1556 | { |
1426 | worker->flags |= WORKER_STARTED; | 1557 | worker->flags |= WORKER_STARTED; |
1427 | worker->gcwq->nr_workers++; | 1558 | worker->pool->nr_workers++; |
1428 | worker_enter_idle(worker); | 1559 | worker_enter_idle(worker); |
1429 | wake_up_process(worker->task); | 1560 | wake_up_process(worker->task); |
1430 | } | 1561 | } |
@@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker) | |||
1440 | */ | 1571 | */ |
1441 | static void destroy_worker(struct worker *worker) | 1572 | static void destroy_worker(struct worker *worker) |
1442 | { | 1573 | { |
1443 | struct global_cwq *gcwq = worker->gcwq; | 1574 | struct worker_pool *pool = worker->pool; |
1575 | struct global_cwq *gcwq = pool->gcwq; | ||
1444 | int id = worker->id; | 1576 | int id = worker->id; |
1445 | 1577 | ||
1446 | /* sanity check frenzy */ | 1578 | /* sanity check frenzy */ |
@@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker) | |||
1448 | BUG_ON(!list_empty(&worker->scheduled)); | 1580 | BUG_ON(!list_empty(&worker->scheduled)); |
1449 | 1581 | ||
1450 | if (worker->flags & WORKER_STARTED) | 1582 | if (worker->flags & WORKER_STARTED) |
1451 | gcwq->nr_workers--; | 1583 | pool->nr_workers--; |
1452 | if (worker->flags & WORKER_IDLE) | 1584 | if (worker->flags & WORKER_IDLE) |
1453 | gcwq->nr_idle--; | 1585 | pool->nr_idle--; |
1454 | 1586 | ||
1455 | list_del_init(&worker->entry); | 1587 | list_del_init(&worker->entry); |
1456 | worker->flags |= WORKER_DIE; | 1588 | worker->flags |= WORKER_DIE; |
@@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker) | |||
1461 | kfree(worker); | 1593 | kfree(worker); |
1462 | 1594 | ||
1463 | spin_lock_irq(&gcwq->lock); | 1595 | spin_lock_irq(&gcwq->lock); |
1464 | ida_remove(&gcwq->worker_ida, id); | 1596 | ida_remove(&pool->worker_ida, id); |
1465 | } | 1597 | } |
1466 | 1598 | ||
1467 | static void idle_worker_timeout(unsigned long __gcwq) | 1599 | static void idle_worker_timeout(unsigned long __pool) |
1468 | { | 1600 | { |
1469 | struct global_cwq *gcwq = (void *)__gcwq; | 1601 | struct worker_pool *pool = (void *)__pool; |
1602 | struct global_cwq *gcwq = pool->gcwq; | ||
1470 | 1603 | ||
1471 | spin_lock_irq(&gcwq->lock); | 1604 | spin_lock_irq(&gcwq->lock); |
1472 | 1605 | ||
1473 | if (too_many_workers(gcwq)) { | 1606 | if (too_many_workers(pool)) { |
1474 | struct worker *worker; | 1607 | struct worker *worker; |
1475 | unsigned long expires; | 1608 | unsigned long expires; |
1476 | 1609 | ||
1477 | /* idle_list is kept in LIFO order, check the last one */ | 1610 | /* idle_list is kept in LIFO order, check the last one */ |
1478 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); | 1611 | worker = list_entry(pool->idle_list.prev, struct worker, entry); |
1479 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | 1612 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
1480 | 1613 | ||
1481 | if (time_before(jiffies, expires)) | 1614 | if (time_before(jiffies, expires)) |
1482 | mod_timer(&gcwq->idle_timer, expires); | 1615 | mod_timer(&pool->idle_timer, expires); |
1483 | else { | 1616 | else { |
1484 | /* it's been idle for too long, wake up manager */ | 1617 | /* it's been idle for too long, wake up manager */ |
1485 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | 1618 | pool->flags |= POOL_MANAGE_WORKERS; |
1486 | wake_up_worker(gcwq); | 1619 | wake_up_worker(pool); |
1487 | } | 1620 | } |
1488 | } | 1621 | } |
1489 | 1622 | ||
@@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work) | |||
1500 | return false; | 1633 | return false; |
1501 | 1634 | ||
1502 | /* mayday mayday mayday */ | 1635 | /* mayday mayday mayday */ |
1503 | cpu = cwq->gcwq->cpu; | 1636 | cpu = cwq->pool->gcwq->cpu; |
1504 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ | 1637 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ |
1505 | if (cpu == WORK_CPU_UNBOUND) | 1638 | if (cpu == WORK_CPU_UNBOUND) |
1506 | cpu = 0; | 1639 | cpu = 0; |
@@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work) | |||
1509 | return true; | 1642 | return true; |
1510 | } | 1643 | } |
1511 | 1644 | ||
1512 | static void gcwq_mayday_timeout(unsigned long __gcwq) | 1645 | static void gcwq_mayday_timeout(unsigned long __pool) |
1513 | { | 1646 | { |
1514 | struct global_cwq *gcwq = (void *)__gcwq; | 1647 | struct worker_pool *pool = (void *)__pool; |
1648 | struct global_cwq *gcwq = pool->gcwq; | ||
1515 | struct work_struct *work; | 1649 | struct work_struct *work; |
1516 | 1650 | ||
1517 | spin_lock_irq(&gcwq->lock); | 1651 | spin_lock_irq(&gcwq->lock); |
1518 | 1652 | ||
1519 | if (need_to_create_worker(gcwq)) { | 1653 | if (need_to_create_worker(pool)) { |
1520 | /* | 1654 | /* |
1521 | * We've been trying to create a new worker but | 1655 | * We've been trying to create a new worker but |
1522 | * haven't been successful. We might be hitting an | 1656 | * haven't been successful. We might be hitting an |
1523 | * allocation deadlock. Send distress signals to | 1657 | * allocation deadlock. Send distress signals to |
1524 | * rescuers. | 1658 | * rescuers. |
1525 | */ | 1659 | */ |
1526 | list_for_each_entry(work, &gcwq->worklist, entry) | 1660 | list_for_each_entry(work, &pool->worklist, entry) |
1527 | send_mayday(work); | 1661 | send_mayday(work); |
1528 | } | 1662 | } |
1529 | 1663 | ||
1530 | spin_unlock_irq(&gcwq->lock); | 1664 | spin_unlock_irq(&gcwq->lock); |
1531 | 1665 | ||
1532 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); | 1666 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); |
1533 | } | 1667 | } |
1534 | 1668 | ||
1535 | /** | 1669 | /** |
1536 | * maybe_create_worker - create a new worker if necessary | 1670 | * maybe_create_worker - create a new worker if necessary |
1537 | * @gcwq: gcwq to create a new worker for | 1671 | * @pool: pool to create a new worker for |
1538 | * | 1672 | * |
1539 | * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to | 1673 | * Create a new worker for @pool if necessary. @pool is guaranteed to |
1540 | * have at least one idle worker on return from this function. If | 1674 | * have at least one idle worker on return from this function. If |
1541 | * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is | 1675 | * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is |
1542 | * sent to all rescuers with works scheduled on @gcwq to resolve | 1676 | * sent to all rescuers with works scheduled on @pool to resolve |
1543 | * possible allocation deadlock. | 1677 | * possible allocation deadlock. |
1544 | * | 1678 | * |
1545 | * On return, need_to_create_worker() is guaranteed to be false and | 1679 | * On return, need_to_create_worker() is guaranteed to be false and |
@@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) | |||
1554 | * false if no action was taken and gcwq->lock stayed locked, true | 1688 | * false if no action was taken and gcwq->lock stayed locked, true |
1555 | * otherwise. | 1689 | * otherwise. |
1556 | */ | 1690 | */ |
1557 | static bool maybe_create_worker(struct global_cwq *gcwq) | 1691 | static bool maybe_create_worker(struct worker_pool *pool) |
1558 | __releases(&gcwq->lock) | 1692 | __releases(&gcwq->lock) |
1559 | __acquires(&gcwq->lock) | 1693 | __acquires(&gcwq->lock) |
1560 | { | 1694 | { |
1561 | if (!need_to_create_worker(gcwq)) | 1695 | struct global_cwq *gcwq = pool->gcwq; |
1696 | |||
1697 | if (!need_to_create_worker(pool)) | ||
1562 | return false; | 1698 | return false; |
1563 | restart: | 1699 | restart: |
1564 | spin_unlock_irq(&gcwq->lock); | 1700 | spin_unlock_irq(&gcwq->lock); |
1565 | 1701 | ||
1566 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ | 1702 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ |
1567 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); | 1703 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); |
1568 | 1704 | ||
1569 | while (true) { | 1705 | while (true) { |
1570 | struct worker *worker; | 1706 | struct worker *worker; |
1571 | 1707 | ||
1572 | worker = create_worker(gcwq, true); | 1708 | worker = create_worker(pool); |
1573 | if (worker) { | 1709 | if (worker) { |
1574 | del_timer_sync(&gcwq->mayday_timer); | 1710 | del_timer_sync(&pool->mayday_timer); |
1575 | spin_lock_irq(&gcwq->lock); | 1711 | spin_lock_irq(&gcwq->lock); |
1576 | start_worker(worker); | 1712 | start_worker(worker); |
1577 | BUG_ON(need_to_create_worker(gcwq)); | 1713 | BUG_ON(need_to_create_worker(pool)); |
1578 | return true; | 1714 | return true; |
1579 | } | 1715 | } |
1580 | 1716 | ||
1581 | if (!need_to_create_worker(gcwq)) | 1717 | if (!need_to_create_worker(pool)) |
1582 | break; | 1718 | break; |
1583 | 1719 | ||
1584 | __set_current_state(TASK_INTERRUPTIBLE); | 1720 | __set_current_state(TASK_INTERRUPTIBLE); |
1585 | schedule_timeout(CREATE_COOLDOWN); | 1721 | schedule_timeout(CREATE_COOLDOWN); |
1586 | 1722 | ||
1587 | if (!need_to_create_worker(gcwq)) | 1723 | if (!need_to_create_worker(pool)) |
1588 | break; | 1724 | break; |
1589 | } | 1725 | } |
1590 | 1726 | ||
1591 | del_timer_sync(&gcwq->mayday_timer); | 1727 | del_timer_sync(&pool->mayday_timer); |
1592 | spin_lock_irq(&gcwq->lock); | 1728 | spin_lock_irq(&gcwq->lock); |
1593 | if (need_to_create_worker(gcwq)) | 1729 | if (need_to_create_worker(pool)) |
1594 | goto restart; | 1730 | goto restart; |
1595 | return true; | 1731 | return true; |
1596 | } | 1732 | } |
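maybe_create_worker() above is a retry loop with an escalation path: arm the mayday timer before attempting creation, back off for CREATE_COOLDOWN between failed attempts, and disarm the timer once a worker exists. The sketch below shows that pattern in userspace under stated assumptions: the kernel uses a re-firing timer, while here a one-shot deadline check keeps things short, and all constants and helpers are invented for the illustration.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAYDAY_INITIAL_TIMEOUT_MS 10
#define CREATE_COOLDOWN_MS        5

static void send_mayday(void)
{
	fprintf(stderr, "mayday: worker creation is stuck, waking rescuers\n");
}

/* stand-in for create_worker(); fails a couple of times to exercise the loop */
static void *try_create_worker(void)
{
	static int attempts;

	return ++attempts < 3 ? NULL : malloc(64);
}

static long elapsed_ms(const struct timespec *since)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (now.tv_sec - since->tv_sec) * 1000 +
	       (now.tv_nsec - since->tv_nsec) / 1000000;
}

int main(void)
{
	struct timespec start;
	struct timespec cooldown = { 0, CREATE_COOLDOWN_MS * 1000000L };
	bool mayday_sent = false;
	void *worker;

	clock_gettime(CLOCK_MONOTONIC, &start);

	while (!(worker = try_create_worker())) {
		/* stuck past the initial timeout: escalate once */
		if (!mayday_sent && elapsed_ms(&start) > MAYDAY_INITIAL_TIMEOUT_MS) {
			send_mayday();
			mayday_sent = true;
		}
		nanosleep(&cooldown, NULL);	/* ~schedule_timeout(CREATE_COOLDOWN) */
	}

	printf("worker created (escalated: %s)\n", mayday_sent ? "yes" : "no");
	free(worker);
	return 0;
}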
1597 | 1733 | ||
1598 | /** | 1734 | /** |
1599 | * maybe_destroy_worker - destroy workers which have been idle for a while | 1735 | * maybe_destroy_worker - destroy workers which have been idle for a while |
1600 | * @gcwq: gcwq to destroy workers for | 1736 | * @pool: pool to destroy workers for |
1601 | * | 1737 | * |
1602 | * Destroy @gcwq workers which have been idle for longer than | 1738 | * Destroy @pool workers which have been idle for longer than |
1603 | * IDLE_WORKER_TIMEOUT. | 1739 | * IDLE_WORKER_TIMEOUT. |
1604 | * | 1740 | * |
1605 | * LOCKING: | 1741 | * LOCKING: |
@@ -1610,19 +1746,19 @@ restart: | |||
1610 | * false if no action was taken and gcwq->lock stayed locked, true | 1746 | * false if no action was taken and gcwq->lock stayed locked, true |
1611 | * otherwise. | 1747 | * otherwise. |
1612 | */ | 1748 | */ |
1613 | static bool maybe_destroy_workers(struct global_cwq *gcwq) | 1749 | static bool maybe_destroy_workers(struct worker_pool *pool) |
1614 | { | 1750 | { |
1615 | bool ret = false; | 1751 | bool ret = false; |
1616 | 1752 | ||
1617 | while (too_many_workers(gcwq)) { | 1753 | while (too_many_workers(pool)) { |
1618 | struct worker *worker; | 1754 | struct worker *worker; |
1619 | unsigned long expires; | 1755 | unsigned long expires; |
1620 | 1756 | ||
1621 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); | 1757 | worker = list_entry(pool->idle_list.prev, struct worker, entry); |
1622 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | 1758 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
1623 | 1759 | ||
1624 | if (time_before(jiffies, expires)) { | 1760 | if (time_before(jiffies, expires)) { |
1625 | mod_timer(&gcwq->idle_timer, expires); | 1761 | mod_timer(&pool->idle_timer, expires); |
1626 | break; | 1762 | break; |
1627 | } | 1763 | } |
1628 | 1764 | ||
@@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) | |||
1655 | */ | 1791 | */ |
1656 | static bool manage_workers(struct worker *worker) | 1792 | static bool manage_workers(struct worker *worker) |
1657 | { | 1793 | { |
1658 | struct global_cwq *gcwq = worker->gcwq; | 1794 | struct worker_pool *pool = worker->pool; |
1659 | bool ret = false; | 1795 | bool ret = false; |
1660 | 1796 | ||
1661 | if (gcwq->flags & GCWQ_MANAGING_WORKERS) | 1797 | if (!mutex_trylock(&pool->manager_mutex)) |
1662 | return ret; | 1798 | return ret; |
1663 | 1799 | ||
1664 | gcwq->flags &= ~GCWQ_MANAGE_WORKERS; | 1800 | pool->flags &= ~POOL_MANAGE_WORKERS; |
1665 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
1666 | 1801 | ||
1667 | /* | 1802 | /* |
1668 | * Destroy and then create so that may_start_working() is true | 1803 | * Destroy and then create so that may_start_working() is true |
1669 | * on return. | 1804 | * on return. |
1670 | */ | 1805 | */ |
1671 | ret |= maybe_destroy_workers(gcwq); | 1806 | ret |= maybe_destroy_workers(pool); |
1672 | ret |= maybe_create_worker(gcwq); | 1807 | ret |= maybe_create_worker(pool); |
1673 | |||
1674 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
1675 | |||
1676 | /* | ||
1677 | * The trustee might be waiting to take over the manager | ||
1678 | * position, tell it we're done. | ||
1679 | */ | ||
1680 | if (unlikely(gcwq->trustee)) | ||
1681 | wake_up_all(&gcwq->trustee_wait); | ||
1682 | 1808 | ||
1809 | mutex_unlock(&pool->manager_mutex); | ||
1683 | return ret; | 1810 | return ret; |
1684 | } | 1811 | } |
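The rewritten manage_workers() replaces the GCWQ_MANAGING_WORKERS flag (and the trustee wakeup) with a plain trylock on the per-pool manager_mutex: whoever wins the trylock manages, everyone else returns false and carries on. A minimal userspace sketch of that arbitration, with invented names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t manager_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool manage_pool(void)
{
	/* somebody else is already managing: nothing for us to do */
	if (pthread_mutex_trylock(&manager_mutex) != 0)
		return false;

	/* ... culling idle workers / creating new ones would happen here ... */

	pthread_mutex_unlock(&manager_mutex);
	return true;
}

int main(void)
{
	printf("managed: %s\n", manage_pool() ? "yes" : "no");
	return 0;
}

Holding a real mutex instead of a flag is what lets the hotplug path later in this patch exclude managers simply by taking the same per-pool mutexes.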
1685 | 1812 | ||
@@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | |||
1728 | { | 1855 | { |
1729 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | 1856 | struct work_struct *work = list_first_entry(&cwq->delayed_works, |
1730 | struct work_struct, entry); | 1857 | struct work_struct, entry); |
1731 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); | ||
1732 | 1858 | ||
1733 | trace_workqueue_activate_work(work); | 1859 | trace_workqueue_activate_work(work); |
1734 | move_linked_works(work, pos, NULL); | 1860 | move_linked_works(work, &cwq->pool->worklist, NULL); |
1735 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | 1861 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); |
1736 | cwq->nr_active++; | 1862 | cwq->nr_active++; |
1737 | } | 1863 | } |
@@ -1804,7 +1930,8 @@ __releases(&gcwq->lock) | |||
1804 | __acquires(&gcwq->lock) | 1930 | __acquires(&gcwq->lock) |
1805 | { | 1931 | { |
1806 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | 1932 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); |
1807 | struct global_cwq *gcwq = cwq->gcwq; | 1933 | struct worker_pool *pool = worker->pool; |
1934 | struct global_cwq *gcwq = pool->gcwq; | ||
1808 | struct hlist_head *bwh = busy_worker_head(gcwq, work); | 1935 | struct hlist_head *bwh = busy_worker_head(gcwq, work); |
1809 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; | 1936 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; |
1810 | work_func_t f = work->func; | 1937 | work_func_t f = work->func; |
@@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock) | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | 1950 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); |
1824 | #endif | 1951 | #endif |
1825 | /* | 1952 | /* |
1953 | * Ensure we're on the correct CPU. DISASSOCIATED test is | ||
1954 | * necessary to avoid spurious warnings from rescuers servicing the | ||
1955 | * unbound or a disassociated gcwq. | ||
1956 | */ | ||
1957 | WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && | ||
1958 | !(gcwq->flags & GCWQ_DISASSOCIATED) && | ||
1959 | raw_smp_processor_id() != gcwq->cpu); | ||
1960 | |||
1961 | /* | ||
1826 | * A single work shouldn't be executed concurrently by | 1962 | * A single work shouldn't be executed concurrently by |
1827 | * multiple workers on a single cpu. Check whether anyone is | 1963 | * multiple workers on a single cpu. Check whether anyone is |
1828 | * already processing the work. If so, defer the work to the | 1964 | * already processing the work. If so, defer the work to the |
@@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock) | |||
1846 | list_del_init(&work->entry); | 1982 | list_del_init(&work->entry); |
1847 | 1983 | ||
1848 | /* | 1984 | /* |
1849 | * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, | ||
1850 | * wake up another worker; otherwise, clear HIGHPRI_PENDING. | ||
1851 | */ | ||
1852 | if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { | ||
1853 | struct work_struct *nwork = list_first_entry(&gcwq->worklist, | ||
1854 | struct work_struct, entry); | ||
1855 | |||
1856 | if (!list_empty(&gcwq->worklist) && | ||
1857 | get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) | ||
1858 | wake_up_worker(gcwq); | ||
1859 | else | ||
1860 | gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; | ||
1861 | } | ||
1862 | |||
1863 | /* | ||
1864 | * CPU intensive works don't participate in concurrency | 1985 | * CPU intensive works don't participate in concurrency |
1865 | * management. They're the scheduler's responsibility. | 1986 | * management. They're the scheduler's responsibility. |
1866 | */ | 1987 | */ |
1867 | if (unlikely(cpu_intensive)) | 1988 | if (unlikely(cpu_intensive)) |
1868 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); | 1989 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); |
1869 | 1990 | ||
1991 | /* | ||
1992 | * Unbound gcwq isn't concurrency managed and work items should be | ||
1993 | * executed ASAP. Wake up another worker if necessary. | ||
1994 | */ | ||
1995 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) | ||
1996 | wake_up_worker(pool); | ||
1997 | |||
1870 | spin_unlock_irq(&gcwq->lock); | 1998 | spin_unlock_irq(&gcwq->lock); |
1871 | 1999 | ||
1872 | work_clear_pending(work); | 2000 | work_clear_pending(work); |
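The new UNBOUND branch above implements what the comment calls chained execution: before settling in to process its own item, an unbound worker wakes another worker whenever more items are still pending, since no nr_running-based concurrency management will do it for an unbound gcwq. Below is a userspace sketch of that wake-before-work shape; the names and queue layout are invented for the illustration, not the kernel's data structures.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NR_ITEMS   8
#define NR_WORKERS 3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t more_work = PTHREAD_COND_INITIALIZER;
static int pending = NR_ITEMS;		/* items still queued */
static int done;			/* items fully processed */

static void *unbound_worker(void *arg)
{
	struct timespec delay = { 0, 1000000 };	/* pretend each item takes 1ms */

	for (;;) {
		pthread_mutex_lock(&lock);
		while (!pending && done < NR_ITEMS)
			pthread_cond_wait(&more_work, &lock);
		if (done >= NR_ITEMS) {
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		pending--;
		/* the UNBOUND branch: more work queued, kick another worker now */
		if (pending)
			pthread_cond_signal(&more_work);
		pthread_mutex_unlock(&lock);

		nanosleep(&delay, NULL);	/* process the item */

		pthread_mutex_lock(&lock);
		if (++done >= NR_ITEMS)
			pthread_cond_broadcast(&more_work);	/* let everyone exit */
		pthread_mutex_unlock(&lock);
	}
}

int main(void)
{
	pthread_t tid[NR_WORKERS];
	int i;

	for (i = 0; i < NR_WORKERS; i++)
		pthread_create(&tid[i], NULL, unbound_worker, NULL);
	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(tid[i], NULL);
	printf("processed %d items\n", done);
	return 0;
}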
@@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker) | |||
1939 | static int worker_thread(void *__worker) | 2067 | static int worker_thread(void *__worker) |
1940 | { | 2068 | { |
1941 | struct worker *worker = __worker; | 2069 | struct worker *worker = __worker; |
1942 | struct global_cwq *gcwq = worker->gcwq; | 2070 | struct worker_pool *pool = worker->pool; |
2071 | struct global_cwq *gcwq = pool->gcwq; | ||
1943 | 2072 | ||
1944 | /* tell the scheduler that this is a workqueue worker */ | 2073 | /* tell the scheduler that this is a workqueue worker */ |
1945 | worker->task->flags |= PF_WQ_WORKER; | 2074 | worker->task->flags |= PF_WQ_WORKER; |
1946 | woke_up: | 2075 | woke_up: |
1947 | spin_lock_irq(&gcwq->lock); | 2076 | spin_lock_irq(&gcwq->lock); |
1948 | 2077 | ||
1949 | /* DIE can be set only while we're idle, checking here is enough */ | 2078 | /* |
1950 | if (worker->flags & WORKER_DIE) { | 2079 | * DIE can be set only while idle and REBIND set while busy has |
2080 | * @worker->rebind_work scheduled. Checking here is enough. | ||
2081 | */ | ||
2082 | if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { | ||
1951 | spin_unlock_irq(&gcwq->lock); | 2083 | spin_unlock_irq(&gcwq->lock); |
1952 | worker->task->flags &= ~PF_WQ_WORKER; | 2084 | |
1953 | return 0; | 2085 | if (worker->flags & WORKER_DIE) { |
2086 | worker->task->flags &= ~PF_WQ_WORKER; | ||
2087 | return 0; | ||
2088 | } | ||
2089 | |||
2090 | idle_worker_rebind(worker); | ||
2091 | goto woke_up; | ||
1954 | } | 2092 | } |
1955 | 2093 | ||
1956 | worker_leave_idle(worker); | 2094 | worker_leave_idle(worker); |
1957 | recheck: | 2095 | recheck: |
1958 | /* no more worker necessary? */ | 2096 | /* no more worker necessary? */ |
1959 | if (!need_more_worker(gcwq)) | 2097 | if (!need_more_worker(pool)) |
1960 | goto sleep; | 2098 | goto sleep; |
1961 | 2099 | ||
1962 | /* do we need to manage? */ | 2100 | /* do we need to manage? */ |
1963 | if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) | 2101 | if (unlikely(!may_start_working(pool)) && manage_workers(worker)) |
1964 | goto recheck; | 2102 | goto recheck; |
1965 | 2103 | ||
1966 | /* | 2104 | /* |
@@ -1979,7 +2117,7 @@ recheck: | |||
1979 | 2117 | ||
1980 | do { | 2118 | do { |
1981 | struct work_struct *work = | 2119 | struct work_struct *work = |
1982 | list_first_entry(&gcwq->worklist, | 2120 | list_first_entry(&pool->worklist, |
1983 | struct work_struct, entry); | 2121 | struct work_struct, entry); |
1984 | 2122 | ||
1985 | if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { | 2123 | if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { |
@@ -1991,11 +2129,11 @@ recheck: | |||
1991 | move_linked_works(work, &worker->scheduled, NULL); | 2129 | move_linked_works(work, &worker->scheduled, NULL); |
1992 | process_scheduled_works(worker); | 2130 | process_scheduled_works(worker); |
1993 | } | 2131 | } |
1994 | } while (keep_working(gcwq)); | 2132 | } while (keep_working(pool)); |
1995 | 2133 | ||
1996 | worker_set_flags(worker, WORKER_PREP, false); | 2134 | worker_set_flags(worker, WORKER_PREP, false); |
1997 | sleep: | 2135 | sleep: |
1998 | if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) | 2136 | if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) |
1999 | goto recheck; | 2137 | goto recheck; |
2000 | 2138 | ||
2001 | /* | 2139 | /* |
@@ -2053,14 +2191,15 @@ repeat: | |||
2053 | for_each_mayday_cpu(cpu, wq->mayday_mask) { | 2191 | for_each_mayday_cpu(cpu, wq->mayday_mask) { |
2054 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; | 2192 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; |
2055 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); | 2193 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); |
2056 | struct global_cwq *gcwq = cwq->gcwq; | 2194 | struct worker_pool *pool = cwq->pool; |
2195 | struct global_cwq *gcwq = pool->gcwq; | ||
2057 | struct work_struct *work, *n; | 2196 | struct work_struct *work, *n; |
2058 | 2197 | ||
2059 | __set_current_state(TASK_RUNNING); | 2198 | __set_current_state(TASK_RUNNING); |
2060 | mayday_clear_cpu(cpu, wq->mayday_mask); | 2199 | mayday_clear_cpu(cpu, wq->mayday_mask); |
2061 | 2200 | ||
2062 | /* migrate to the target cpu if possible */ | 2201 | /* migrate to the target cpu if possible */ |
2063 | rescuer->gcwq = gcwq; | 2202 | rescuer->pool = pool; |
2064 | worker_maybe_bind_and_lock(rescuer); | 2203 | worker_maybe_bind_and_lock(rescuer); |
2065 | 2204 | ||
2066 | /* | 2205 | /* |
@@ -2068,7 +2207,7 @@ repeat: | |||
2068 | * process'em. | 2207 | * process'em. |
2069 | */ | 2208 | */ |
2070 | BUG_ON(!list_empty(&rescuer->scheduled)); | 2209 | BUG_ON(!list_empty(&rescuer->scheduled)); |
2071 | list_for_each_entry_safe(work, n, &gcwq->worklist, entry) | 2210 | list_for_each_entry_safe(work, n, &pool->worklist, entry) |
2072 | if (get_work_cwq(work) == cwq) | 2211 | if (get_work_cwq(work) == cwq) |
2073 | move_linked_works(work, scheduled, &n); | 2212 | move_linked_works(work, scheduled, &n); |
2074 | 2213 | ||
@@ -2079,8 +2218,8 @@ repeat: | |||
2079 | * regular worker; otherwise, we end up with 0 concurrency | 2218 | * regular worker; otherwise, we end up with 0 concurrency |
2080 | * and stalling the execution. | 2219 | * and stalling the execution. |
2081 | */ | 2220 | */ |
2082 | if (keep_working(gcwq)) | 2221 | if (keep_working(pool)) |
2083 | wake_up_worker(gcwq); | 2222 | wake_up_worker(pool); |
2084 | 2223 | ||
2085 | spin_unlock_irq(&gcwq->lock); | 2224 | spin_unlock_irq(&gcwq->lock); |
2086 | } | 2225 | } |
@@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, | |||
2205 | 2344 | ||
2206 | for_each_cwq_cpu(cpu, wq) { | 2345 | for_each_cwq_cpu(cpu, wq) { |
2207 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 2346 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
2208 | struct global_cwq *gcwq = cwq->gcwq; | 2347 | struct global_cwq *gcwq = cwq->pool->gcwq; |
2209 | 2348 | ||
2210 | spin_lock_irq(&gcwq->lock); | 2349 | spin_lock_irq(&gcwq->lock); |
2211 | 2350 | ||
@@ -2421,9 +2560,9 @@ reflush: | |||
2421 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 2560 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
2422 | bool drained; | 2561 | bool drained; |
2423 | 2562 | ||
2424 | spin_lock_irq(&cwq->gcwq->lock); | 2563 | spin_lock_irq(&cwq->pool->gcwq->lock); |
2425 | drained = !cwq->nr_active && list_empty(&cwq->delayed_works); | 2564 | drained = !cwq->nr_active && list_empty(&cwq->delayed_works); |
2426 | spin_unlock_irq(&cwq->gcwq->lock); | 2565 | spin_unlock_irq(&cwq->pool->gcwq->lock); |
2427 | 2566 | ||
2428 | if (drained) | 2567 | if (drained) |
2429 | continue; | 2568 | continue; |
@@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | |||
2463 | */ | 2602 | */ |
2464 | smp_rmb(); | 2603 | smp_rmb(); |
2465 | cwq = get_work_cwq(work); | 2604 | cwq = get_work_cwq(work); |
2466 | if (unlikely(!cwq || gcwq != cwq->gcwq)) | 2605 | if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) |
2467 | goto already_gone; | 2606 | goto already_gone; |
2468 | } else if (wait_executing) { | 2607 | } else if (wait_executing) { |
2469 | worker = find_worker_executing_work(gcwq, work); | 2608 | worker = find_worker_executing_work(gcwq, work); |
@@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
2984 | if (flags & WQ_MEM_RECLAIM) | 3123 | if (flags & WQ_MEM_RECLAIM) |
2985 | flags |= WQ_RESCUER; | 3124 | flags |= WQ_RESCUER; |
2986 | 3125 | ||
2987 | /* | ||
2988 | * Unbound workqueues aren't concurrency managed and should be | ||
2989 | * dispatched to workers immediately. | ||
2990 | */ | ||
2991 | if (flags & WQ_UNBOUND) | ||
2992 | flags |= WQ_HIGHPRI; | ||
2993 | |||
2994 | max_active = max_active ?: WQ_DFL_ACTIVE; | 3126 | max_active = max_active ?: WQ_DFL_ACTIVE; |
2995 | max_active = wq_clamp_max_active(max_active, flags, wq->name); | 3127 | max_active = wq_clamp_max_active(max_active, flags, wq->name); |
2996 | 3128 | ||
@@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
3011 | for_each_cwq_cpu(cpu, wq) { | 3143 | for_each_cwq_cpu(cpu, wq) { |
3012 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3144 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3013 | struct global_cwq *gcwq = get_gcwq(cpu); | 3145 | struct global_cwq *gcwq = get_gcwq(cpu); |
3146 | int pool_idx = (bool)(flags & WQ_HIGHPRI); | ||
3014 | 3147 | ||
3015 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); | 3148 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); |
3016 | cwq->gcwq = gcwq; | 3149 | cwq->pool = &gcwq->pools[pool_idx]; |
3017 | cwq->wq = wq; | 3150 | cwq->wq = wq; |
3018 | cwq->flush_color = -1; | 3151 | cwq->flush_color = -1; |
3019 | cwq->max_active = max_active; | 3152 | cwq->max_active = max_active; |
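On the queueing side, HIGHPRI handling now reduces to the pool_idx line above: each gcwq carries a normal and a highpri worker_pool, and WQ_HIGHPRI simply selects index 1 for the cwq. A toy sketch of that selection; the flag bit and nice values here are illustrative placeholders rather than values taken from the headers.

#include <stdio.h>

#define WQ_HIGHPRI	(1 << 4)	/* placeholder bit for the sketch */
#define NR_WORKER_POOLS	2

struct worker_pool { int nice; };
struct global_cwq  { struct worker_pool pools[NR_WORKER_POOLS]; };

static struct worker_pool *pool_for_flags(struct global_cwq *gcwq,
					  unsigned int flags)
{
	int pool_idx = !!(flags & WQ_HIGHPRI);	/* 0 = normal, 1 = highpri */

	return &gcwq->pools[pool_idx];
}

int main(void)
{
	struct global_cwq gcwq = {
		.pools = { { .nice = 0 }, { .nice = -20 } },	/* -20 ~ HIGHPRI_NICE_LEVEL */
	};

	printf("highpri pool nice level: %d\n",
	       pool_for_flags(&gcwq, WQ_HIGHPRI)->nice);
	return 0;
}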
@@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy); | |||
3225 | * gcwqs serve mix of short, long and very long running works making | 3358 | * gcwqs serve mix of short, long and very long running works making |
3226 | * blocked draining impractical. | 3359 | * blocked draining impractical. |
3227 | * | 3360 | * |
3228 | * This is solved by allowing a gcwq to be detached from CPU, running | 3361 | * This is solved by allowing a gcwq to be disassociated from the CPU, |
3229 | * it with unbound (rogue) workers and allowing it to be reattached | 3362 | * run as an unbound one, and reattached later if the cpu comes back |
3230 | * later if the cpu comes back online. A separate thread is created | 3363 | * online. |

3231 | * to govern a gcwq in such state and is called the trustee of the | ||
3232 | * gcwq. | ||
3233 | * | ||
3234 | * Trustee states and their descriptions. | ||
3235 | * | ||
3236 | * START Command state used on startup. On CPU_DOWN_PREPARE, a | ||
3237 | * new trustee is started with this state. | ||
3238 | * | ||
3239 | * IN_CHARGE Once started, trustee will enter this state after | ||
3240 | * assuming the manager role and making all existing | ||
3241 | * workers rogue. DOWN_PREPARE waits for trustee to | ||
3242 | * enter this state. After reaching IN_CHARGE, trustee | ||
3243 | * tries to execute the pending worklist until it's empty | ||
3244 | * and the state is set to BUTCHER, or the state is set | ||
3245 | * to RELEASE. | ||
3246 | * | ||
3247 | * BUTCHER Command state which is set by the cpu callback after | ||
3248 | * the cpu has went down. Once this state is set trustee | ||
3249 | * knows that there will be no new works on the worklist | ||
3250 | * and once the worklist is empty it can proceed to | ||
3251 | * killing idle workers. | ||
3252 | * | ||
3253 | * RELEASE Command state which is set by the cpu callback if the | ||
3254 | * cpu down has been canceled or it has come online | ||
3255 | * again. After recognizing this state, trustee stops | ||
3256 | * trying to drain or butcher and clears ROGUE, rebinds | ||
3257 | * all remaining workers back to the cpu and releases | ||
3258 | * manager role. | ||
3259 | * | ||
3260 | * DONE Trustee will enter this state after BUTCHER or RELEASE | ||
3261 | * is complete. | ||
3262 | * | ||
3263 | * trustee CPU draining | ||
3264 | * took over down complete | ||
3265 | * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE | ||
3266 | * | | ^ | ||
3267 | * | CPU is back online v return workers | | ||
3268 | * ----------------> RELEASE -------------- | ||
3269 | */ | 3364 | */ |
3270 | 3365 | ||
3271 | /** | 3366 | /* claim manager positions of all pools */ |
3272 | * trustee_wait_event_timeout - timed event wait for trustee | 3367 | static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) |
3273 | * @cond: condition to wait for | ||
3274 | * @timeout: timeout in jiffies | ||
3275 | * | ||
3276 | * wait_event_timeout() for trustee to use. Handles locking and | ||
3277 | * checks for RELEASE request. | ||
3278 | * | ||
3279 | * CONTEXT: | ||
3280 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3281 | * multiple times. To be used by trustee. | ||
3282 | * | ||
3283 | * RETURNS: | ||
3284 | * Positive indicating left time if @cond is satisfied, 0 if timed | ||
3285 | * out, -1 if canceled. | ||
3286 | */ | ||
3287 | #define trustee_wait_event_timeout(cond, timeout) ({ \ | ||
3288 | long __ret = (timeout); \ | ||
3289 | while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ | ||
3290 | __ret) { \ | ||
3291 | spin_unlock_irq(&gcwq->lock); \ | ||
3292 | __wait_event_timeout(gcwq->trustee_wait, (cond) || \ | ||
3293 | (gcwq->trustee_state == TRUSTEE_RELEASE), \ | ||
3294 | __ret); \ | ||
3295 | spin_lock_irq(&gcwq->lock); \ | ||
3296 | } \ | ||
3297 | gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ | ||
3298 | }) | ||
3299 | |||
3300 | /** | ||
3301 | * trustee_wait_event - event wait for trustee | ||
3302 | * @cond: condition to wait for | ||
3303 | * | ||
3304 | * wait_event() for trustee to use. Automatically handles locking and | ||
3305 | * checks for CANCEL request. | ||
3306 | * | ||
3307 | * CONTEXT: | ||
3308 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3309 | * multiple times. To be used by trustee. | ||
3310 | * | ||
3311 | * RETURNS: | ||
3312 | * 0 if @cond is satisfied, -1 if canceled. | ||
3313 | */ | ||
3314 | #define trustee_wait_event(cond) ({ \ | ||
3315 | long __ret1; \ | ||
3316 | __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ | ||
3317 | __ret1 < 0 ? -1 : 0; \ | ||
3318 | }) | ||
3319 | |||
3320 | static int __cpuinit trustee_thread(void *__gcwq) | ||
3321 | { | 3368 | { |
3322 | struct global_cwq *gcwq = __gcwq; | 3369 | struct worker_pool *pool; |
3323 | struct worker *worker; | ||
3324 | struct work_struct *work; | ||
3325 | struct hlist_node *pos; | ||
3326 | long rc; | ||
3327 | int i; | ||
3328 | |||
3329 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3330 | 3370 | ||
3371 | for_each_worker_pool(pool, gcwq) | ||
3372 | mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); | ||
3331 | spin_lock_irq(&gcwq->lock); | 3373 | spin_lock_irq(&gcwq->lock); |
3332 | /* | 3374 | } |
3333 | * Claim the manager position and make all workers rogue. | ||
3334 | * Trustee must be bound to the target cpu and can't be | ||
3335 | * cancelled. | ||
3336 | */ | ||
3337 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3338 | rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); | ||
3339 | BUG_ON(rc < 0); | ||
3340 | |||
3341 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
3342 | |||
3343 | list_for_each_entry(worker, &gcwq->idle_list, entry) | ||
3344 | worker->flags |= WORKER_ROGUE; | ||
3345 | 3375 | ||
3346 | for_each_busy_worker(worker, i, pos, gcwq) | 3376 | /* release manager positions */ |
3347 | worker->flags |= WORKER_ROGUE; | 3377 | static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) |
3378 | { | ||
3379 | struct worker_pool *pool; | ||
3348 | 3380 | ||
3349 | /* | ||
3350 | * Call schedule() so that we cross rq->lock and thus can | ||
3351 | * guarantee sched callbacks see the rogue flag. This is | ||
3352 | * necessary as scheduler callbacks may be invoked from other | ||
3353 | * cpus. | ||
3354 | */ | ||
3355 | spin_unlock_irq(&gcwq->lock); | 3381 | spin_unlock_irq(&gcwq->lock); |
3356 | schedule(); | 3382 | for_each_worker_pool(pool, gcwq) |
3357 | spin_lock_irq(&gcwq->lock); | 3383 | mutex_unlock(&pool->manager_mutex); |
3384 | } | ||
3358 | 3385 | ||
3359 | /* | 3386 | static void gcwq_unbind_fn(struct work_struct *work) |
3360 | * Sched callbacks are disabled now. Zap nr_running. After | 3387 | { |
3361 | * this, nr_running stays zero and need_more_worker() and | 3388 | struct global_cwq *gcwq = get_gcwq(smp_processor_id()); |
3362 | * keep_working() are always true as long as the worklist is | 3389 | struct worker_pool *pool; |
3363 | * not empty. | 3390 | struct worker *worker; |
3364 | */ | 3391 | struct hlist_node *pos; |
3365 | atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); | 3392 | int i; |
3366 | 3393 | ||
3367 | spin_unlock_irq(&gcwq->lock); | 3394 | BUG_ON(gcwq->cpu != smp_processor_id()); |
3368 | del_timer_sync(&gcwq->idle_timer); | ||
3369 | spin_lock_irq(&gcwq->lock); | ||
3370 | 3395 | ||
3371 | /* | 3396 | gcwq_claim_management_and_lock(gcwq); |
3372 | * We're now in charge. Notify and proceed to drain. We need | ||
3373 | * to keep the gcwq running during the whole CPU down | ||
3374 | * procedure as other cpu hotunplug callbacks may need to | ||
3375 | * flush currently running tasks. | ||
3376 | */ | ||
3377 | gcwq->trustee_state = TRUSTEE_IN_CHARGE; | ||
3378 | wake_up_all(&gcwq->trustee_wait); | ||
3379 | 3397 | ||
3380 | /* | 3398 | /* |
3381 | * The original cpu is in the process of dying and may go away | 3399 | * We've claimed all manager positions. Make all workers unbound |
3382 | * anytime now. When that happens, we and all workers would | 3400 | * and set DISASSOCIATED. Before this, all workers except for the |
3383 | * be migrated to other cpus. Try draining any left work. We | 3401 | * ones which are still executing works from before the last CPU |
3384 | * want to get it over with ASAP - spam rescuers, wake up as | 3402 | * down must be on the cpu. After this, they may become diasporas. |
3385 | * many idlers as necessary and create new ones till the | ||
3386 | * worklist is empty. Note that if the gcwq is frozen, there | ||
3387 | * may be frozen works in freezable cwqs. Don't declare | ||
3388 | * completion while frozen. | ||
3389 | */ | 3403 | */ |
3390 | while (gcwq->nr_workers != gcwq->nr_idle || | 3404 | for_each_worker_pool(pool, gcwq) |
3391 | gcwq->flags & GCWQ_FREEZING || | 3405 | list_for_each_entry(worker, &pool->idle_list, entry) |
3392 | gcwq->trustee_state == TRUSTEE_IN_CHARGE) { | 3406 | worker->flags |= WORKER_UNBOUND; |
3393 | int nr_works = 0; | ||
3394 | |||
3395 | list_for_each_entry(work, &gcwq->worklist, entry) { | ||
3396 | send_mayday(work); | ||
3397 | nr_works++; | ||
3398 | } | ||
3399 | 3407 | ||
3400 | list_for_each_entry(worker, &gcwq->idle_list, entry) { | 3408 | for_each_busy_worker(worker, i, pos, gcwq) |
3401 | if (!nr_works--) | 3409 | worker->flags |= WORKER_UNBOUND; |
3402 | break; | ||
3403 | wake_up_process(worker->task); | ||
3404 | } | ||
3405 | 3410 | ||
3406 | if (need_to_create_worker(gcwq)) { | 3411 | gcwq->flags |= GCWQ_DISASSOCIATED; |
3407 | spin_unlock_irq(&gcwq->lock); | ||
3408 | worker = create_worker(gcwq, false); | ||
3409 | spin_lock_irq(&gcwq->lock); | ||
3410 | if (worker) { | ||
3411 | worker->flags |= WORKER_ROGUE; | ||
3412 | start_worker(worker); | ||
3413 | } | ||
3414 | } | ||
3415 | 3412 | ||
3416 | /* give a breather */ | 3413 | gcwq_release_management_and_unlock(gcwq); |
3417 | if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) | ||
3418 | break; | ||
3419 | } | ||
3420 | 3414 | ||
3421 | /* | 3415 | /* |
3422 | * Either all works have been scheduled and cpu is down, or | 3416 | * Call schedule() so that we cross rq->lock and thus can guarantee |
3423 | * cpu down has already been canceled. Wait for and butcher | 3417 | * sched callbacks see the %WORKER_UNBOUND flag. This is necessary |
3424 | * all workers till we're canceled. | 3418 | * as scheduler callbacks may be invoked from other cpus. |
3425 | */ | 3419 | */ |
3426 | do { | 3420 | schedule(); |
3427 | rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); | ||
3428 | while (!list_empty(&gcwq->idle_list)) | ||
3429 | destroy_worker(list_first_entry(&gcwq->idle_list, | ||
3430 | struct worker, entry)); | ||
3431 | } while (gcwq->nr_workers && rc >= 0); | ||
3432 | 3421 | ||
3433 | /* | 3422 | /* |
3434 | * At this point, either draining has completed and no worker | 3423 | * Sched callbacks are disabled now. Zap nr_running. After this, |
3435 | * is left, or cpu down has been canceled or the cpu is being | 3424 | * nr_running stays zero and need_more_worker() and keep_working() |
3436 | * brought back up. There shouldn't be any idle one left. | 3425 | * are always true as long as the worklist is not empty. @gcwq now |
3437 | * Tell the remaining busy ones to rebind once it finishes the | 3426 | * behaves as an unbound (in terms of concurrency management) gcwq |
3438 | * currently scheduled works by scheduling the rebind_work. | 3427 | * which is served by workers tied to the CPU. |
3428 | * | ||
3429 | * On return from this function, the current worker would trigger | ||
3430 | * unbound chain execution of pending work items if other workers | ||
3431 | * didn't already. | ||
3439 | */ | 3432 | */ |
3440 | WARN_ON(!list_empty(&gcwq->idle_list)); | 3433 | for_each_worker_pool(pool, gcwq) |
3441 | 3434 | atomic_set(get_pool_nr_running(pool), 0); | |
3442 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
3443 | struct work_struct *rebind_work = &worker->rebind_work; | ||
3444 | |||
3445 | /* | ||
3446 | * Rebind_work may race with future cpu hotplug | ||
3447 | * operations. Use a separate flag to mark that | ||
3448 | * rebinding is scheduled. | ||
3449 | */ | ||
3450 | worker->flags |= WORKER_REBIND; | ||
3451 | worker->flags &= ~WORKER_ROGUE; | ||
3452 | |||
3453 | /* queue rebind_work, wq doesn't matter, use the default one */ | ||
3454 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
3455 | work_data_bits(rebind_work))) | ||
3456 | continue; | ||
3457 | |||
3458 | debug_work_activate(rebind_work); | ||
3459 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
3460 | worker->scheduled.next, | ||
3461 | work_color_to_flags(WORK_NO_COLOR)); | ||
3462 | } | ||
3463 | |||
3464 | /* relinquish manager role */ | ||
3465 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
3466 | |||
3467 | /* notify completion */ | ||
3468 | gcwq->trustee = NULL; | ||
3469 | gcwq->trustee_state = TRUSTEE_DONE; | ||
3470 | wake_up_all(&gcwq->trustee_wait); | ||
3471 | spin_unlock_irq(&gcwq->lock); | ||
3472 | return 0; | ||
3473 | } | 3435 | } |
3474 | 3436 | ||
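The "zap nr_running" step at the end of gcwq_unbind_fn() is what flips a pool into unmanaged mode: with nr_running pinned at zero, need_more_worker() is true whenever the worklist is non-empty, so every queued item roughly produces a wakeup instead of being throttled to one runnable worker per CPU. A tiny sketch of that predicate; the field and function names mirror the kernel's, but the struct itself is invented for the illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pool {
	atomic_int nr_running;	/* concurrency-managed: tracks runnable workers */
	int worklist_len;
};

static bool need_more_worker(struct pool *p)
{
	return p->worklist_len > 0 && atomic_load(&p->nr_running) == 0;
}

int main(void)
{
	struct pool p = { .worklist_len = 3 };

	atomic_store(&p.nr_running, 1);		/* one worker already running */
	printf("managed:   need more? %d\n", need_more_worker(&p));	/* 0 */

	atomic_store(&p.nr_running, 0);		/* gcwq_unbind_fn() zaps it */
	printf("unmanaged: need more? %d\n", need_more_worker(&p));	/* 1 */
	return 0;
}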
3475 | /** | 3437 | /* |
3476 | * wait_trustee_state - wait for trustee to enter the specified state | 3438 | * Workqueues should be brought up before normal priority CPU notifiers. |
3477 | * @gcwq: gcwq the trustee of interest belongs to | 3439 | * This will be registered as a high priority CPU notifier. |
3478 | * @state: target state to wait for | ||
3479 | * | ||
3480 | * Wait for the trustee to reach @state. DONE is already matched. | ||
3481 | * | ||
3482 | * CONTEXT: | ||
3483 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3484 | * multiple times. To be used by cpu_callback. | ||
3485 | */ | 3440 | */ |
3486 | static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) | 3441 | static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, |
3487 | __releases(&gcwq->lock) | 3442 | unsigned long action, |
3488 | __acquires(&gcwq->lock) | 3443 | void *hcpu) |
3489 | { | ||
3490 | if (!(gcwq->trustee_state == state || | ||
3491 | gcwq->trustee_state == TRUSTEE_DONE)) { | ||
3492 | spin_unlock_irq(&gcwq->lock); | ||
3493 | __wait_event(gcwq->trustee_wait, | ||
3494 | gcwq->trustee_state == state || | ||
3495 | gcwq->trustee_state == TRUSTEE_DONE); | ||
3496 | spin_lock_irq(&gcwq->lock); | ||
3497 | } | ||
3498 | } | ||
3499 | |||
3500 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | ||
3501 | unsigned long action, | ||
3502 | void *hcpu) | ||
3503 | { | 3444 | { |
3504 | unsigned int cpu = (unsigned long)hcpu; | 3445 | unsigned int cpu = (unsigned long)hcpu; |
3505 | struct global_cwq *gcwq = get_gcwq(cpu); | 3446 | struct global_cwq *gcwq = get_gcwq(cpu); |
3506 | struct task_struct *new_trustee = NULL; | 3447 | struct worker_pool *pool; |
3507 | struct worker *uninitialized_var(new_worker); | ||
3508 | unsigned long flags; | ||
3509 | |||
3510 | action &= ~CPU_TASKS_FROZEN; | ||
3511 | 3448 | ||
3512 | switch (action) { | 3449 | switch (action & ~CPU_TASKS_FROZEN) { |
3513 | case CPU_DOWN_PREPARE: | ||
3514 | new_trustee = kthread_create(trustee_thread, gcwq, | ||
3515 | "workqueue_trustee/%d\n", cpu); | ||
3516 | if (IS_ERR(new_trustee)) | ||
3517 | return notifier_from_errno(PTR_ERR(new_trustee)); | ||
3518 | kthread_bind(new_trustee, cpu); | ||
3519 | /* fall through */ | ||
3520 | case CPU_UP_PREPARE: | 3450 | case CPU_UP_PREPARE: |
3521 | BUG_ON(gcwq->first_idle); | 3451 | for_each_worker_pool(pool, gcwq) { |
3522 | new_worker = create_worker(gcwq, false); | 3452 | struct worker *worker; |
3523 | if (!new_worker) { | ||
3524 | if (new_trustee) | ||
3525 | kthread_stop(new_trustee); | ||
3526 | return NOTIFY_BAD; | ||
3527 | } | ||
3528 | } | ||
3529 | |||
3530 | /* some are called w/ irq disabled, don't disturb irq status */ | ||
3531 | spin_lock_irqsave(&gcwq->lock, flags); | ||
3532 | 3453 | ||
3533 | switch (action) { | 3454 | if (pool->nr_workers) |
3534 | case CPU_DOWN_PREPARE: | 3455 | continue; |
3535 | /* initialize trustee and tell it to acquire the gcwq */ | ||
3536 | BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); | ||
3537 | gcwq->trustee = new_trustee; | ||
3538 | gcwq->trustee_state = TRUSTEE_START; | ||
3539 | wake_up_process(gcwq->trustee); | ||
3540 | wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); | ||
3541 | /* fall through */ | ||
3542 | case CPU_UP_PREPARE: | ||
3543 | BUG_ON(gcwq->first_idle); | ||
3544 | gcwq->first_idle = new_worker; | ||
3545 | break; | ||
3546 | 3456 | ||
3547 | case CPU_DYING: | 3457 | worker = create_worker(pool); |
3548 | /* | 3458 | if (!worker) |
3549 | * Before this, the trustee and all workers except for | 3459 | return NOTIFY_BAD; |
3550 | * the ones which are still executing works from | ||
3551 | * before the last CPU down must be on the cpu. After | ||
3552 | * this, they'll all be diasporas. | ||
3553 | */ | ||
3554 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
3555 | break; | ||
3556 | 3460 | ||
3557 | case CPU_POST_DEAD: | 3461 | spin_lock_irq(&gcwq->lock); |
3558 | gcwq->trustee_state = TRUSTEE_BUTCHER; | 3462 | start_worker(worker); |
3559 | /* fall through */ | 3463 | spin_unlock_irq(&gcwq->lock); |
3560 | case CPU_UP_CANCELED: | 3464 | } |
3561 | destroy_worker(gcwq->first_idle); | ||
3562 | gcwq->first_idle = NULL; | ||
3563 | break; | 3465 | break; |
3564 | 3466 | ||
3565 | case CPU_DOWN_FAILED: | 3467 | case CPU_DOWN_FAILED: |
3566 | case CPU_ONLINE: | 3468 | case CPU_ONLINE: |
3469 | gcwq_claim_management_and_lock(gcwq); | ||
3567 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | 3470 | gcwq->flags &= ~GCWQ_DISASSOCIATED; |
3568 | if (gcwq->trustee_state != TRUSTEE_DONE) { | 3471 | rebind_workers(gcwq); |
3569 | gcwq->trustee_state = TRUSTEE_RELEASE; | 3472 | gcwq_release_management_and_unlock(gcwq); |
3570 | wake_up_process(gcwq->trustee); | ||
3571 | wait_trustee_state(gcwq, TRUSTEE_DONE); | ||
3572 | } | ||
3573 | |||
3574 | /* | ||
3575 | * Trustee is done and there might be no worker left. | ||
3576 | * Put the first_idle in and request a real manager to | ||
3577 | * take a look. | ||
3578 | */ | ||
3579 | spin_unlock_irq(&gcwq->lock); | ||
3580 | kthread_bind(gcwq->first_idle->task, cpu); | ||
3581 | spin_lock_irq(&gcwq->lock); | ||
3582 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
3583 | start_worker(gcwq->first_idle); | ||
3584 | gcwq->first_idle = NULL; | ||
3585 | break; | 3473 | break; |
3586 | } | 3474 | } |
3475 | return NOTIFY_OK; | ||
3476 | } | ||
3587 | 3477 | ||
3588 | spin_unlock_irqrestore(&gcwq->lock, flags); | 3478 | /* |
3479 | * Workqueues should be brought down after normal priority CPU notifiers. | ||
3480 | * This will be registered as a low priority CPU notifier. | ||
3481 | */ | ||
3482 | static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, | ||
3483 | unsigned long action, | ||
3484 | void *hcpu) | ||
3485 | { | ||
3486 | unsigned int cpu = (unsigned long)hcpu; | ||
3487 | struct work_struct unbind_work; | ||
3589 | 3488 | ||
3590 | return notifier_from_errno(0); | 3489 | switch (action & ~CPU_TASKS_FROZEN) { |
3490 | case CPU_DOWN_PREPARE: | ||
3491 | /* unbinding should happen on the local CPU */ | ||
3492 | INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); | ||
3493 | schedule_work_on(cpu, &unbind_work); | ||
3494 | flush_work(&unbind_work); | ||
3495 | break; | ||
3496 | } | ||
3497 | return NOTIFY_OK; | ||
3591 | } | 3498 | } |
3592 | 3499 | ||
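The CPU_DOWN_PREPARE path above is a "run this function on that CPU and wait for it" idiom: INIT_WORK_ONSTACK() plus schedule_work_on() pushes gcwq_unbind_fn() onto the dying CPU, and flush_work() blocks until it has run. Below is a rough userspace analogue under the assumption that a pinned thread is an acceptable stand-in for a work item; it relies on the GNU affinity extensions and all names are invented for the sketch.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static void *unbind_fn(void *arg)
{
	printf("unbinding on cpu %d\n", sched_getcpu());
	return NULL;
}

/* run @fn on @cpu and wait for it, like schedule_work_on() + flush_work() */
static int run_on_cpu(int cpu, void *(*fn)(void *), void *arg)
{
	pthread_attr_t attr;
	cpu_set_t set;
	pthread_t tid;
	int err;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	pthread_attr_init(&attr);
	pthread_attr_setaffinity_np(&attr, sizeof(set), &set);

	err = pthread_create(&tid, &attr, fn, arg);	/* ~schedule_work_on(cpu, ...) */
	pthread_attr_destroy(&attr);
	if (err)
		return err;
	return pthread_join(tid, NULL);			/* ~flush_work(&unbind_work) */
}

int main(void)
{
	return run_on_cpu(0, unbind_fn, NULL);
}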
3593 | #ifdef CONFIG_SMP | 3500 | #ifdef CONFIG_SMP |
@@ -3746,6 +3653,7 @@ void thaw_workqueues(void) | |||
3746 | 3653 | ||
3747 | for_each_gcwq_cpu(cpu) { | 3654 | for_each_gcwq_cpu(cpu) { |
3748 | struct global_cwq *gcwq = get_gcwq(cpu); | 3655 | struct global_cwq *gcwq = get_gcwq(cpu); |
3656 | struct worker_pool *pool; | ||
3749 | struct workqueue_struct *wq; | 3657 | struct workqueue_struct *wq; |
3750 | 3658 | ||
3751 | spin_lock_irq(&gcwq->lock); | 3659 | spin_lock_irq(&gcwq->lock); |
@@ -3767,7 +3675,8 @@ void thaw_workqueues(void) | |||
3767 | cwq_activate_first_delayed(cwq); | 3675 | cwq_activate_first_delayed(cwq); |
3768 | } | 3676 | } |
3769 | 3677 | ||
3770 | wake_up_worker(gcwq); | 3678 | for_each_worker_pool(pool, gcwq) |
3679 | wake_up_worker(pool); | ||
3771 | 3680 | ||
3772 | spin_unlock_irq(&gcwq->lock); | 3681 | spin_unlock_irq(&gcwq->lock); |
3773 | } | 3682 | } |
@@ -3783,46 +3692,57 @@ static int __init init_workqueues(void) | |||
3783 | unsigned int cpu; | 3692 | unsigned int cpu; |
3784 | int i; | 3693 | int i; |
3785 | 3694 | ||
3786 | cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); | 3695 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |
3696 | cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | ||
3787 | 3697 | ||
3788 | /* initialize gcwqs */ | 3698 | /* initialize gcwqs */ |
3789 | for_each_gcwq_cpu(cpu) { | 3699 | for_each_gcwq_cpu(cpu) { |
3790 | struct global_cwq *gcwq = get_gcwq(cpu); | 3700 | struct global_cwq *gcwq = get_gcwq(cpu); |
3701 | struct worker_pool *pool; | ||
3791 | 3702 | ||
3792 | spin_lock_init(&gcwq->lock); | 3703 | spin_lock_init(&gcwq->lock); |
3793 | INIT_LIST_HEAD(&gcwq->worklist); | ||
3794 | gcwq->cpu = cpu; | 3704 | gcwq->cpu = cpu; |
3795 | gcwq->flags |= GCWQ_DISASSOCIATED; | 3705 | gcwq->flags |= GCWQ_DISASSOCIATED; |
3796 | 3706 | ||
3797 | INIT_LIST_HEAD(&gcwq->idle_list); | ||
3798 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) | 3707 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) |
3799 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); | 3708 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); |
3800 | 3709 | ||
3801 | init_timer_deferrable(&gcwq->idle_timer); | 3710 | for_each_worker_pool(pool, gcwq) { |
3802 | gcwq->idle_timer.function = idle_worker_timeout; | 3711 | pool->gcwq = gcwq; |
3803 | gcwq->idle_timer.data = (unsigned long)gcwq; | 3712 | INIT_LIST_HEAD(&pool->worklist); |
3713 | INIT_LIST_HEAD(&pool->idle_list); | ||
3714 | |||
3715 | init_timer_deferrable(&pool->idle_timer); | ||
3716 | pool->idle_timer.function = idle_worker_timeout; | ||
3717 | pool->idle_timer.data = (unsigned long)pool; | ||
3804 | 3718 | ||
3805 | setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, | 3719 | setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, |
3806 | (unsigned long)gcwq); | 3720 | (unsigned long)pool); |
3807 | 3721 | ||
3808 | ida_init(&gcwq->worker_ida); | 3722 | mutex_init(&pool->manager_mutex); |
3723 | ida_init(&pool->worker_ida); | ||
3724 | } | ||
3809 | 3725 | ||
3810 | gcwq->trustee_state = TRUSTEE_DONE; | 3726 | init_waitqueue_head(&gcwq->rebind_hold); |
3811 | init_waitqueue_head(&gcwq->trustee_wait); | ||
3812 | } | 3727 | } |
3813 | 3728 | ||
3814 | /* create the initial worker */ | 3729 | /* create the initial worker */ |
3815 | for_each_online_gcwq_cpu(cpu) { | 3730 | for_each_online_gcwq_cpu(cpu) { |
3816 | struct global_cwq *gcwq = get_gcwq(cpu); | 3731 | struct global_cwq *gcwq = get_gcwq(cpu); |
3817 | struct worker *worker; | 3732 | struct worker_pool *pool; |
3818 | 3733 | ||
3819 | if (cpu != WORK_CPU_UNBOUND) | 3734 | if (cpu != WORK_CPU_UNBOUND) |
3820 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | 3735 | gcwq->flags &= ~GCWQ_DISASSOCIATED; |
3821 | worker = create_worker(gcwq, true); | 3736 | |
3822 | BUG_ON(!worker); | 3737 | for_each_worker_pool(pool, gcwq) { |
3823 | spin_lock_irq(&gcwq->lock); | 3738 | struct worker *worker; |
3824 | start_worker(worker); | 3739 | |
3825 | spin_unlock_irq(&gcwq->lock); | 3740 | worker = create_worker(pool); |
3741 | BUG_ON(!worker); | ||
3742 | spin_lock_irq(&gcwq->lock); | ||
3743 | start_worker(worker); | ||
3744 | spin_unlock_irq(&gcwq->lock); | ||
3745 | } | ||
3826 | } | 3746 | } |
3827 | 3747 | ||
3828 | system_wq = alloc_workqueue("events", 0, 0); | 3748 | system_wq = alloc_workqueue("events", 0, 0); |